> ###### Variable clustering
> # Console transcript: lines starting with "> " are the commands that were
> # entered; the unprefixed lines after them are the output R printed.
> # The working directory is a machine-specific absolute path; read.csv()
> # below looks up accepts.csv relative to it.
> setwd("/Users/yaozhilin/Downloads/R_edu/data")
> accepts<-read.csv("accepts.csv")
> # Workflow: load the ClustOfVar package -> cluster the variables with
> # hclustvar() and pick the number of clusters with stability() -> cut the
> # hclustvar tree with cutreevar() according to the stability results.
> # NOTE(review): no stability() call appears in the visible part of this
> # session; the cuts at k = 7 and k = 16 are made directly.
> library(ClustOfVar)
> # Drop columns 1, 2, 3, 5 and 6 by position; the summaries below report
> # 19 variables, i.e. 19 columns remain. Positional indices depend on the
> # column order of accepts.csv -- presumably these are ID/target-style
> # columns, confirm against the file.
> orgData<-accepts[,c(-1,-2,-3,-5,-6)]
> # Build the variable-clustering tree and draw it.
> tree<-hclustvar(orgData)
> plot(tree)
> # The print() output below lists the components of the result
> # ($var, $sim, $cluster, $wss, $E, $size, $scores, $coef).
> part <- cutreevar(tree,7,matsim = TRUE)# cut the tree into 7 clusters and inspect
> print(part)
Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)
name description
"$var" "list of variables in each cluster"
"$sim" "similarity matrix in each cluster"
"$cluster" "cluster memberships"
"$wss" "within-cluster sum of squares"
"$E" "gain in cohesion (in %)"
"$size" "size of each cluster"
"$scores" "synthetic score of each cluster"
"$coef" "coef of the linear combinations defining the synthetic scores of each cluster"
> # Per-cluster table of each variable's squared loading and correlation,
> # followed by the overall gain in cohesion (59.07% for this 7-cluster cut).
> # Single-variable clusters (5 and 6 here) print a bare "1 1" row without
> # the variable name; the part$sim output further down identifies them as
> # loan_term and tot_income.
> summary(part)
Call:
cutreevar(obj = tree, k = 7, matsim = TRUE)
Data:
number of observations: 5845
number of variables: 19
number of clusters: 7
Cluster 1 :
squared loading correlation
purch_price 0.90 -0.95
loan_amt 0.90 -0.95
msrp 0.90 -0.95
vehicle_year 0.16 -0.40
Cluster 2 :
squared loading correlation
fico_score 0.69 0.84
tot_derog 0.60 -0.78
rev_util 0.31 -0.56
Cluster 3 :
squared loading correlation
tot_rev_tr 0.70 0.84
tot_rev_line 0.61 0.79
tot_rev_debt 0.61 0.79
tot_open_tr 0.61 0.83
tot_tr 0.55 0.74
age_oldest_tr 0.26 0.51
Cluster 4 :
squared loading correlation
down_pyt 0.73 0.85
ltv 0.73 -0.85
Cluster 5 :
squared loading correlation
1 1
Cluster 6 :
squared loading correlation
1 1
Cluster 7 :
squared loading correlation
used_ind 0.79 -0.89
veh_mileage 0.79 -0.89
Gain in cohesion (in %): 59.07
> # NOTE(review): print(part) describes $sim as a "similarity matrix", and
> # every entry below is non-negative even for pairs such as down_pyt/ltv
> # that load with opposite signs, so these are presumably squared
> # correlations rather than signed correlations -- confirm in the
> # ClustOfVar documentation.
> part$sim # view the within-cluster correlation matrices after clustering
$cluster1
vehicle_year purch_price msrp loan_amt
vehicle_year 1.00000000 0.05961112 0.08725898 0.05578607
purch_price 0.05961112 1.00000000 0.74949104 0.80971082
msrp 0.08725898 0.74949104 1.00000000 0.75508338
loan_amt 0.05578607 0.80971082 0.75508338 1.00000000
$cluster2
tot_derog rev_util fico_score
tot_derog 1.00000000 0.02360717 0.21185851
rev_util 0.02360717 1.00000000 0.06931344
fico_score 0.21185851 0.06931344 1.00000000
$cluster3
tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line
tot_tr 1.0000000 0.22550267 0.2348436 0.19762456 0.14830133 0.3122191
age_oldest_tr 0.2255027 1.00000000 0.0301182 0.04633906 0.05585033 0.1761412
tot_open_tr 0.2348436 0.03011820 1.0000000 0.59139623 0.28310367 0.1651928
tot_rev_tr 0.1976246 0.04633906 0.5913962 1.00000000 0.43259981 0.2627265
tot_rev_debt 0.1483013 0.05585033 0.2831037 0.43259981 1.00000000 0.3411766
tot_rev_line 0.3122191 0.17614119 0.1651928 0.26272651 0.34117657 1.0000000
$cluster4
down_pyt ltv
down_pyt 1.0000000 0.2086998
ltv 0.2086998 1.0000000
$cluster5
loan_term
loan_term 1
$cluster6
tot_income
tot_income 1
$cluster7
veh_mileage used_ind
veh_mileage 1.0000000 0.3313023
used_ind 0.3313023 1.0000000
> # Same per-cluster squared loading / correlation tables that summary(part)
> # shows rounded to two decimals, here at full printed precision.
> part$var
$cluster1
squared loading correlation
purch_price 0.9047017 -0.9511581
loan_amt 0.9046381 -0.9511247
msrp 0.8983588 -0.9478394
vehicle_year 0.1562745 -0.3953246
$cluster2
squared loading correlation
fico_score 0.6927142 0.8380766
tot_derog 0.6003257 -0.7763639
rev_util 0.3121919 -0.5587414
$cluster3
squared loading correlation
tot_rev_tr 0.6994471 0.8409839
tot_rev_line 0.6125771 0.7859612
tot_rev_debt 0.6116821 0.7853868
tot_open_tr 0.6094718 0.8271140
tot_tr 0.5475506 0.7399666
age_oldest_tr 0.2552606 0.5052449
$cluster4
squared loading correlation
down_pyt 0.7284184 0.8534743
ltv 0.7284184 -0.8534817
$cluster5
squared loading correlation
1 1
$cluster6
squared loading correlation
1 1
$cluster7
squared loading correlation
used_ind 0.7877943 -0.8875778
veh_mileage 0.7877943 -0.8876091
> # Re-cut the same tree; this overwrites part with the 16-cluster solution.
> part <- cutreevar(tree,16,matsim = TRUE)# cut the tree into 16 clusters and inspect
> print(part)
Call:
cutreevar(obj = tree, k = 16, matsim = TRUE)
name description
"$var" "list of variables in each cluster"
"$sim" "similarity matrix in each cluster"
"$cluster" "cluster memberships"
"$wss" "within-cluster sum of squares"
"$E" "gain in cohesion (in %)"
"$size" "size of each cluster"
"$scores" "synthetic score of each cluster"
"$coef" "coef of the linear combinations defining the synthetic scores of each cluster"
> # With 19 variables split into 16 clusters, 14 clusters are singletons;
> # only cluster 5 (tot_open_tr, tot_rev_tr) and cluster 10 (loan_amt,
> # purch_price, msrp) group more than one variable. Gain in cohesion is
> # reported as 96.85%.
> summary(part)
Call:
cutreevar(obj = tree, k = 16, matsim = TRUE)
Data:
number of observations: 5845
number of variables: 19
number of clusters: 16
Cluster 1 :
squared loading correlation
1 1
Cluster 2 :
squared loading correlation
1 1
Cluster 3 :
squared loading correlation
1 1
Cluster 4 :
squared loading correlation
1 1
Cluster 5 :
squared loading correlation
tot_open_tr 0.88 0.96
tot_rev_tr 0.88 0.94
Cluster 6 :
squared loading correlation
1 1
Cluster 7 :
squared loading correlation
1 1
Cluster 8 :
squared loading correlation
1 1
Cluster 9 :
squared loading correlation
1 1
Cluster 10 :
squared loading correlation
loan_amt 0.93 0.96
purch_price 0.93 0.96
msrp 0.90 0.95
Cluster 11 :
squared loading correlation
1 1
Cluster 12 :
squared loading correlation
1 1
Cluster 13 :
squared loading correlation
1 1
Cluster 14 :
squared loading correlation
1 1
Cluster 15 :
squared loading correlation
1 1
Cluster 16 :
squared loading correlation
1 1
Gain in cohesion (in %): 96.85
> part$sim # view the within-cluster correlation matrices
$cluster1
vehicle_year
vehicle_year 1
$cluster2
tot_derog
tot_derog 1
$cluster3
tot_tr
tot_tr 1
$cluster4
age_oldest_tr
age_oldest_tr 1
$cluster5
tot_open_tr tot_rev_tr
tot_open_tr 1.0000000 0.5913962
tot_rev_tr 0.5913962 1.0000000
$cluster6
tot_rev_debt
tot_rev_debt 1
$cluster7
tot_rev_line
tot_rev_line 1
$cluster8
rev_util
rev_util 1
$cluster9
fico_score
fico_score 1
$cluster10
purch_price msrp loan_amt
purch_price 1.0000000 0.7494910 0.8097108
msrp 0.7494910 1.0000000 0.7550834
loan_amt 0.8097108 0.7550834 1.0000000
$cluster11
down_pyt
down_pyt 1
$cluster12
loan_term
loan_term 1
$cluster13
ltv
ltv 1
$cluster14
tot_income
tot_income 1
$cluster15
veh_mileage
veh_mileage 1
$cluster16
used_ind
used_ind 1
> # Per-cluster squared loading / correlation tables for the 16-cluster cut;
> # singleton clusters print a bare "1 1" row without the variable name.
> part$var
$cluster1
squared loading correlation
1 1
$cluster2
squared loading correlation
1 1
$cluster3
squared loading correlation
1 1
$cluster4
squared loading correlation
1 1
$cluster5
squared loading correlation
tot_open_tr 0.8845115 0.9597241
tot_rev_tr 0.8845115 0.9404847
$cluster6
squared loading correlation
1 1
$cluster7
squared loading correlation
1 1
$cluster8
squared loading correlation
1 1
$cluster9
squared loading correlation
1 1
$cluster10
squared loading correlation
loan_amt 0.9275256 0.9630813
purch_price 0.9253048 0.9619276
msrp 0.9036108 0.9506082
$cluster11
squared loading correlation
1 1
$cluster12
squared loading correlation
1 1
$cluster13
squared loading correlation
1 1
$cluster14
squared loading correlation
1 1
$cluster15
squared loading correlation
1 1
$cluster16
squared loading correlation
1 1