本文介绍如何使用Python绘制基本的树形图(dendrogram)。绘制树形图首先需要一个数值矩阵:每一行代表一个实体(这里是一辆汽车),每一列是描述该实体的一个变量。目标是对实体进行聚类,以了解哪些实体彼此相似。在Python中,可以通过scipy.cluster.hierarchy.linkage进行层次聚类,再用scipy.cluster.hierarchy.dendrogram绘制树形图。参考文档:https://python-graph-gallery.com/dendrogram/
该章节主要内容有:
画树形图,往往第一列是数据实体名字,即物体种类。其他列分别为物体变量。
# 导入库
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy
import numpy as np
# Download the mtcars dataset from the Python Graph Gallery site into a
# DataFrame (one row per car model).
url = "https://python-graph-gallery.com/wp-content/uploads/mtcars.csv"
df = pd.read_csv(filepath_or_buffer=url)
# Notebook-style display of the loaded table.
df
.dataframe tbody tr th:only-of-type { vertical-align: middle }
.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }
model
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
0
Mazda RX4
21.0
6
160.0
110
3.90
2.620
16.46
0
1
4
4
1
Mazda RX4 Wag
21.0
6
160.0
110
3.90
2.875
17.02
0
1
4
4
2
Datsun 710
22.8
4
108.0
93
3.85
2.320
18.61
1
1
4
1
3
Hornet 4 Drive
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
4
Hornet Sportabout
18.7
8
360.0
175
3.15
3.440
17.02
0
0
3
2
5
Valiant
18.1
6
225.0
105
2.76
3.460
20.22
1
0
3
1
6
Duster 360
14.3
8
360.0
245
3.21
3.570
15.84
0
0
3
4
7
Merc 240D
24.4
4
146.7
62
3.69
3.190
20.00
1
0
4
2
8
Merc 230
22.8
4
140.8
95
3.92
3.150
22.90
1
0
4
2
9
Merc 280
19.2
6
167.6
123
3.92
3.440
18.30
1
0
4
4
10
Merc 280C
17.8
6
167.6
123
3.92
3.440
18.90
1
0
4
4
11
Merc 450SE
16.4
8
275.8
180
3.07
4.070
17.40
0
0
3
3
12
Merc 450SL
17.3
8
275.8
180
3.07
3.730
17.60
0
0
3
3
13
Merc 450SLC
15.2
8
275.8
180
3.07
3.780
18.00
0
0
3
3
14
Cadillac Fleetwood
10.4
8
472.0
205
2.93
5.250
17.98
0
0
3
4
15
Lincoln Continental
10.4
8
460.0
215
3.00
5.424
17.82
0
0
3
4
16
Chrysler Imperial
14.7
8
440.0
230
3.23
5.345
17.42
0
0
3
4
17
Fiat 128
32.4
4
78.7
66
4.08
2.200
19.47
1
1
4
1
18
Honda Civic
30.4
4
75.7
52
4.93
1.615
18.52
1
1
4
2
19
Toyota Corolla
33.9
4
71.1
65
4.22
1.835
19.90
1
1
4
1
20
Toyota Corona
21.5
4
120.1
97
3.70
2.465
20.01
1
0
3
1
21
Dodge Challenger
15.5
8
318.0
150
2.76
3.520
16.87
0
0
3
2
22
AMC Javelin
15.2
8
304.0
150
3.15
3.435
17.30
0
0
3
2
23
Camaro Z28
13.3
8
350.0
245
3.73
3.840
15.41
0
0
3
4
24
Pontiac Firebird
19.2
8
400.0
175
3.08
3.845
17.05
0
0
3
2
25
Fiat X1-9
27.3
4
79.0
66
4.08
1.935
18.90
1
1
4
1
26
Porsche 914-2
26.0
4
120.3
91
4.43
2.140
16.70
0
1
5
2
27
Lotus Europa
30.4
4
95.1
113
3.77
1.513
16.90
1
1
5
2
28
Ford Pantera L
15.8
8
351.0
264
4.22
3.170
14.50
0
1
5
4
29
Ferrari Dino
19.7
6
145.0
175
3.62
2.770
15.50
0
1
5
6
30
Maserati Bora
15.0
8
301.0
335
3.54
3.570
14.60
0
1
5
8
31
Volvo 142E
21.4
4
121.0
109
4.11
2.780
18.60
1
1
4
2
# The car name has to become the row label: move the 'model' column
# (the type of car) into the index so the remaining columns are the
# variables describing each car.
df = df.set_index(keys='model')
# Notebook-style display of the re-indexed table.
df
.dataframe tbody tr th:only-of-type { vertical-align: middle }
.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
model
Mazda RX4
21.0
6
160.0
110
3.90
2.620
16.46
0
1
4
4
Mazda RX4 Wag
21.0
6
160.0
110
3.90
2.875
17.02
0
1
4
4
Datsun 710
22.8
4
108.0
93
3.85
2.320
18.61
1
1
4
1
Hornet 4 Drive
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
Hornet Sportabout
18.7
8
360.0
175
3.15
3.440
17.02
0
0
3
2
Valiant
18.1
6
225.0
105
2.76
3.460
20.22
1
0
3
1
Duster 360
14.3
8
360.0
245
3.21
3.570
15.84
0
0
3
4
Merc 240D
24.4
4
146.7
62
3.69
3.190
20.00
1
0
4
2
Merc 230
22.8
4
140.8
95
3.92
3.150
22.90
1
0
4
2
Merc 280
19.2
6
167.6
123
3.92
3.440
18.30
1
0
4
4
Merc 280C
17.8
6
167.6
123
3.92
3.440
18.90
1
0
4
4
Merc 450SE
16.4
8
275.8
180
3.07
4.070
17.40
0
0
3
3
Merc 450SL
17.3
8
275.8
180
3.07
3.730
17.60
0
0
3
3
Merc 450SLC
15.2
8
275.8
180
3.07
3.780
18.00
0
0
3
3
Cadillac Fleetwood
10.4
8
472.0
205
2.93
5.250
17.98
0
0
3
4
Lincoln Continental
10.4
8
460.0
215
3.00
5.424
17.82
0
0
3
4
Chrysler Imperial
14.7
8
440.0
230
3.23
5.345
17.42
0
0
3
4
Fiat 128
32.4
4
78.7
66
4.08
2.200
19.47
1
1
4
1
Honda Civic
30.4
4
75.7
52
4.93
1.615
18.52
1
1
4
2
Toyota Corolla
33.9
4
71.1
65
4.22
1.835
19.90
1
1
4
1
Toyota Corona
21.5
4
120.1
97
3.70
2.465
20.01
1
0
3
1
Dodge Challenger
15.5
8
318.0
150
2.76
3.520
16.87
0
0
3
2
AMC Javelin
15.2
8
304.0
150
3.15
3.435
17.30
0
0
3
2
Camaro Z28
13.3
8
350.0
245
3.73
3.840
15.41
0
0
3
4
Pontiac Firebird
19.2
8
400.0
175
3.08
3.845
17.05
0
0
3
2
Fiat X1-9
27.3
4
79.0
66
4.08
1.935
18.90
1
1
4
1
Porsche 914-2
26.0
4
120.3
91
4.43
2.140
16.70
0
1
5
2
Lotus Europa
30.4
4
95.1
113
3.77
1.513
16.90
1
1
5
2
Ford Pantera L
15.8
8
351.0
264
4.22
3.170
14.50
0
1
5
4
Ferrari Dino
19.7
6
145.0
175
3.62
2.770
15.50
0
1
5
6
Maserati Bora
15.0
8
301.0
335
3.54
3.570
14.60
0
1
5
8
Volvo 142E
21.4
4
121.0
109
4.11
2.780
18.60
1
1
4
2
# Remove the title of the row index ('model') so that it is no longer
# shown above the row labels.
# `del df.index.name` raises AttributeError on current pandas releases,
# where Index.name is a property without a deleter; assigning None clears
# the name and works on old and new versions alike.
df.index.name = None
# Notebook-style display of the table without the index title.
df
.dataframe tbody tr th:only-of-type { vertical-align: middle }
.dataframe tbody tr th { vertical-align: top }
.dataframe thead th { text-align: right }
mpg
cyl
disp
hp
drat
wt
qsec
vs
am
gear
carb
Mazda RX4
21.0
6
160.0
110
3.90
2.620
16.46
0
1
4
4
Mazda RX4 Wag
21.0
6
160.0
110
3.90
2.875
17.02
0
1
4
4
Datsun 710
22.8
4
108.0
93
3.85
2.320
18.61
1
1
4
1
Hornet 4 Drive
21.4
6
258.0
110
3.08
3.215
19.44
1
0
3
1
Hornet Sportabout
18.7
8
360.0
175
3.15
3.440
17.02
0
0
3
2
Valiant
18.1
6
225.0
105
2.76
3.460
20.22
1
0
3
1
Duster 360
14.3
8
360.0
245
3.21
3.570
15.84
0
0
3
4
Merc 240D
24.4
4
146.7
62
3.69
3.190
20.00
1
0
4
2
Merc 230
22.8
4
140.8
95
3.92
3.150
22.90
1
0
4
2
Merc 280
19.2
6
167.6
123
3.92
3.440
18.30
1
0
4
4
Merc 280C
17.8
6
167.6
123
3.92
3.440
18.90
1
0
4
4
Merc 450SE
16.4
8
275.8
180
3.07
4.070
17.40
0
0
3
3
Merc 450SL
17.3
8
275.8
180
3.07
3.730
17.60
0
0
3
3
Merc 450SLC
15.2
8
275.8
180
3.07
3.780
18.00
0
0
3
3
Cadillac Fleetwood
10.4
8
472.0
205
2.93
5.250
17.98
0
0
3
4
Lincoln Continental
10.4
8
460.0
215
3.00
5.424
17.82
0
0
3
4
Chrysler Imperial
14.7
8
440.0
230
3.23
5.345
17.42
0
0
3
4
Fiat 128
32.4
4
78.7
66
4.08
2.200
19.47
1
1
4
1
Honda Civic
30.4
4
75.7
52
4.93
1.615
18.52
1
1
4
2
Toyota Corolla
33.9
4
71.1
65
4.22
1.835
19.90
1
1
4
1
Toyota Corona
21.5
4
120.1
97
3.70
2.465
20.01
1
0
3
1
Dodge Challenger
15.5
8
318.0
150
2.76
3.520
16.87
0
0
3
2
AMC Javelin
15.2
8
304.0
150
3.15
3.435
17.30
0
0
3
2
Camaro Z28
13.3
8
350.0
245
3.73
3.840
15.41
0
0
3
4
Pontiac Firebird
19.2
8
400.0
175
3.08
3.845
17.05
0
0
3
2
Fiat X1-9
27.3
4
79.0
66
4.08
1.935
18.90
1
1
4
1
Porsche 914-2
26.0
4
120.3
91
4.43
2.140
16.70
0
1
5
2
Lotus Europa
30.4
4
95.1
113
3.77
1.513
16.90
1
1
5
2
Ford Pantera L
15.8
8
351.0
264
4.22
3.170
14.50
0
1
5
4
Ferrari Dino
19.7
6
145.0
175
3.62
2.770
15.50
0
1
5
6
Maserati Bora
15.0
8
301.0
335
3.54
3.570
14.60
0
1
5
8
Volvo 142E
21.4
4
121.0
109
4.11
2.780
18.60
1
1
4
2
# Run hierarchical clustering on the rows of df.
# Function prototype, for reference:
#   scipy.cluster.hierarchy.linkage(y, method='single', metric='euclidean',
#                                   optimal_ordering=False)
# y is the input matrix, method the linkage method and metric the distance
# measure; 'ward' is usually a dependable choice.
# optimal_ordering reorders the linkage matrix so that the distance between
# consecutive leaves is minimal, which gives a more intuitive tree at the
# price of a slower computation.
# Parameter reference:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
Z = hierarchy.linkage(df, method='ward')

# Title and axis captions of the figure.
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance (Ward)')

# Draw the dendrogram. Commonly used arguments: `labels` sets the text shown
# under each leaf, `leaf_rotation` rotates that text (in degrees).
# Full reference:
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html
hierarchy.dendrogram(Z, leaf_rotation=90, labels=df.index);
叶标签 leaf label
聚类簇数 number of clusters
颜色 color
截减 truncate
方向 orientation
# Leaf labels: compute the Ward linkage of the rows of df, then draw the
# tree with the row labels of df under the leaves, rotated by 90 degrees
# and printed with font size 8.
Z = hierarchy.linkage(df, method='ward')
hierarchy.dendrogram(Z, labels=df.index, leaf_font_size=8, leaf_rotation=90);
# Number of clusters
# Build the Ward linkage matrix from the rows of df.
Z = hierarchy.linkage(df, method='ward')
# Control the number of clusters shown in the plot: color_threshold sets the
# colouring cut-off, links below it are coloured as one cluster per
# cluster node.
hierarchy.dendrogram(Z, color_threshold=240)
# Mark the cut-off with a horizontal line: y is the height, then the colour,
# the line width and the line style.
plt.axhline(y=240, color='grey', linewidth=1, ls='dashed');
# Colours
# Build the Ward linkage matrix from the rows of df.
Z = hierarchy.linkage(df, method='ward')
# Choose the colours used for the clusters.
hierarchy.set_link_color_palette(['#b30000', '#996600', '#b30086'])
# Draw the tree; above_threshold_color is the colour of the links that lie
# above color_threshold.
hierarchy.dendrogram(Z, above_threshold_color='grey', color_threshold=240)
# Mark the threshold with a thin dashed grey horizontal line.
plt.axhline(y=240, color='grey', linewidth=1, ls='dashed');
# Truncation
# A dendrogram is hard to read when the original observation matrix is
# large; truncation condenses the tree. Available modes:
#   1. None    - no truncation is performed
#   2. 'lastp' - p sets the number of leaves, i.e. nodes on the bottom level
#   3. 'level' - p sets the maximum number of levels shown in the plot
# Build the Ward linkage matrix from the rows of df.
Z = hierarchy.linkage(df, method='ward')
# Mode 'lastp': the plot ends up with 4 leaves at the bottom.
hierarchy.dendrogram(Z, p=4, truncate_mode='lastp');
# Mode 'level': no more than p levels of the tree are displayed.
hierarchy.dendrogram(Z, p=2, truncate_mode='level');
# Orientation
# Build the Ward linkage matrix from the rows of df.
Z = hierarchy.linkage(df, method='ward')
# `orientation` selects the direction of the tree; it accepts "top", "left",
# "bottom" or "right" and defaults to "top".
hierarchy.dendrogram(Z, labels=df.index, orientation="right");
# Same tree drawn with the "bottom" orientation.
hierarchy.dendrogram(Z, labels=df.index, orientation="bottom");
# Build the Ward linkage matrix from the rows of df.
Z = hierarchy.linkage(df, 'ward')
# Draw a horizontal dendrogram; with orientation="left" the row labels of df
# are placed on the y axis.
hierarchy.dendrogram(Z, labels=df.index, leaf_rotation=0, orientation="left",
                     color_threshold=240, above_threshold_color='grey')
# Palette with 3 colours, one for each of the 3 'cyl' possibilities.
# pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in
# Matplotlib 3.9.
my_palette = plt.get_cmap("Accent", 3)
# Turn the 'cyl' column into a categorical variable so that every level gets
# an integer code.
df['cyl'] = pd.Categorical(df['cyl'])
# Integer category code of each row, indexed by the row labels of df.
my_color = df['cyl'].cat.codes
# Apply the matching colour to each label.
ax = plt.gca()
# The tick labels follow the leaf order of the dendrogram, not the row order
# of df, so the code is looked up by the label text instead of by position.
for lbl in ax.get_ymajorticklabels():
    val = int(my_color[lbl.get_text()])
    lbl.set_color(my_palette(val))
手机扫一扫
移动阅读更方便
你可能感兴趣的文章