0%

聚类算法

聚类算法

K-means算法

主要代码与接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
# X, y: the data set to cluster. X must already be defined before this point;
# y is not used anywhere below.
n_cluster = 4  # number of clusters to fit

# Build and fit the model.
cluster = KMeans(n_clusters=n_cluster, random_state=0).fit(X)

# Cluster label assigned to each sample.
y_pred = cluster.labels_

# Use a DataFrame to list the distinct cluster labels that were produced.
res = pd.DataFrame(y_pred)
res.iloc[:, 0].unique()

# KMeans needs no separate prediction step: calling fit() alone already yields
# the clustering result. It also exposes predict and fit_predict, which learn
# from X and predict the cluster of each sample in X, but the result is the
# same as reading the labels_ attribute right after fit().
# (The fitted model is `cluster`; the original referred to an undefined
# `cluster1` here.)
pre = cluster.fit_predict(X)

# Cluster centres (centroid coordinates; they can also serve as a feature
# summary of each cluster).
centroid = cluster.cluster_centers_
centroid

评估指标:轮廓系数(silhouette score)

1
2
3
4
from sklearn.metrics import silhouette_samples, silhouette_score

# Mean silhouette coefficient of the clustering `pre` on `X`;
# the closer the value is to 1, the better the clustering.
silhouette_score(X, pre)

自动化暴力调参

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

# Brute-force search over the number of clusters: fit KMeans for every
# candidate count and keep the one with the highest silhouette score.
# X must already be defined before this point.
maxScore = -float('inf')  # best silhouette score seen so far
for n_clusters in range(3, 20):  # candidate cluster counts 3..19
    cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    pre_n = cluster.labels_
    curScore = silhouette_score(X, pre_n)
    if curScore > maxScore:
        # New best: remember both the score and the cluster count.
        maxScore = curScore
        max_clusters = n_clusters
print("最大轮廓系数:",maxScore,"最优簇数:",max_clusters)

DBSCAN算法

主要代码与接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
# make_blobs is imported from sklearn.datasets; the old
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
# X, y: the data set. X must already be defined before this point.
# i, j: the eps and min_samples values to evaluate; they must also be defined
# (for example by the loops of a parameter search).

# Build and fit the model.
db = DBSCAN(eps=i, min_samples=j).fit(X)

# Cluster label assigned to each sample by DBSCAN (-1 marks noise points).
pre_n = db.labels_

# Evaluation metric: silhouette score, computed on the same data the model was
# fitted on (the original used an undefined `X_s`). Larger is better.
# NOTE: silhouette_score raises ValueError when the labels do not contain at
# least two distinct values, so some (eps, min_samples) combinations cannot be
# scored.
curScore = metrics.silhouette_score(X, pre_n)

# Fraction of samples classified as noise (label -1). The denominator is the
# total number of samples (the original used an undefined `labels`).
raito = len(pre_n[pre_n[:] == -1]) / len(pre_n)

# Number of clusters found, not counting the noise label.
n_clusters_ = len(set(pre_n)) - (1 if -1 in pre_n else 0)

# Report the score in the third position (the original printed an undefined `k`).
print([i, j, curScore, raito, n_clusters_])

自动化暴力调参

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
# make_blobs is imported from sklearn.datasets; the old
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Brute-force search over DBSCAN's eps and min_samples, keeping the pair with
# the highest silhouette score. X must already be defined before this point.

# One row per scorable parameter pair:
# [eps, min_samples, silhouette score, noise ratio, number of clusters].
rs = []
eps = np.arange(5, 50, 1)  # eps candidates: 5, 6, ..., 49
min_samples = np.arange(2, 20, 1)  # min_samples candidates: 2, 3, ..., 19

# Start from -inf so that a negative silhouette score can still be selected.
# If no parameter pair can be scored, the three values below are printed
# unchanged.
best_score = -float('inf')
best_score_eps = 0
best_score_min_samples = 0

for i in eps:
    for j in min_samples:
        db = DBSCAN(eps=i, min_samples=j).fit(X)
        pre_n = db.labels_  # cluster label per sample; -1 marks noise

        # Only the scoring call is guarded: silhouette_score raises ValueError
        # when the labels do not contain at least two distinct values. Such
        # parameter pairs are skipped. A bare `except` here would also hide
        # genuine bugs such as a NameError.
        try:
            curScore = metrics.silhouette_score(X, pre_n)  # larger is better
        except ValueError:
            continue

        # Fraction of samples classified as noise.
        raito = len(pre_n[pre_n[:] == -1]) / len(pre_n)
        # Number of clusters found, not counting the noise label.
        n_clusters_ = len(set(pre_n)) - (1 if -1 in pre_n else 0)
        rs.append([i, j, curScore, raito, n_clusters_])

        if curScore > best_score:
            best_score = curScore
            best_score_eps = i
            best_score_min_samples = j

print(best_score
      ,best_score_eps
      ,best_score_min_samples)