• K-means 不知 k 值时的自动无监督分类(下文代码采用基于距离阈值的层次聚类实现)


# -*- coding:UTF-8 -*-
import math
import time

import jieba as jb
from numpy import *
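# Compute the TF-IDF weights.
# NOTE(review): the original comment also says the weights are stored as txt,
# but no code in this section writes a file.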
      6 # 计算所有文本包含的总词数
      7 def wordsCount(dataSet):
      8     wordsCnt = 0
      9     for document in dataSet:
     10         wordsCnt += len(document)
     11     return wordsCnt
     13 # 创建不重复的词条列表
     14 def createVocabList(dataSet):
     15     vocabSet = set([])
     16     for document in dataSet:
     17         vocabSet = vocabSet | set(document)
     18     return list(vocabSet)
     20 # 将文本转化为词袋模型
     21 def bagOfWords2Vec(vocabList, inputSet):
     22     returnVec = [0] * len(vocabList)
     23     for word in inputSet:
     24         if word in vocabList:
     25             returnVec[vocabList.index(word)] += 1
     26         else:
     27             print("the word: %s is not in my Vocabulary!" % word)
     28     return returnVec
     30 # 计算包含某个词的文本数
     31 def wordInFileCount(word, cutWordList):
     32     fileCnt = 0
     33     for i in cutWordList:
     34         for j in i:
     35             if word == j:
     36                 fileCnt = fileCnt + 1
     37             else:
     38                 continue
     39     return fileCnt
     41 def calTFIDF(dataSet):
     42     fileCnt = len(dataSet)  # 文本数
     43     vocabList = createVocabList(dataSet)  # 词条列表
     44     tfidfSet = []
     46     for line in dataSet:
     47         wordsBag = bagOfWords2Vec(vocabList, line)  # 每行文本对应的词袋向量
     48         lineWordsCnt = 0
     49         for i in range(len(wordsBag)):
     50             lineWordsCnt += wordsBag[i]  # 计算每个文本中包含的总词数
     51         tfidfList = [0] * len(vocabList)
     52         for word in line:
     53             wordinfileCnt = wordInFileCount(word, dataSet)  # 包含该词的文本数
     54             wordCnt = wordsBag[vocabList.index(word)]  # 该词在文本中出现的次数
     55             tf = float(wordCnt) / lineWordsCnt
     56             idf = math.log(float(fileCnt) / (wordinfileCnt + 1))
     57             tfidf = tf * idf
     58             tfidfList[vocabList.index(word)] = tfidf
     59         print(tfidfList)
     60         print(map(str, tfidfList))
     61         tfidfSet.append(tfidfList)
     63     return tfidfSet
     65 # 计算余弦距离
     66 def gen_sim(A, B):
     67     num = float(dot(mat(A), mat(B).T))
     68     denum = linalg.norm(A) * linalg.norm(B)
     69     if denum == 0:
     70         denum = 1
     71     cosn = num / denum
     72     sim = 0.5 + 0.5 * cosn  # 余弦值为[-1,1],归一化为[0,1],值越大相似度越大
     73     sim = 1 - sim  # 将其转化为值越小距离越近
     74     return sim
     77 # 计算两个簇的评均距离
     78 def distAvg(dataSet1, dataSet2):
     79     avgD = 0
     80     sumD = 0
     81     m = shape(dataSet1)[0]
     82     n = shape(dataSet2)[0]
     83     for i in range(m):
     84         for j in range(n):
     85             dist = gen_sim(dataSet1[i], dataSet2[j])
     86             sumD += dist
     87     avgD = sumD / (m * n)
     88     return avgD
     90 # 找到距离最近的两个簇
     91 def findMin(M):
     92     minDist = inf
     93     m = shape(M)[0]
     94     for i in range(m):
     95         for j in range(m):
     96             if i != j and M[i, j] < minDist:
     97                 minDist = M[i, j]
     98                 minI = i
     99                 minJ = j
    100     return minI, minJ, minDist
    103 # 层次聚类算法
    104 def hCluster(dataSet, k, dist, distMeas=distAvg):
    105     m = shape(dataSet)[0]
    106     clusterAssment = mat(zeros((m, 1)))
    107     performMeasure = []
    108     M = mat(zeros((m, m)))  # 距离矩阵
    109     # 初始化聚类簇,每个样本作为一个类
    110     for ii in range(m):
    111         clusterAssment[ii, 0] = ii
    113     for i in range(m):
    114         for j in range(i + 1, m):
    115             dataSeti = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
    116             dataSetj = dataSet[nonzero(clusterAssment[:, 0].A == j)[0], :]
    117             M[i, j] = distMeas(dataSeti, dataSetj)
    118             M[j, i] = M[i, j]
    119         if mod(i,10) == 0: print(i)
    120     q = m  # 设置当前聚类个数
    121     minDist = 0
    122     # while (q > k):
    123     while (minDist < dist):
    124         i, j, minDist = findMin(M)  # 找到距离最小的两个簇
    125         # 把第j个簇归并到第i个簇
    126         clusterAssment[nonzero(clusterAssment[:, 0].A == j)[0], 0] = i
    127         for l in range(j + 1, q):  # 将j之后的簇重新编号
    128             clusterAssment[nonzero(clusterAssment[:, 0].A == l)[0], 0] = l - 1
    129         M = delete(M, j, axis=0)
    130         M = delete(M, j, axis=1)
    131         for l in range(q - 1):  # 重新计算第i个簇和其他簇直接的距离
    132             dataSeti = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
    133             dataSetl = dataSet[nonzero(clusterAssment[:, 0].A == l)[0], :]
    134             M[i, l] = distMeas(dataSeti, dataSetl)
    135             M[l, i] = M[i, l]
    137         # DBI = DBIvalue(dataSet, clusterAssment, q)
    138         # DI = DIvalue(dataSet, clusterAssment, q)
    139         DBI = 0
    140         DI = 0
    142         performMeasure.append([q - 1, minDist, DBI, DI])
    144         q = q - 1
    146         print(u'当前簇的个数是:', q)
    147         print(u'距离最小的两个簇是第%d个和第%d个,距离是%f,DBI值是%f,DI值是%f' % (
    148             i, j, minDist, DBI, DI))
    150     return clusterAssment, mat(performMeasure)
    152 def saveResult(clusterAssment):
    153     listResult = clusterAssment.tolist()  # 矩阵转换为list
    154     for i in range(len(listResult)):
    155         print(map(str, listResult[i]))
    158 if __name__ =='__main__':
    159     a=["实施", "效益","节本","10"]
    160     m=mat(calTFIDF(a))
    161     clustAssing, performMeasure = hCluster(m, 0, 0.3)
    162     print(clustAssing)
    163     saveResult(clustAssing)
