• Hierarchical cluster算法介绍


      突然想记录几个聚类算法,由于实力有限就先介绍一下层次聚类算法(Hierarchical cluster algorithm),这个聚类算法思想简单,但实现起来感觉复杂度挺大;以前看过《集体智慧编程》里介绍过,里面是用python实现的,由于python里面的列表和字典用起来方便,故实现该算法还行;这里我用c++重新写了一下,感觉代码蛮臃肿,可能是自己的c++没有学习好吧!!!对于容器的使用还不够熟练,这里贴出来的目的是希望哪位大牛看到了指导一二,这里感激不尽。废话不多说了,进入正题吧!

    ************************************************************************************************************

    Hierarchical cluster Algorithm的大致介绍

      层次聚类算法有两种实现思想,一种是初始时将每个待聚类的数据样本视为一个cluster,采用合并的方式,每次合并两个"距离"最近的cluster,直到合并成一个cluster为止(当然可以在达到自己设定想得到的cluster个数时终止迭代);另一种刚好与第一种相反,初始时将所有的数据样本视为一个cluster,采用分解的方式(这里没有实现就不说太多)。

    ************************************************************************************************************

    算法的步骤及相关问题

      算法步骤:  (1)初始时,将每个数据样本视为一个cluster(选取一个度量两个cluster距离的方式),

           (2)计算任意两个cluster之间的距离;每次选取距离最小的两个cluster,

           (3)合并(2)中选择的两个cluster,将合并产生的新cluster加入cluster set中,并删除被合并的两个cluster,

           (4)重复(2)(3),直到cluster set中元素只剩下一个为止。

      相关问题: (1)度量两个cluster之间的距离,应该选择哪种距离???《集体智慧编程》中选择的是Pearson,当然也可以直接选用欧氏距离

            (2)如何合并两个cluster,即新的cluster对应的属性值如何表示???这里是用被合并的两个cluster的平均值表示新的cluster

    ******************************************************************************************************************

      1 /**
      2 ** Hierarchical cluster Algorithm
      3 ** step:(1)First, regard each sample as its own cluster.
      4          (2)In each iteration, pick the two clusters with the smallest distance between them and merge them.
      5          (3)Add the merged cluster to the cluster set, and delete the two clusters it replaces from the set.
      6 ** method: (1)merging: the new cluster's attributes are the element-wise average of the two merged clusters;
      7            (2)distance: 1 - Pearson correlation, so strongly correlated clusters are close together.
      8 ** Time:2013/7/10 
      9 **/
     10 #include <iostream>
     11 #include <map>
     12 #include <vector>
     13 #include <string>
     14 #include <fstream> 
     15 #include <cstring>
     16 #include <sstream> 
     17 #include <cmath>
     18 #include <iterator>
     19 using namespace std;
     // One cluster: its attribute vector plus an integer id that
     // identifies it inside the cluster set.
     struct bicluster {
         vector<double> attri; // attribute values of the cluster
         int cid;              // cluster id
     };
     typedef struct bicluster Bicluster;
     // A pair of cluster ids together with the distance between them.
     struct lowpair {
         int leftid;   // id of the first cluster of the pair
         int rightid;  // id of the second cluster of the pair
         double dist;  // distance between the two clusters
     };
     typedef struct lowpair Lpair;
     31 
     /*****************************************************************
     ** Parse the number at the start of a C string.
     ** <sstream> must be included because a stringstream does the work.
     **
     ** @str ------ NUL-terminated text to parse; may be a null pointer
     ** @return --- the parsed value, or 0.0 when str is null or does
     **             not begin with a number
     ******************************************************************/
     double str2double(const char* str){
         // Start from a defined value: a failed extraction is not
         // guaranteed to write to the target on every language level.
         double value = 0.0;
         if( !str )
             return value;
         stringstream ss;
         ss << str;
         ss >> value;
         return value;
     }
     /*****************************************************************
     ** Split a delimited line into a leading label and numeric fields.
     **
     ** Fields are separated by any character of tok. A run of
     ** delimiters counts as one separator, so empty fields are skipped.
     ** The first field is returned unchanged; each later field is
     ** parsed as a double (0.0 when it does not begin with a number)
     ** and appended to dvec.
     **
     ** The line is scanned with string searches instead of strtok, so
     ** str is never written to and an empty line is handled.
     **
     ** @str ------ the line to split (only read)
     ** @dvec ----- receives the numeric fields, appended in order
     ** @tok ------ NUL-terminated set of delimiter characters
     ** @return --- the first field, or "" when str contains no field
     ******************************************************************/
     string split(string &str, vector<double>& dvec, const char* tok){
         string label;
         bool isFirst = true;
         string::size_type start = str.find_first_not_of(tok);
         while( start != string::npos ){
             const string::size_type stop = str.find_first_of(tok, start);
             const string field = str.substr(
                 start, stop == string::npos ? string::npos : stop - start);
             if( isFirst ){
                 label = field;
                 isFirst = false;
             }else{
                 double value = 0.0; // stays 0.0 if the field is not numeric
                 istringstream parser(field);
                 parser >> value;
                 dvec.push_back(value);
             }
             if( stop == string::npos )
                 break;
             start = str.find_first_not_of(tok, stop);
         }
         return label;
     }
     58 /******************************************************************
     59 ** read data from 'blogdata.txt'
     60 ** @is ------- a reference to ifstream object(input)
     61 ** @data ----- a map used to store the data (output)
     62 ******************************************************************/
     63 bool readfile(ifstream &is, map<string, vector<double> >& mydata){
     64     if( is.fail() ){
     65         cerr << "can't open the file !!!" << endl;
     66         return false;
     67     }
     68     //ignore the first line of file
     69     string str;
     70     getline(is, str);
     71     
     72     //store the data read from file into mydata 
     73     while( !is.eof() ){
     74         vector<double> dtmp;
     75         string tmp;
     76         getline(is, str);
     77         tmp = split(str, dtmp, "	");
     78         mydata.insert(pair<string,vector<double> >(tmp, dtmp));
     79     }
     80     return true;         
     81 }
     /*****************************************************************
     ** Distance between two clusters, defined as 1 - r where r is the
     ** Pearson correlation of the two attribute vectors. The more
     ** strongly the vectors are correlated, the smaller the distance:
     ** 0 for r = 1, 2 for r = -1.
     **
     ** Only the leading elements that both vectors have are compared.
     **
     ** @left ----- attributes of the first cluster
     ** @right ---- attributes of the second cluster
     ** @return --- 1 - r, or 0 when the correlation is undefined
     **             (no common elements, or one vector is constant)
     *****************************************************************/
     double distPearson(const vector<double>& left, const vector<double>& right){
         typedef vector<double>::size_type Index;
         const Index len = left.size() < right.size() ? left.size() : right.size();
         if( len == 0 )
             return 0;

         // One pass collects every sum the formula needs.
         double sum1 = 0, sum2 = 0;     // sums of the values
         double sum1Sq = 0, sum2Sq = 0; // sums of the squared values
         double pSum = 0;               // sum of the products
         for(Index i=0; i<len; ++i){
             const double l = left[i];
             const double r = right[i];
             sum1 += l;
             sum2 += r;
             sum1Sq += l*l;
             sum2Sq += r*r;
             pSum += l*r;
         }

         const double n = static_cast<double>(len);
         const double num = pSum - sum1*sum2/n;
         // Each factor is n times the variance of one vector, so the second
         // factor must be built from the sums of right, not of left.
         const double spread = (sum1Sq - sum1*sum1/n) * (sum2Sq - sum2*sum2/n);
         // Written as a negated comparison so that a product rounded to a
         // tiny negative number (or NaN) is also treated as "no spread".
         if( !(spread > 0) )
             return 0;
         return 1.0 - num/sqrt(spread);
     }
    117 /*************************************************************
    118 ** Given two clusters, the distance between them 
    119     should be checked whether it exists before compute it.
    120 **************************************************************/
    121 bool isExist(vector<Lpair> &lp, int leftid, int rightid, double &d){
    122     vector<Lpair>::iterator it = lp.begin();
    123     for(; it!=lp.end(); ++it){
    124         if( (it->leftid==leftid) && (it->rightid==rightid) ){
    125             d = it->dist;//if the distance has been computed, assign its value to d
    126             return true;
    127         }        
    128     }
    129     d = 0;
    130     return false;
    131 }
    132 /*************************************************************
    133 ** Given a cluster's id, delete the cluster from cluster set
    134 **************************************************************/
    135 void Del(vector<Bicluster> &cvec, int clusterid){
    136     vector<Bicluster>::iterator it = cvec.begin();
    137     for(; it!=cvec.end(); ++it){
    138         if( it->cid == clusterid )
    139             break;
    140     }
    141     cvec.erase(it);
    142 } 
    143 /*************************************************************
    144 ** Hierarchical Cluster Algorithm
    145 **************************************************************/
    146 void HierarchicalCluster(map<string, vector<double> > &mydata){
    147     vector<Lpair> distances;//used to store the distance
    148      
    149     //firstly,regard each sample as a cluster
    150     vector<Bicluster> cvec;
    151     map<string, vector<double> >::iterator it = mydata.begin();
    152     int myid = 0;
    153     for(; it!= mydata.end(); ++it){
    154         Bicluster btmp;
    155         btmp.attri = it->second;
    156         btmp.cid = myid++;
    157         cvec.push_back(btmp);
    158     } 
    159     myid = -1;
    160     //search the pair
    161     while( cvec.size()>1 ){
    162         Lpair lowp;
    163         double closedis = distPearson(cvec[0].attri,cvec[1].attri);
    164         lowp.leftid = cvec[0].cid, lowp.rightid = cvec[1].cid;
    165         lowp.dist = closedis;
    166         
    167         int leftps = 0, rightps = 1;
    168         for(int ix=0; ix<cvec.size(); ++ix){
    169             for(int iy=ix+1; iy<cvec.size(); ++iy){
    170                 double d;
    171                 int lid = cvec[ix].cid, rid = cvec[iy].cid;
    172                 if( !isExist(distances,lid,rid,d) ){
    173                     Lpair lptmp;
    174                     lptmp.dist = distPearson(cvec[ix].attri, cvec[iy].attri);
    175                     lptmp.leftid = lid;
    176                     lptmp.rightid= rid;
    177                     distances.push_back(lptmp);
    178                     d = lptmp.dist;
    179                   } 
    180                  if( d < lowp.dist ){
    181                      lowp.leftid = lid;
    182                      lowp.rightid = rid;
    183                      leftps = ix;
    184                      rightps = iy;
    185                      lowp.dist = d;
    186                  }
    187             }
    188         }
    189         //create a new cluster
    190         Bicluster ncluster;
    191         for(int i=0; i<cvec[0].attri.size(); ++i){
    192             double av;
    193             av = (cvec[leftps].attri[i] + cvec[rightps].attri[i]) / 2.0;
    194             ncluster.attri.push_back(av);
    195         }
    196         ncluster.cid = myid--;//assign negative to the new cluster's id
    197         cout << "leftid: " << lowp.leftid <<  ", rightid: " << lowp.rightid << endl;
    198         //delete the pair
    199         Del(cvec, lowp.leftid); 
    200         Del(cvec, lowp.rightid);
    201         cvec.push_back(ncluster);
    202     } 
    203 } 
    204 int main()
    205 {
    206     ifstream is("blogdata.txt");
    207     if( is.fail() ){
    208         cerr << "error!!!" << endl;
    209         exit(-1);
    210     }
    211     map<string, vector<double> > mydata;
    212     if(readfile(is, mydata))
    213         HierarchicalCluster(mydata);
    214     return 0;
    215 }

      代码写的有点乱且复杂,最后显示的结果不是树状图(python很易实现),只是简单的显示了每次被合并的两个cluster的id.代码中用到的数据可以从http://kiwitobes.com/clusters/blog.txt下载得到。

  • 相关阅读:
    ranorex
    vue.js
    逻辑思维
    laravel-luntan
    python学习--基础
    git
    Laravel-高级篇-Auth-数据迁移-数据填充
    Laravel-高级篇-Artisan
    Laravel-表单篇-零散信息
    Laravel-表单篇-controller
  • 原文地址:https://www.cnblogs.com/Happyhe/p/3183828.html
Copyright © 2020-2023  润新知