• 数据挖掘算法以及其实现zz


    实验一    分类技术及其应用

    实习要求: 基于线性回归模型拟合一个班学生的学习成绩,建立预测模型。数据可由自己建立100个学生的学习成绩。

    1)    算法思想:

    最小二乘法
    设经验方程是y=F(x),方程中含有一些待定系数an,给出真实值{(xi,yi)|i=1,2,...n},将这些x,y值 代入方程然后作差,可以描述误差:yi-F(xi),为了考虑整体的误差,可以取平方和,之所以要平方是考虑到误差可正可负直接相加可以相互抵消,所以记 误差为:

    e=∑(yi-F(xi))^2

    它是一个多元函数,有an共n个未知量,现在要求的是最小值。所以必然满足对各变量的偏导等于0,于是得到n个方程:

    de/da1=0
    de/da2=0
    ...
    de/dan=0

    n个方程、n个未知量,理论上可以解出各个待定系数。用这种误差分析的方法求回归方程的方法就是最小二乘法。

    线性回归
    如果经验方程是线性的,形如y=ax+b,就是线性回归。按上面的分析,误差函数为:

    e=∑(yi-axi-b)^2

    各偏导为:

    de/da=-2∑(yi-axi-b)xi=0
    de/db=-2∑(yi-axi-b)=0

    于是得到关于a,b的线性方程组:

    (∑xi^2)a+(∑xi)b=∑yixi
    (∑xi)a+nb=∑yi

    设A=∑xi^2,B=∑xi,C=∑yixi,D=∑yi,则方程化为:

    Aa+Bb=C
    Ba+nb=D

    解出a,b得:

    a=(Cn-BD)/(An-BB)
    b=(AD-CB)/(An-BB)

    2)    编程实现算法

    C++程序:

    #include<iostream>

    #include<math.h>

    using namespace std;

    // Least-squares fit of the line y = a*x + b to n (study hours, average score)
    // samples read from stdin, followed by one prediction for a user-supplied x.
    //
    // Running sums: A = sum(x^2), B = sum(x), C = sum(x*y), D = sum(y).
    // The normal equations give
    //     a = (C*n - B*D) / (A*n - B*B),   b = (A*D - C*B) / (A*n - B*B).
    //
    // Returns 0 on success, 1 when the input is unusable or the system is
    // singular (all x equal, or fewer than two distinct samples).
    int main()
    {
       double x,y;
       double A=0.0,B=0.0,C=0.0,D=0.0;
       int n,sno;

       cout<<"请拟合输入样本数目"<<endl;
       if(!(cin>>n)||n<=0)
       {
          cerr<<"Error!Invalid sample count"<<endl;
          return 1;
       }

       for(int i=0;i<n;++i)
       {
          cout<<"请输入第"<<i+1<<"个学生学号"<<endl;
          cin>>sno;   // the student number is read but does not take part in the fit
          cout<<"请输入学生上自习时间,按照每天小时计算"<<endl;
          cin>>x;
          cout<<"请输入学生请输入平均成绩"<<endl;
          cin>>y;
          if(!cin)
          {
             cerr<<"Error!Invalid sample input"<<endl;
             return 1;
          }
          A+=x*x;
          B+=x;
          C+=x*y;
          D+=y;
       }

       // Check the determinant BEFORE dividing by it; the coefficients are
       // meaningless (inf/nan) when it is (numerically) zero.
       const double delta=A*n-B*B;
       if(fabs(delta)<1e-10)
       {
          cerr<<"Error!Divide by zero"<<endl;
          return 1;
       }

       const double a=(C*n-B*D)/delta;
       const double b=(A*D-C*B)/delta;
       cout<<"a="<<a<<endl
           <<"b="<<b<<endl;

       cout<<"输入您想预测的成绩,先输入平均日自习时间(小时)"<<endl;
       double avgstudy;   // hours may be fractional, so this must not be an int
       if(!(cin>>avgstudy))
       {
          cerr<<"Error!Invalid study time"<<endl;
          return 1;
       }
       cout<<a*avgstudy+b<<endl;
       return 0;
    }

    3)    输出运算结果

    输入是将各个同学的上自习的时间 按照小时计算

    比如(4,85)(5,94),将成绩和上自习时间进行相应的线性回归

    ,推导出相应的线性方程,以便今后对其他学生上自习以及成绩的估测。

    实习二    聚类技术及其应用

    实习题1  编程验证单连接凝聚聚类算法,实验数据可使用第五章表5.2 的数据进行。要求输出层次聚类过程中每一步的聚类结果。

    实习题2  利用K-均值聚类算法对如下数据进行聚类,其中输入K=3,数据集为

    { 2,4,10,12,3,20,30,11,25,23,34,22} 。

    要求输出每个类及其中的元素。

    1)算法基本思想的描述

    Given k, the k-means algorithm is implemented in four steps:

        – Partition objects into k nonempty subsets

        – Compute seed points as the centroids of the  clusters of the  current partition (the centroid is the center, i.e., mean point, of the cluster)

       – Assign each object to the cluster with the nearest  seed point

       – Go back to Step 2, stop when no more new assignment

    2)编程实现算法

    //***********引入库函数

     

    #include "iostream.h"

    #include "math.h"

    #include "stdlib.h"

    #include "iomanip.h"

    #include "time.h"

    #include "fstream.h"

     

    //************* Constant definitions
    // NOTE(review): none of these constants is referenced by the code visible in
    // this file. Their names (Markov chain length, inner/outer loop limits,
    // decline rate, accept rate) suggest parameters of a simulated-annealing
    // search that is declared in class SAA but not defined here - TODO confirm.

    const int TRUE=1;

    const int FALSE=0;

    const int MarkovLengh=10000;

    const int MaxInnerLoop=10000;

    const int MaxOuterLoop=60;

    const double CO=0.1;

    const double DeclineRate=0.95;

    const long MAX=100000;

    const int AcceptRate=1;

    const double ForceDecline=0.9;

     

     

    //************ Global variables shared by GETDATA and SAA

     

    int DataNum;               // number of samples to cluster

    int Dimension;             // number of components per sample

    int K;                     // number of clusters

    double *DataSet;            // flat sample buffer; component j of sample i is DataSet[i*Dimension+j]

    int HALT=0;                // loop flag of KMeans/KMeans1: set to 1 once no cluster center changed

    int Row=3;                 // samples printed per output line by Display/DisPlay

     

     

     

    //***************************************************************
    //  Class GETDATA: fills in the global configuration (Dimension,
    //  DataNum, K) and the global sample buffer DataSet, either with
    //  random values or with values typed in by the user.
    //***************************************************************
    class GETDATA
    {
    public:
        // Installs a default configuration and a random sample set.
        GETDATA();

        // Prints the current sample set and configuration.
        void Display();

        // Shows the samples, then lets the user re-enter them or proceed.
        void Initial();

        // Interactively replaces the configuration and the sample set.
        void Input();

        // Random value between the two bounds.
        double FRand(double low,double high);

        // Lower and upper bound of the random range entered in Input().
        double rand1;
        double rand2;
    };

     

    // Default configuration: 50 two-dimensional samples, 4 clusters, every
    // component drawn from rand() and scaled into [0,100].
    GETDATA::GETDATA()
    {
        Dimension=2;
        DataNum=50;
        K=4;

        const int total=Dimension*DataNum;
        DataSet=new double[total];

        // The buffer is flat, so walking it linearly visits the components in
        // the same order as a nested sample/component loop.
        for(int idx=0;idx<total;++idx)
            DataSet[idx]=(((double)rand()/(double)RAND_MAX)*100);
    }

     

    //*****************显示当前待聚类的样本(维数,个数,类别数等)

     

    void GETDATA::Display()

    {

         int i,j;

             cout<<" 当前样本集如下:"<<endl<<" {"<<endl;

             for(i=0;i<DataNum;i++)

                  {

                       cout<<" [";

                       for(j=0;j<Dimension;j++)

                           {

                               cout<<" "<<setw(8)<<DataSet[i*Dimension+j];                           

                           }

                       cout<<" ]  ";

                       if((i+1)%Row==0)

                           cout<<endl;

             }   

             cout<<endl<<" }"<<endl;

     

             cout<<endl<<" 以上实数样本集由计算机在---100之间随机产,其中:"<<endl;

     

         cout<<endl<<" 样本维数Dimension= "<<Dimension<<endl;

         cout<<" 样本数  DataNum= "<<DataNum<<endl;

         cout<<" 类别数  K= "<<K<<endl;

    }

     

     

    //****************输入待聚类样本,包括维数,个数,类别数等

     

    void GETDATA::Input()

    {

         char flag;

         int i,j;

         double s=0;

         cout<<endl<<" 请依次输入: 维数 样本数目 类别数"<<endl;

         cout<<endl<<" 维数Dimension: ";

         cin>>Dimension;

         cout<<endl<<" 样本数目DataNum: ";

         cin>>DataNum;

         cout<<endl<<" 类别数K:";

         cin>>K;

         cout<<endl<<" 随机生成数据输入R  人工输入按B: "<<endl;  delete[]DataSet;

         DataSet=new double[Dimension*DataNum];   

         cin>>flag;

         if(flag=='R'||flag=='r')

         {

             cout<<" 输入随机数生成范围(最小值和最大值):"

                  <<endl<<" 最小值:";

             cin>>rand1;

             cout<<endl<<" 最大值:";

             cin>>rand2;

             for(i=0;i<DataNum;i++)

             {

                  for(j=0;j<Dimension;j++)

                       DataSet[i*Dimension+j]=FRand(rand1,rand2);

             }

     

         }

             else

                  if(flag=='H'||flag=='h')

                  {

                       for(i=0;i<DataNum;i++)

                       {

                           cout<<endl<<" 请输入第"<<i+1<<" 个样本的"<<Dimension<<" 个分量";

                           for(j=0;j<Dimension;j++)

                                cin>>DataSet[i*Dimension+j];

                       }

                  }

                  else

                       cout<<endl<<" 非法数据!";

    }

     

    //****************初始化聚类样本

     

    void GETDATA::Initial()

    {

         char ch;

         GETDATA::Display();

         cout<<endl<<" 重新录入样本输入A  开始聚类B: ";

         cin>>ch;

         while(!(ch=='A'||ch=='a')&&!(ch=='B'||ch=='b'))

         {

             cout<<endl<<" 重新录入样本输入A  开始聚类B: ";

             cin>>ch;

         }

     

         if(ch=='A'||ch=='a')       

             GETDATA::Input();

    }

     

    // Maps one rand() draw linearly onto the interval between rand1 and rand2.
    // The parameters shadow the data members of the same name.
    double GETDATA::FRand(double rand1,double rand2)
    {
        const double fraction=(double)rand()/(double)RAND_MAX;
        const double span=rand2-rand1;
        return rand1+(double)(fraction*span);
    }

     

     

     

     

    //***********************************************************
    // Class SAA: K-means clustering driven by the global
    // configuration (K, DataNum, Dimension) and the global DataSet.
    // NOTE(review): SA, MaxFunc, Generate, the four-argument Compare,
    // CopyStatus and SecondFather are declared here but no definition
    // appears in the visible source.
    //***********************************************************
    class SAA
    {
    public:
        // One sample together with its clustering state.
        struct DataType
        {
            double *data;    // the sample's Dimension components
            int father;      // index of the cluster the sample belongs to
            double *uncle;   // squared distance to each of the K cluster centers
        };

        // One cluster.
        struct ClusterType
        {
            double *center;  // current center, Dimension components
            int sonnum;      // number of samples currently assigned
        };

        SAA();
        void Initialize();
        void KMeans();
        void SA();
        void DisPlay();

        void GetDataset(DataType *dst,double *src,int datanum,int dim);
        void GetValue(double *dst,double *src,int dim);
        int  FindFather(double *distances,int k);
        double SquareDistance(double *a,double *b,int dim);
        int  Compare(double *a,double *b,int dim);
        void NewCenterPlus(ClusterType *clusters,int t,double *sample,int dim);
        void NewCenterReduce(ClusterType *clusters,int t,double *sample,int dim);
        double MaxFunc();
        void Generate(DataType *p1,ClusterType *c1);
        double Compare(DataType *p1,ClusterType *c1,DataType *p2,ClusterType *c2);
        void CopyStatus(DataType *p1,ClusterType *c1,DataType *p2,ClusterType *c2);
        int  SecondFather(DataType *p,int t,int k);
        double AimFunction(DataType *q,ClusterType *c);
        double FRand(double low,double high);
        void KMeans1();

    protected:
        double Temp;
        double AimFunc;      // last value computed by AimFunction()

        DataType *DataMember, *KResult, *CurrentStatus, *NewStatus;
        ClusterType *ClusterMember, *NewCluster, *CurrentCluster;
    }; //end of class SAA

     

     

     

    //************ Constructor: allocates the per-sample and per-cluster storage
    // sized by the globals DataNum, K and Dimension, then copies the global
    // DataSet into DataMember. It must therefore run after those globals have
    // their final values.
    SAA::SAA()
    {
        DataMember=new DataType[DataNum];
        ClusterMember=new ClusterType[K];

        int n=0;
        while(n<DataNum)
        {
            DataType &sample=DataMember[n];
            sample.data=new double[Dimension];
            sample.uncle=new double[K];
            ++n;
        }

        int c=0;
        while(c<K)
        {
            ClusterMember[c].center=new double[Dimension];
            ++c;
        }

        GetDataset(DataMember,DataSet,DataNum,Dimension);
    }//endSAA

     

     

    //****************初始化参数,及开始搜索状态

     

    void SAA::Initialize( )

    {

        

         //K-均值聚类法建立退火聚类的初始状态

    //   KMeans();

     

          

    }

     

    //*******************k-均值法进行聚类

    //************接口:数据,数量,维数,类别

    //逐点聚类方式

     

    void SAA::KMeans()

    {

        

         int i,j,M=1;

         int pa,pb,fa;

         ClusterType *OldCluster;   

     

         //初始化聚类中心

     

         OldCluster=new ClusterType[K];

         for(i=0;i<K;i++)

         {

         //   cout<<endl<<i+1<<"中心:";

             GetValue(ClusterMember[i].center,DataMember[i].data,Dimension);

             ClusterMember[i].sonnum=1;

     

             OldCluster[i].center=new double[Dimension];

             GetValue(OldCluster[i].center,ClusterMember[i].center,Dimension);

         }

     

     

         for(i=0;i<DataNum;i++) 

         {

    //       cout<<endl<<i+1<<": "<<ClusterMember[0].center[0]<<" "<<ClusterMember[1].center[0]<<" son: "<<ClusterMember[0].sonnum;         

             for(j=0;j<K;j++)

             {

                  DataMember[i].uncle[j]=SquareDistance(DataMember[i].data,ClusterMember[j].center,Dimension);

    //            cout<<"   "<<i+1<<"->"<<j+1<<": "<<DataMember[i].uncle[j];   //"类中心"<<ClusterMember[j].center[0]<<": "<<DataMember[i].uncle[j]<<"  ";

             }

             pa=DataMember[i].father=FindFather(DataMember[i].uncle,K);

             if(i>=K)

             {

    //            cout<<endl<<pa<<" 类样本数:"<<ClusterMember[pa].sonnum;

                  ClusterMember[pa].sonnum+=1;

    //            cout<<endl<<pa<<" 类样本数:"<<ClusterMember[pa].sonnum;

                  NewCenterPlus(ClusterMember,pa,DataMember[i].data,Dimension);

    //            cout<<endl<<i+1<<"->"<<pa+1<<"类 :"<<ClusterMember[pa].center[0];

                  GetValue(OldCluster[pa].center,ClusterMember[pa].center,Dimension);

             }

         }

     

         //开始聚类,直到聚类中心不再发生变化。××逐个修改法××

         while(!HALT)

         {

             //一次聚类循环:.重新归类;.修改类中心

             for(i=0;i<DataNum;i++) 

             {

    //            cout<<endl;

                  for(j=0;j<K;j++)

                  {

    //                 cout<<"  D "<<DataMember[i].data[0]<<"  "<<ClusterMember[j].center[0]<<"  ";

                       DataMember[i].uncle[j]=SquareDistance(DataMember[i].data,ClusterMember[j].center,Dimension);

    //              cout<<DataMember[i].data[0]<<"->"<<ClusterMember[0l].center[0]<<" : "<<DataMember[i].uncle[0]<<endl;

    //                 cout<<i+1<<"->"<<j+1<<" "<<DataMember[i].uncle[j];

                  }

                 

                  fa=DataMember[i].father;

     

                 if(fa!=FindFather(DataMember[i].uncle,K)&&ClusterMember[fa].sonnum>1)

                  {

     

                       pa=DataMember[i].father;

                       ClusterMember[pa].sonnum-=1;

     

                       pb=DataMember[i].father=FindFather(DataMember[i].uncle,K);

                       ClusterMember[pb].sonnum+=1;

     

                       NewCenterReduce(ClusterMember,pa,DataMember[i].data,Dimension);

                       NewCenterPlus(ClusterMember,pb,DataMember[i].data,Dimension);

                      

                      

    /*                 cout<<endl<<"*********************"<<M<<" 次聚类:*****************";  //聚一次类输出一次结果

                       cout<<endl<<DataMember[i].data[0]<<" in "<<pa+1<<"类-> "<<pb+1<<"类: ";

     

                       for(t=0;t<K;t++)

                       {

                           cout<<endl<<" 第"<<t+1 <<"类中心: "<<ClusterMember[t].center[0]<<"  样本个数:"<<ClusterMember[t].sonnum;

                       }

     

                       DisPlay();

                       M=M+1;

    */

                  }                     

     

             }//endfor

                 

        

             //判断聚类是否完成,HALT=1,停止聚类

             HALT=0;

             for(j=0;j<K;j++)

                  if(Compare(OldCluster[j].center,ClusterMember[j].center,Dimension))

                       break;

             if(j==K)

                  HALT=1;

     

                 

             for(j=0;j<K;j++)

                  GetValue(OldCluster[j].center,ClusterMember[j].center,Dimension);

        

         }//endwhile

     

    }//end of KMeans

     

    //批聚类方式

     

    void SAA::KMeans1()

    {

         int i,j,M=1;

         int pa,pb,fa;

         ClusterType *OldCluster;   

     

         //初始化聚类中心

     

         OldCluster=new ClusterType[K];

         for(i=0;i<K;i++)

             OldCluster[i].center=new double[Dimension];

     

             for(j=0;j<K;j++)

                  GetValue(OldCluster[j].center,ClusterMember[j].center,Dimension);

     

         //开始聚类,直到聚类中心不再发生变化。××逐个修改法××

         while(!HALT)

         {

             //一次聚类循环:.重新归类;.修改类中心

             for(i=0;i<DataNum;i++) 

             {

                  for(j=0;j<K;j++)

                       DataMember[i].uncle[j]=SquareDistance(DataMember[i].data,ClusterMember[j].center,Dimension);

                  fa=DataMember[i].father;

     

                 if(fa!=FindFather(DataMember[i].uncle,K)&&ClusterMember[fa].sonnum>1)

                  {

     

                       pa=DataMember[i].father;

                       ClusterMember[pa].sonnum-=1;

     

                       pb=DataMember[i].father=FindFather(DataMember[i].uncle,K);

                       ClusterMember[pb].sonnum+=1;

     

                       NewCenterReduce(ClusterMember,pa,DataMember[i].data,Dimension);

                       NewCenterPlus(ClusterMember,pb,DataMember[i].data,Dimension);

                  }                     

     

             }//endfor

                 

        

             //判断聚类是否完成,HALT=1,停止聚类

             HALT=0;

             for(j=0;j<K;j++)

                  if(Compare(OldCluster[j].center,ClusterMember[j].center,Dimension))

                       break;

             if(j==K)

                  HALT=1;

     

                 

             for(j=0;j<K;j++)

                  GetValue(OldCluster[j].center,ClusterMember[j].center,Dimension);

        

         }//endwhile

     

    }

     

    //几个经常需要调用的小函数

     

    // Incremental mean update after sample p2 has been ADDED to cluster t.
    // p1[t].sonnum must already include the new sample:
    //     center += (sample - center) / sonnum
    void SAA::NewCenterPlus(ClusterType *p1,int t,double *p2,int dim)
    {
        ClusterType &cluster=p1[t];
        for(int d=0;d<dim;++d)
        {
            const double shift=(p2[d]-cluster.center[d])/(cluster.sonnum);
            cluster.center[d]=cluster.center[d]+shift;
        }
    }

     

     

    // Incremental mean update after sample p2 has been REMOVED from cluster t.
    // p1[t].sonnum must already exclude the removed sample:
    //     center += (center - sample) / sonnum
    void SAA::NewCenterReduce(ClusterType *p1,int t,double *p2,int dim)
    {
        ClusterType &cluster=p1[t];
        for(int d=0;d<dim;++d)
        {
            const double shift=(cluster.center[d]-p2[d])/(cluster.sonnum);
            cluster.center[d]=cluster.center[d]+shift;
        }
    }

     

     

    // Copies the flat buffer p2 (datanum samples of dim components each, stored
    // sample after sample) into the data arrays of p1[0..datanum-1].
    void SAA::GetDataset(DataType *p1,double *p2,int datanum,int dim)
    {
        const double *src=p2;
        for(int s=0;s<datanum;++s)
        {
            double *dst=p1[s].data;
            for(int d=0;d<dim;++d)
                dst[d]=*src++;
        }
    }

     

    // Copies the first dim components of str2 into str1.
    void SAA::GetValue(double *str1,double *str2,int dim)
    {
        int d=0;
        while(d<dim)
        {
            str1[d]=str2[d];
            ++d;
        }
    }

     

    int  SAA::FindFather(double *p,int k)

    {

         int i,N=0;

         double min=30000;

     

         for(i=0;i<k;i++)  

             if(p[i]<min)

             {

                  min=p[i];

                  N=i;

             }

         return N;

    }

     

    // Squared Euclidean distance between the dim-component vectors str1 and str2.
    double SAA::SquareDistance(double *str1,double *str2,int dim)
    {
        double total=0;
        for(int d=0;d<dim;++d)
        {
            const double left=(double)(str1[d]-str2[d]);
            const double right=(str1[d]-str2[d]);
            total=total+left*right;
        }
        return total;
    }

     

    int  SAA::Compare(double *p1,double *p2,int dim)

    {

         int i;

         for(i=0;i<dim;i++)

             if(p1[i]!=p2[i])

                  return 1;

         return 0;

    }   

     

     

    // Maps one rand() draw linearly onto the interval between a and b.
    double SAA::FRand(double a,double b)
    {
        const double fraction=(double)rand()/(double)RAND_MAX;
        const double span=b-a;
        return a+(double)(fraction*span);
    }

     

     

    void SAA::DisPlay()

    {

         int i,N,j,t;

         ofstream  result("聚类过程结果显示.txt",ios::ate);

         for(i=0;i<K;i++)

         {

             N=0;

             cout<<endl<<endl<<"******************** 第"<<i+1<<" 类样本:*******************"<<endl;

             result<<endl<<endl<<"******************** 第"<<i+1<<" 类样本:*******************"<<endl;

             for(j=0;j<DataNum;j++)

                  if(DataMember[j].father==i)

                  {

                       cout<<" [";

                       for(t=0;t<Dimension;t++)

                       cout<<" "<<setw(5)<<DataMember[j].data[t];

                       cout<<" ]  ";              

                       if((N+1)%Row==0)

                           cout<<endl;

     

                       result<<" [";

                       for(t=0;t<Dimension;t++)

                       result<<" "<<setw(5)<<DataMember[j].data[t];

                       result<<" ]  ";                 

                       if((N+1)%Row==0)

                           result<<endl;

     

                       N=N+1;

                  }

         }//end for

     

         cout<<endl<<endl<<"  聚类结果,总体误差准则函数:"<<AimFunction(DataMember,ClusterMember)<<endl;

         result<<endl<<"  聚类结果,总体误差准则函数:"<<AimFunction(DataMember,ClusterMember)<<endl;

     

         result.close();

     

    }//end of Display

     

    double SAA::AimFunction(DataType *q,ClusterType *c)

    {

         int i,j;

         double *p;

         p=new double[K];

         for(i=0;i<K;i++)

             p[i]=0;

         for(i=0;i<K;i++)

         {

             for(j=0;j<DataNum;j++)

                  if(q[j].father==i)

                  {

                       p[i]=p[i]+SquareDistance(c[i].center,q[j].data,Dimension);

                  }

         }

     

                 

         AimFunc=0;

         for(i=0;i<K;i++)

             AimFunc=AimFunc+p[i];

         return AimFunc;

     

    }

     

     

     

    //************************************

    //            主函数入口         ****  

    //************************************

     

     

    void main()

    {

         //用户输入数据

         srand((unsigned)time(NULL));

         GETDATA getdata;

         getdata.Initial();

         ofstream file("聚类过程结果显示.txt",ios::trunc);   //聚类结果存入“聚类结果显示.txt”文件中

     

         //k-均值聚类方法聚类

         SAA saa;    //****此行不可与上行互换。

        

         saa.KMeans();    //逐个样本聚类

    //   saa.KMeans1();   //批处理方式聚类,可以比较saa.KMeans()的区别

         cout<<endl<<"***********************K-均值聚类结果:**********************";

         file<<endl<<"***********************K-均值聚类结果:**********************"<<endl;

     

         file.close();

         saa.DisPlay();

     

         cout<<endl<<"  程序运行结束!"<<endl;

                  }

    3)输出运算结果

     

     

    实习三   关联规则挖掘及其应用

    实习题:Apriori算法是一种最有影响的挖掘布尔关联规则频繁项集的算法。它将关联规则挖掘算法的设计分解为两个子问题:(1) 找到所有支持度大于最小支持度的项集,这些项集称被为频繁项集(Frequent Itemset)。(2) 使用第一步产生的频繁集产生期望的规则。

    在图书馆管理系统中积累了大量的读者借还书的历史记录,基于Apriori算法挖掘最大频繁项目集,由此产生关联规则。数据格式可参阅文献

    参考文献:彭仪普,熊拥军: 关联挖掘在文献借阅历史数据分析中的应用.情报杂志. 2005年第8期

    1)    算法基本思想的描述

    首先产生频繁1-项集L1,然后是频繁2-项集L2,直到有某个r值使得Lr为空,这时算法停止。这里在第k次循环中,过程先产生候选k-项集的集合Ck,Ck中的每一个项集是对两个只有一个项不同的属于Lk-1的频集做一个(k-2)-连接来 产生的。Ck中的项集是用来产生频集的候选集,最后的频集Lk必须是Ck的一个子集。Ck中的每个元素需在交易数据库中进行验证来决定其是否加入Lk,这 里的验证过程是算法性能的一个瓶颈。

    为了生成所有频集,使用了递推的方法。其核心思想简要描述如下:

    (1)     L1 = {large 1-itemsets};

    (2)     for (k=2; Lk-1≠∅; k++) do begin

    (3)         Ck=apriori-gen(Lk-1);   //新的候选集

    (4)         for all transactions t∈D do begin

    (5)                  Ct=subset(Ck,t);    //事务t中包含的候选集

    (6)           for all candidates c∈Ct  do

    (7)           c.count++;

    (8)         end

    (9)        Lk={c∈Ck | c.count≥minsup}

    (10)    end

    (11)                   Answer=∪kLk;

    1.Find all frequent itemsets: By definition, each of these itemsets will occur at least as  frequently as a predetermined minimum support count, min sup

    2. Generate strong association rules from the frequent itemsets: By definition, these  rules must satisfy minimum support and minimum confidence

    3.Apriori pruning principle: If there is any itemset which is infrequent, its superset should not be generated/tested!

    Method:

         – generate length-(k+1) candidate itemsets by joining two length-k frequent itemsets that share k-1 items, and

    –  test the candidates against DB

    2)    编程实现算法

    1.   Item.h 源文件

    /*----------------------------------------------------------------------

      File     : Item.h

      Contents : itemset management

      Author   : Bart Goethals

      Update   : 4/4/2003

    ----------------------------------------------------------------------*/

     

    #include <set>

    using namespace std;

     

    // One node of the itemset trie: an item id, a support counter and an
    // optional set of child nodes. Nodes live inside std::set and are therefore
    // only reachable through const access, which is why the counter and the
    // child pointer are mutable and the modifying members are const.
    class Item
    {
    public:
         Item(int itemId) : id(itemId), support(0), children(0) {}

         // Shallow copy: the copy shares the source's child set pointer.
         Item(const Item &other) : id(other.id), support(other.support), children(other.children) {}

         ~Item() {}

         // Nodes are ordered by item id only.
         bool operator<(const Item &other) const { return id < other.id; }

         int getId() const { return id; }
         int getSupport() const { return support; }
         set<Item> *getChildren() const { return children; }

         // Adds inc to the support counter and returns the new value.
         int Increment(int inc = 1) const
         {
             support += inc;
             return support;
         }

         set<Item> *makeChildren() const;
         int deleteChildren() const;

    private:
         const int id;
         mutable int support;
         mutable set<Item> *children;
    };

     

     

     

    2. AprioriRules.h 源文件

     

    // A frequent itemset as read from the input file: `length` item ids in the
    // owned array `t`, plus the itemset's support count.
    //
    // Fixes over the previous version: the sizing constructor now initialises
    // `support` (it was indeterminate), and a copy-assignment operator was
    // added, because the implicit one copied the pointer and led to a double
    // delete[].
    class Itemset
    {
     public:
      // Allocates room for l item ids; the ids themselves are left unset.
      Itemset(int l) : length(l), t(new int[l]), support(0) {}

      // Deep copy.
      Itemset(const Itemset &is) : length(is.length), t(new int[is.length]), support(is.support)
        {
          for(int i=0;i<length;i++) t[i] = is.t[i];
        }

      // Deep copy assignment; the new array is built before the old one is
      // released, so self-assignment and allocation failure are both safe.
      Itemset &operator=(const Itemset &is)
        {
          if(this != &is) {
            int *copy = new int[is.length];
            for(int i=0;i<is.length;i++) copy[i] = is.t[i];
            delete [] t;
            t = copy;
            length = is.length;
            support = is.support;
          }
          return *this;
        }

      ~Itemset(){delete [] t;}

      int length;
      int *t;
      int support;
    };

     

    class AprioriRules

    {

     public:

     

      AprioriRules();

      ~AprioriRules();

     

      void setData(char *fn);

      int setOutputRules(char *fn);

      void setMinConf(float mc){minconf=mc;}

      int generateRules();

      void setMaxHead(int m){maxhead=m;}

      void setVerbose(){verbose=true;}

        

     private:    

     

      Itemset *getNextSet();

      int generateRules(set<Item> *current, int *iset, int depth);

      int processSet(set<Item> *items, int sl, int *iset, int sup, int *head, int spos, int depth);

     

      Item *trie;

      float minconf;

      int maxhead;

      ofstream rulesout;

      FILE *data;

      bool verbose;

    };

    3.AprioriRules.cpp源文件

    /*----------------------------------------------------------------------

      File     : AprioriRules.cpp

      Contents : apriori algorithm for finding association rules

      Author   : Bart Goethals

      Update   : 16/04/2003

    ----------------------------------------------------------------------*/

     

    #include <iostream>

    #include <fstream>

    #include <stdio.h>

    #include <set>

    #include <vector>

    #include <time.h>

    using namespace std;

    #include "Item.h"

    #include "AprioriRules.h"

     

    // Starts with an empty trie (root item id 0), no input file, a minimum
    // confidence of 0, no limit on the head size and verbose output off.
    AprioriRules::AprioriRules()
      : trie(new Item(0)),
        minconf(0),
        maxhead(0),
        data(0),
        verbose(false)
    {
    }

     

    // Closes the input file if one is open and releases the whole trie.
    AprioriRules::~AprioriRules()
    {
      if(data != 0) fclose(data);

      if(trie == 0) return;
      trie->deleteChildren();
      delete trie;
    }

     

    void AprioriRules::setData(char *fn)

    {

      data = fopen(fn,"rt");

    }

     

    int AprioriRules::setOutputRules(char *fn)

    {

      rulesout.open(fn);

      if(!rulesout.is_open()) {

        cerr << "error: could not open " << fn << endl;

        return -1;

      }

      return 0;

    }

     

    // Reads the next frequent itemset from the input file.
    //
    // Every run of decimal digits on a line is one number; any other character
    // only separates numbers. The LAST number of the line is the support, the
    // numbers before it are the item ids (possibly none, which is the empty
    // itemset). Lines without any number are skipped.
    //
    // Returns a new Itemset that the caller must delete, or 0 when no input
    // file is open or the end of the file has been reached.
    //
    // Fixes over the previous version:
    //  - a record ends at '\n'; the code compared against ' ', which ended the
    //    record after the first number and contradicts the "last number is the
    //    support" logic below;
    //  - the getc() result is kept in an int so EOF is detected reliably, and a
    //    read error no longer loops forever;
    //  - a null `data` (input never opened) is not passed to getc();
    //  - a last line without a trailing newline is no longer dropped;
    //  - empty lines are skipped by looping instead of recursing.
    Itemset *AprioriRules::getNextSet()
    {
      if(!data) return 0;

      for(;;) {
        vector<int> list;
        int c;

        // Collect all numbers up to the end of the line.
        do {
          int item=0, pos=0;
          c = getc(data);
          while((c >= '0') && (c <= '9')) {
            item *=10;
            item += c-'0';
            c = getc(data);
            pos++;
          }
          if(pos) list.push_back(item);
        }while(c != '\n' && c != EOF);

        if(!list.empty()) {
          const int size = int(list.size()) - 1;
          Itemset *t = new Itemset(size);
          t->support = list[size];
          for(int i=0; i<size; i++) t->t[i] = list[i];
          return t;
        }

        if(c == EOF) return 0;
        // Empty line: continue with the next one.
      }
    }

     

    int AprioriRules::generateRules()

    {

      int size=0;

      clock_t start;

        

      // Read all frequent itemsets

      if(verbose) cout << "reading frequent itemsets" << flush;

      start = clock();

      while(Itemset *t = getNextSet()) {

        set<Item>::iterator it;

        set<Item>* items = trie->makeChildren();

            

        for(int depth=0;depth < t->length; depth++) {

          it = items->find(Item(t->t[depth]));

          if(it == items->end()) it = items->insert(Item(t->t[depth])).first;

          items = it->makeChildren();

        }

        if(t->length) it->Increment(t->support);

        else trie->Increment(t->support);

            

        size = (t->length>size? t->length : size);

        delete t;

      }

      if(verbose) cout << "[" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]" << endl << flush;

        

      // generate rules

      if(verbose) cout << "generating rules" << flush;

      int *iset = new int[size];

      int added = generateRules(trie->getChildren(), iset, 1);

      delete [] iset;

      if(verbose) cout << "[" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]" << endl << flush;

     

      return added;

    }

     

    int AprioriRules::generateRules(set<Item> *current, int *iset, int depth)

    {

      if(current==0) return 0;

      int added = 0;

        

      for(set<Item>::iterator runner = current->begin(); runner!= current->end(); runner++) {

            

        iset[depth-1] = runner->getId();

        if(depth > 1) {

          int *tmp = new int[depth];

          added += processSet(trie->getChildren(), depth, iset, runner->getSupport(), tmp, 0,1);

          delete [] tmp;

        }

            

        added += generateRules(runner->getChildren(), iset, depth+1);

      }

        

      return added;

    }

     

    // Emits the rules "body => head" derived from the itemset iset[0..sl-1]
    // whose support is sup, for heads of `depth` items, and recurses to build
    // larger heads.
    //
    //   items : trie level searched for the item that is added to the head
    //   sl    : number of items in iset
    //   iset  : the itemset being split into body and head
    //   sup   : support of the whole itemset
    //   head  : scratch array; head[0..depth-2] was filled by the callers,
    //           head[depth-1] is chosen here
    //   spos  : lowest index of iset that may be chosen for head[depth-1]
    //   depth : number of items in the head at this recursion level
    //
    // Returns the number of rules written to rulesout.
    //
    // NOTE(review): neither `runner` nor `it` is checked against end() before
    // it is dereferenced, and `items` / `tmp` are not checked for null. This
    // presumably relies on every subset of a listed itemset also being listed
    // in the input file - confirm against the producer of that file.
    int AprioriRules::processSet(set<Item> *items, int sl, int *iset, int sup, int *head, int spos, int depth)

    {

      int loper = spos;   // lower bound for the head candidates of this level

      set<Item>::iterator runner, it;

      int added=0,i,j,k;

        

      // Try every item from the last one down to index loper as head[depth-1].
      spos = sl;

      while(--spos >= loper) {

        head[depth-1] = iset[spos];

        runner = items->find(Item(iset[spos]));

     

        // find body and its support

        // The body is iset without the head items; its trie node is reached by
        // descending from the root along the body items, and `it` ends up on
        // the node of the last body item.
        set<Item> *tmp = trie->getChildren();

        int *body = new int[sl-depth];

        for(i=j=k=0; i<sl; i++) {

          // Head items appear in iset in increasing index order, so one
          // forward scan over head is enough to skip them.
          if(j<depth && iset[i]==head[j]) j++;

          else {

         it = tmp->find(Item(iset[i]));

         tmp = it->getChildren();

         body[k++] = iset[i];

          }

        }

    //    float intr = (float(sup)*float(trie->getSupport()))/(float(runner->getSupport())*float(it->getSupport()));

        // confidence = support(whole itemset) / support(body)
        float conf = float(sup)/float(it->getSupport());

            

        if(conf>=minconf) {

          // Output format: "<body items> => <head items> (<support>, <confidence>)"
          for(i=0; i<sl-depth; i++) rulesout << body[i] << " ";

          rulesout << "=> ";

          for(i=0; i<depth; i++) rulesout << head[i] << " ";

          rulesout << "(" << sup << ", " << conf << ")" << endl;

    //      rulesout << "(" << sup << ", " << conf << ", " << intr << ")" << endl;

          added++;

        }

        delete [] body;

     

        // Grow the head only while the rule was confident enough and at least
        // one item would remain in the body; maxhead, when non-zero, caps the
        // head size. Only items after the current one are tried next.
        if(conf>=minconf && depth<sl-1) {

          if(maxhead) {

         if(depth<maxhead) added += processSet(runner->getChildren(), sl, iset, sup, head, spos+1, depth+1);

          }

          else added += processSet(runner->getChildren(), sl, iset, sup, head, spos+1, depth+1);

        }

      }

        

      return added;

    }

     

    4.Item.cpp

    /*----------------------------------------------------------------------

      File     : Item.cpp

      Contents : itemset management

      Author   : Bart Goethals

      Update   : 4/4/2003

    ----------------------------------------------------------------------*/

     

    #include "Item.h"

     

    set<Item> *Item::makeChildren() const

    {

         if(children) return children;

         return children = new set<Item>;

    }

     

    int Item::deleteChildren() const

    {

         int deleted=0;

     

         if(children)

         {

             for(set<Item>::iterator it = children->begin(); it != children->end(); it++)

             {

                  deleted += it->deleteChildren();

             }

             delete children;

             children = 0;

             deleted++;

         }

     

         return deleted;

    }

    5. aprioritest.cpp 源文件
    /*----------------------------------------------------------------------

      File     : aprioritest.cpp

      Contents : apriori algorithm for finding association rules

      Author   : Bart Goethals

      Update   : 16/04/2003

    ----------------------------------------------------------------------*/

     

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <iostream>
    #include <set>
    #include <vector>
    #include <fstream>

    using namespace std;

    #include "Item.h"

    #include "AprioriRules.h"

     

    int main(int argc, char *argv[])

    {

      cout << "Apriori association rule mining implementation" << endl;

      cout << "by Bart Goethals, 2000-2003" << endl;

      cout << "http://www.cs.helsinki.fi/u/goethals/" << endl << endl;

     

      if(argc < 3) {

        cerr << "usage: " << argv[0] << " setsfile minconf [output]" << endl;

      }

      else {

        clock_t start = clock();

        AprioriRules r;

        r.setVerbose();

        r.setData(argv[1]);

        r.setMinConf(atof(argv[2]));

    //    r.setMaxHead(1);

        if(argc==4) r.setOutputRules(argv[3]);

     

        start = clock();

        cout << "generating rules " << flush;

        int rules = r.generateRules();

        cout << rules << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]" << endl;

        if(argc==4) cout << "Written to " << argv[3] << endl;

      }

     

      return 0;

    }

    3)输出运算结果

    由于数据方面以及程序的一些问题,本实验的测试结果没能产生。

    数据挖掘课程总结:

      1. 在这门比较新的课程中学到了许多的新概念.
      2. 学会了查找相关资料,自学能力进一步加强.
      3. 认识到了自身应用方面的缺陷,在以后的学习中抓紧时间改进.
      4. 在查阅资料的过程中学到了很多新的算法
      5. 进一步加强了编程方面的能力,掌握了数据挖掘在现实世界中的应用及其重要作用
  • 相关阅读:
    【Todo】Java线程面试题 Top 50 (转载)
    【Todo】秒杀系统 & 乐观锁 & Nginx反向代理
    【Todo】C++和Java里面的浮点数及各种数字表示
    asp.net操作word的表格
    Android消息推送(二)--基于MQTT协议实现的推送功能
    单点更新线段树 RMQ
    英语月结
    AppWidget应用(二)---PendingIntent 之 getActivity
    2 WAN 和1 Evo/3g Routeros PCC 方法负载平衡
    Android 网络权限配置
  • 原文地址:https://www.cnblogs.com/end/p/3328386.html
Copyright © 2020-2023  润新知