• 模式识别之线性判别---naive bayes朴素贝叶斯代码实现


    http://blog.csdn.net/xceman1997/article/details/7955349

    http://www.cnblogs.com/yuyang-DataAnalysis/archive/2012/01/31/2333760.html

    http://zhan.renren.com/dmeryuyang?gid=3602888497999161050&checked=true

    http://blog.csdn.net/yanqingan/article/details/6125812

    /**
     * Train a naive Bayes model from a labelled sample file and dump it as text.
     *
     * Each line of the sample file is read as
     *     <classId><separator><featureId> <featureId> ...
     * where <separator> is any single character contained in sSegmenter and the
     * feature ids are whitespace separated. Ids are parsed with atoi(), so a
     * non-numeric token is read as 0.
     *
     * @param sFileSample    path of the training sample file (opened in binary mode)
     * @param iClassNum      number of classes; valid class ids are [0, iClassNum)
     * @param iFeaTypeNum    number of feature types; valid feature ids are [0, iFeaTypeNum)
     * @param sSegmenter     set of characters that may separate the class id from the features
     * @param iFeaExtractNum forwarded to FeaSelByChiSquare (presumably the number of
     *                       features to keep -- confirm against that function)
     * @param sFileModel     path of the model file to write (text)
     * @param bCompactModel  true: write only what prediction needs (class priors and the
     *                       p(feature|class) of selected features);
     *                       false: also write document count, feature priors, selection
     *                       flags and the full chi-square matrix
     * @return false on invalid arguments, when a file cannot be opened, or when
     *         writing the model fails; true otherwise
     *
     * Lines whose class id is outside [0, iClassNum) are skipped entirely and are
     * not counted as documents. Feature ids outside [0, iFeaTypeNum) are ignored.
     *
     * NOTE(review): a feature id that occurs several times on one line is counted
     * once per occurrence, so the "document counts" are really occurrence counts.
     * The layout of the model file is kept as it was because the matching loader
     * is not visible here.
     */
    bool NaiveBayes::Train (const char * sFileSample, int iClassNum, int iFeaTypeNum,
    string & sSegmenter, int iFeaExtractNum, const char * sFileModel, bool bCompactModel)
    {
        // Defensive checks
        if (iClassNum <= 0 || iFeaTypeNum <= 0 || iFeaExtractNum <= 0)
            return false;
        if (!sFileSample || !sFileModel)
            return false;

        // Open the input first so that a missing sample file does not truncate
        // an existing model file.
        ifstream in (sFileSample, ios_base::binary);
        if (!in)
        {
            cerr << "Can not open the file" << endl;
            return false;
        }
        ofstream out (sFileModel);
        if (!out)
        {
            cerr << "Can not open the file" << endl;
            return false;
        }

        // 1. Temporary data structures holding the model parameters and the
        //    statistics needed for feature selection. All arrays are
        //    value-initialised to zero by the trailing "()".
        // 1.1 the number of (valid) documents in the training samples
        int iTotalDocNum = 0;
        // 1.2 the prior probability of each class; until step 3.2 it holds the
        //     document count of the class
        double * pClassPriorProb = new double [iClassNum]();
        // 1.3 the prior probability of each feature type; until step 3.3 it holds
        //     the feature count (only used for feature selection, the Bayes
        //     model itself does not need it)
        double * pFeaItemPriorProb = new double [iFeaTypeNum]();
        // 1.4 the chi-square value of each (class, feature) pair; until step 3.1
        //     it holds the co-occurrence count of the pair
        double ** ppChiMatrix = new double * [iClassNum];
        for (int i=0; i<iClassNum; i++)
            ppChiMatrix[i] = new double [iFeaTypeNum]();
        // 1.5 the conditional probability p(feature|class)
        double ** ppPProbMatrix = new double * [iClassNum];
        for (int i=0; i<iClassNum; i++)
            ppPProbMatrix[i] = new double [iFeaTypeNum]();
        // 1.6 feature selection flags (1 == selected)
        int * pFeaSelected = new int [iFeaTypeNum]();

        // 2. Iterate over the training samples and accumulate the counts
        string sLine;
        int iLineNo = 0;
        while (getline (in, sLine))
        {
            // show some progress information on screen
            if (0 == iLineNo%10000)
                cout << iLineNo << " ";
            iLineNo++;

            // 2.1 split the sample into class id and feature items
            const string::size_type iSeg = sLine.find_first_of (sSegmenter);
            const string sClass = sLine.substr (0, iSeg);
            const int iClassId = atoi (sClass.c_str());
            if (iClassId < 0 || iClassId >= iClassNum)
                continue; // unknown label: ignore the whole line

            // 2.2 count the document only once its label is known to be valid,
            //     so that the class priors sum to one
            iTotalDocNum++;
            pClassPriorProb [iClassId]++;

            // 2.3 count the feature items that follow the separator
            if (string::npos == iSeg)
                continue; // no separator: the line carries no features
            istringstream isLine (sLine.substr (iSeg + 1));
            string sTmpItem;
            while (isLine >> sTmpItem)
            {
                const int iFeaItemId = atoi (sTmpItem.c_str());
                if (iFeaItemId < 0 || iFeaItemId >= iFeaTypeNum)
                    continue;
                pFeaItemPriorProb [iFeaItemId]++;
                ppChiMatrix [iClassId][iFeaItemId]++;
            }
        }

        // 3. Calculate the model parameters
        // 3.1 the chi-square value as well as p(feature|class)
        for (int i=0; i<iClassNum; i++)
        {
            const double dClassDocNum = pClassPriorProb [i];
            for (int j=0; j<iFeaTypeNum; j++)
            {
                // 2x2 contingency table of (class i, feature j)
                const double dA = ppChiMatrix[i][j];            // in class, has feature
                const double dB = pFeaItemPriorProb[j] - dA;    // other classes, has feature
                const double dC = dClassDocNum - dA;            // in class, lacks feature
                const double dD = (double)iTotalDocNum - dA - dB - dC; // the rest

                // the chi value; DBL_MIN keeps the denominator non-zero
                const double dDiff = dA * dD - dB * dC;
                const double dDenominator = (dA + dB) * (dC + dD) + DBL_MIN;
                ppChiMatrix[i][j] = dDiff * dDiff / dDenominator;

                // p(feature|class); a class without any document gets 0
                // instead of the NaN produced by 0/0
                ppPProbMatrix[i][j] = (dClassDocNum > 0.0) ? dA / dClassDocNum : 0.0;
            }
        }

        // 3.2 / 3.3 turn the counts into prior probabilities; with an empty
        //           sample set the counts are all zero and are left as 0
        if (iTotalDocNum > 0)
        {
            for (int i=0; i<iClassNum; i++)
                pClassPriorProb [i] /= iTotalDocNum;
            for (int i=0; i<iFeaTypeNum; i++)
                pFeaItemPriorProb [i] /= iTotalDocNum;
        }

        // 4. Feature selection (fills pFeaSelected; implemented elsewhere)
        FeaSelByChiSquare (ppChiMatrix, ppPProbMatrix, iClassNum,
                           iFeaTypeNum, iFeaExtractNum, pFeaSelected);

        // 5. Dump the model into a text file
        if (bCompactModel) // output the parameters needed for predicting only
        {
            // 5.1 the prior probability of class
            out << iClassNum << endl;
            for (int i=0; i<iClassNum; i++)
                out << pClassPriorProb [i] << " ";

            // 5.2 the number of feature types actually selected
            int iActualFeaNum = 0;
            for (int j=0; j<iFeaTypeNum; j++)
            {
                if (1 == pFeaSelected[j])
                    iActualFeaNum ++;
            }
            out << iActualFeaNum << endl;

            // 5.3 p(feature|class) of the selected features, class by class
            for (int i=0; i<iClassNum; i++)
            {
                for (int j=0; j<iFeaTypeNum; j++)
                {
                    if (1 == pFeaSelected[j])
                        out << j << ":" << ppPProbMatrix[i][j] << " ";
                }
            }
        }
        else // output the full information
        {
            // 5.1 the total number of documents
            out << iTotalDocNum << endl;

            // 5.2 the prior probability of class, as classIndex:priorProb
            out << iClassNum << endl;
            for (int i=0; i<iClassNum; i++)
                out << i << ":" << pClassPriorProb [i] << " ";

            // 5.3 the prior probability of each feature type (not used by the
            //     Bayes model, recorded for information) and whether the feature
            //     was selected, as featureId:priorProb:selected
            out << iFeaTypeNum << " ";
            for (int i=0; i<iFeaTypeNum; i++)
                out << i << ":" << pFeaItemPriorProb[i] << ":" << pFeaSelected[i] << " ";

            // 5.4 the chi-square value of every class-feature pair
            for (int i=0; i<iClassNum; i++)
            {
                for (int j=0; j<iFeaTypeNum; j++)
                    out << ppChiMatrix[i][j] << " ";
            }

            // 5.5 p(feature|class) of every class-feature pair
            for (int i=0; i<iClassNum; i++)
            {
                for (int j=0; j<iFeaTypeNum; j++)
                    out << ppPProbMatrix[i][j] << " ";
            }
        }

        // Make sure the model really reached the file before reporting success
        out.flush ();
        const bool bWritten = !out.fail ();

        // Last, release the memory
        delete [] pClassPriorProb;
        delete [] pFeaItemPriorProb;
        for (int i=0; i<iClassNum; i++)
            delete [] ppChiMatrix[i];
        delete [] ppChiMatrix;
        for (int i=0; i<iClassNum; i++)
            delete [] ppPProbMatrix[i];
        delete [] ppPProbMatrix;
        delete [] pFeaSelected;

        return bWritten;
    }

  • 相关阅读:
    性能篇系列—stream详解
    Java正则表达式详细解析
    干货系列性能篇之——序列化
    面试官之问:知道你的接口“QPS”是多少吗?
    Java性能之优化RPC网络通信
    Spring之 JDBC 异常
    Java性能之synchronized锁的优化
    浅谈Java中switch分支语句
    Spring Boot 之异步执行方法
    Java性能 -- Lock优化
  • 原文地址:https://www.cnblogs.com/pengkunfan/p/4931701.html
Copyright © 2020-2023  润新知