• LDA主题模型学习笔记5:C源代码理解


    1。说明

    本文对LDA原始论文的作者所提供的C代码中LDA的主要逻辑部分做凝视,原代码可在这里下载到:https://github.com/Blei-Lab/lda-c

    这份代码实现论文《Latent Dirichlet Allocation》中介绍的LDA模型。用变分EM算法求解參数。

    为了使代码在vs2013中执行。做了一些微小修改,但不影响原代码的逻辑。

    vs2013project可在我的资源中下载:

    http://download.csdn.net/detail/happyer88/8861773

    ----------------------------------------------------------------------

    2。准备知识

    2.1,LDA原理及推导

    《Latent Dirichlet Allocation》论文

    我的LDA学习笔记1-4系列

    2.2,充分统计量

    https://en.wikipedia.org/wiki/Sufficient_statistic

    ----------------------------------------------------------------------

    3,代码凝视

    3.1 main.c

    原代码中main函数在lda-estimate.c中,创建vsproject时把它挪到了main.c中。

    <span style="font-size:14px;">#include  <stdio.h>
    #include  <stdlib.h>
    #include  <io.h>
    #include<time.h>
    #include "cokus.h"
    #include "lda-alpha.h"
    #include"lda-data.h"
    #include"lda-estimate.h"
    #include"lda-inference.h"
    #include"lda.h"
    #include"utils.h"
    
    char * datasetName	= "scene8";	//数据集名字,必须与目录名字同样
    int expec = 1;		// expec==1,expect , inf
    int vocabularySize_global = 512; // 字典大小
    int k = 100; //topic的数目
    char* params ="../settings.txt"; //预计:预计过程须要的參数
    char* params1 ="../inf-settings.txt";     //判断:判断过程须要的參数
    char  dataset_train[500];	//预计:预计參数的数据文件
    char  dataset_test[500];   //判断:判断的数据文件
    char dir_trainData[500];    //预计:预计的中间数据和终于数据目录路径
    char dir_testData[500];		//判断:判断的中间数据和终于数据目录路径
    char model_pre[500];
    void assignParameter();
    int main()
    {
    	corpus* corpus;
    	clock_t start,finish;
    	double totaltime;
    	long double totaltime_EMiteration;
    	assignParameter();
    	//myCreateDirectory();
    
    	start=clock();
    	if(expec)
    	{
    		INITIAL_ALPHA = 1;    //狄利克雷分布的參数alpha
    		NTOPICS =k;           //主题个数
    		read_settings(params);  //读取參数。。

    最大迭代次数。收敛条件阈值;EM的最大迭代次数、收敛条件阈值; corpus = read_data(dataset_train); //读取数据。

    。。

    数据格式:(每一行)在一个文档中出现的word总数目(去掉次数=0的)index_word1:counts index_word2:counts ........... totaltime_EMiteration = run_em("seeded", dir_trainData, corpus); //求解參数。。。EM过程求解參数--输入:中间数据和终于数据存放目录、语料库 printf("inferencing test images! "); read_settings(params1); corpus = read_data(dataset_test); infer(model_pre, dir_testData, corpus); //用完開始释放 } else { read_settings(params1); corpus = read_data(dataset_test); infer(model_pre, dir_testData, corpus); } finish=clock(); totaltime=(double)(finish-start)/CLOCKS_PER_SEC; printf("nTopic = %d, nTerm = %d estimation time: ", k, vocabularySize_global); printf(" EM iteration takes %f seconds(this is %f miniutes) ", totaltime_EMiteration*60, totaltime_EMiteration); printf("Running Time(--estimate trainData and inference trainData and testData--):%f ",totaltime); printf(" train--- final data are saved to: %s ", dir_trainData); printf("test---- final data are saved to: %s ", dir_testData); getch(); return(0); } void assignParameter() { sprintf(dataset_train,"../train.txt"); sprintf(dataset_test,"../test.txt"); sprintf(dir_trainData, "../ResultData"); sprintf(dir_testData, "../ResultData"); sprintf(model_pre, "../ResultData/final"); } </span>


    3.2 lda.h

    自己定义数据结构

    #ifndef LDA_H
    #define LDA_H
    
    typedef struct
    {
        int* words; //文档中的单词,这里存的是该单词在文档集字典中的ID
        int* counts; //每一个单词文档中出现次数
        int length; //文档中出现的单词个数,去重的。也就是反复出现的单词不计
        int total;  //文档中总单词数,不去重
    } document;
    
    
    typedef struct
    {
        document* docs;
        int num_terms; //文档集中出现的单词个数。去重的。也就是文档集字典大小
        int num_docs; //文档集中文档个数
    } corpus;
    
    
    typedef struct
    {
        double alpha; //论文中的模型參数alpha,本来应该是k维。程序中实现的是对称分布的Dirichlet,k维的值是同样的
        double** log_prob_w; //论文中的模型參数beta。每一行存一个主题的词分布,维<span style="font-family: Arial, Helvetica, sans-serif;">度k*V</span>
        int num_topics; //主题个数
        int num_terms;
    } lda_model;
    
    
    typedef struct
    {
        double** class_word;//模型參数beta的充分统计量。维度:主题个数*文档集字典大小(K*V)
        double* class_total;//存主题分布z的 充分统计量,维度:主题个数K
        double alpha_suffstats;  //模型參数alpha的充分统计量
        int num_docs;
    } lda_suffstats;
    
    #endif
    

    3.3 lda-model.c

    主要是初始化lda模型(有三种方法),一种是全部值都为0,'random'是用随机数,'seeded'是随机挑选一些文档来初始化模型

    还有计算模型參数alpha , beta (lda_mle)

    #include "lda-model.h"
    
    /*
     * compute MLE lda model from sufficient statistics
     *
     */
    
    void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
    {
        int k; int w;
    
        for (k = 0; k < model->num_topics; k++)
        {
            for (w = 0; w < model->num_terms; w++)
            {
                if (ss->class_word[k][w] > 0)
                {
    				//log_prob_w是模型參数beta,主题-词分布
    				//class_word和class_total都是充分统计量(sufficient statistic)
    				//所以log相减是在做归一化,beta中的值是概率。要在0-1之间
                    model->log_prob_w[k][w] =
                        log(ss->class_word[k][w]) -
                        log(ss->class_total[k]);
                }
                else
                    model->log_prob_w[k][w] = -100;
            }
        }
        if (estimate_alpha == 1)
        {
    		//用牛顿方法优化得到alpha
    		//注意这里alpha_suffstats的值
            model->alpha = opt_alpha(ss->alpha_suffstats,
                                     ss->num_docs,
                                     model->num_topics);
    
            printf("new alpha = %5.5f
    ", model->alpha);
        }
    }
    
    /*
     * allocate sufficient statistics
     *
     */
    
    lda_suffstats* new_lda_suffstats(lda_model* model)
    {
        int num_topics = model->num_topics;
        int num_terms = model->num_terms;
        int i,j;
    
        lda_suffstats* ss = malloc(sizeof(lda_suffstats));
        ss->class_total = malloc(sizeof(double)*num_topics);
        ss->class_word = malloc(sizeof(double*)*num_topics);
        for (i = 0; i < num_topics; i++)
        {
    		ss->class_total[i] = 0;
    		ss->class_word[i] = malloc(sizeof(double)*num_terms);
    		for (j = 0; j < num_terms; j++)
    		{
    			ss->class_word[i][j] = 0;
    		}
        }
        return(ss);
    }
    void free_lda_ss(lda_suffstats* ss, lda_model* model)
    {
    	int i=0;
    	for (i=0; i < model->num_topics; i++)
    		free(ss->class_word[i]);
    	free(ss->class_word);
    	free(ss->class_total);
    	free(ss);
    }
    
    /*
     * various intializations for the sufficient statistics
     *
     */
    
    void zero_initialize_ss(lda_suffstats* ss, lda_model* model)
    {
        int k, w;
        for (k = 0; k < model->num_topics; k++)
        {
            ss->class_total[k] = 0;
            for (w = 0; w < model->num_terms; w++)
            {
                ss->class_word[k][w] = 0;
            }
        }
        ss->num_docs = 0;
        ss->alpha_suffstats = 0;
    }
    
    
    void random_initialize_ss(lda_suffstats* ss, lda_model* model)
    {
        int num_topics = model->num_topics;
        int num_terms = model->num_terms;
        int k, n;
        for (k = 0; k < num_topics; k++)
        {
            for (n = 0; n < num_terms; n++)
            {
                ss->class_word[k][n] += 1.0/num_terms + myrand();
                ss->class_total[k] += ss->class_word[k][n];
            }
        }
    }
    
    
    void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
    {
        int num_topics = model->num_topics;
        int i, k, d, n;
        document* doc;
    	
        for (k = 0; k < num_topics; k++)//每一个主题用一些文档的来初始化其主题-词 分布 的充分统计量
        {
            for (i = 0; i < NUM_INIT; i++)//在文档集中随机挑选NUM_INIT=1个文档
            {
                d = floor(myrand() * c->num_docs); //随机挑选
                printf("initialized with document %d
    ", d);
                doc = &(c->docs[d]);
                for (n = 0; n < doc->length; n++)
                {
    				//将NUM_INIT个文档的词频统计,作为第k个主题的词分布的统计量
                    ss->class_word[k][doc->words[n]] += doc->counts[n]; 
                }
            }
            for (n = 0; n < model->num_terms; n++)
            {
                ss->class_word[k][n] += 1.0;//由于后面要对它求log,所以值必须大于0
    			//是对class_word按行求和的结果,是主题k被选中的次数,也就是该主题下的词出现次数的和
                ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
            }
    		//这样用文档的词频信息初始化,total必定不为0
    		//if (ss->class_total[k] == 0)
    		//	ss->class_total[k] = 1;         
        }
    }
    
    /*
     * allocate new lda model
     *
     */
    
    lda_model* new_lda_model(int num_terms, int num_topics)
    {
        int i,j;
        lda_model* model;
    
        model = malloc(sizeof(lda_model));
        model->num_topics = num_topics;
        model->num_terms = num_terms;
        model->alpha = 1.0;
        model->log_prob_w = malloc(sizeof(double*)*num_topics);
        for (i = 0; i < num_topics; i++)
        {
    	model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
    	for (j = 0; j < num_terms; j++)
    	    model->log_prob_w[i][j] = 0;
        }
        return(model);
    }
    
    
    /*
     * deallocate new lda model
     *
     */
    
    void free_lda_model(lda_model* model)
    {
        int i;
    
        for (i = 0; i < model->num_topics; i++)
        {
    		free(model->log_prob_w[i]);
        }
        free(model->log_prob_w);
    	free(model);
    }
    
    
    /*
     * save an lda model
     *
     */
    
    void save_lda_model(lda_model* model, char* model_root)
    {
        char filename[100];
        FILE* fileptr;
        int i, j;
    
        sprintf(filename, "%s.beta", model_root);
        fileptr = fopen(filename, "w");
        for (i = 0; i < model->num_topics; i++)
        {
    		for (j = 0; j < model->num_terms; j++)
    		{
    			fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
    		}
    		fprintf(fileptr, "
    ");
        }
        fclose(fileptr);
    
        sprintf(filename, "%s.other", model_root);
        fileptr = fopen(filename, "w");
        fprintf(fileptr, "num_topics %d
    ", model->num_topics);
        fprintf(fileptr, "num_terms %d
    ", model->num_terms);
        fprintf(fileptr, "alpha %5.10f
    ", model->alpha);
        fclose(fileptr);
    }
    
    
    lda_model* load_lda_model(char* model_root)
    {
        char filename[100];
        FILE* fileptr;
        int i, j, num_terms, num_topics;
        float x, alpha;
    	lda_model* model;
    
        sprintf(filename, "%s.other", model_root);
        printf("loading %s
    ", filename);
        fileptr = fopen(filename, "r");
        fscanf(fileptr, "num_topics %d
    ", &num_topics);
        fscanf(fileptr, "num_terms %d
    ", &num_terms);
        fscanf(fileptr, "alpha %f
    ", &alpha);
        fclose(fileptr);
    
        model = new_lda_model(num_terms, num_topics);
        model->alpha = alpha;
    
        sprintf(filename, "%s.beta", model_root);
        printf("loading %s
    ", filename);
        fileptr = fopen(filename, "r");
        for (i = 0; i < num_topics; i++)
        {
            for (j = 0; j < num_terms; j++)
            {
                fscanf(fileptr, "%f", &x);
                model->log_prob_w[i][j] = x;
            }
        }
        fclose(fileptr);
        return(model);
    }
    

    3.3 lda-estimate.c

    当中包括和模型求解相关的函数,em算法(run_em)和e-step(doc_e_step)

    #include "lda-estimate.h"
    
    /*
     * perform inference on a document and update sufficient statistics
     *
     */
    int LAG=5;
    double doc_e_step(document* doc, double* gamma, double** phi,
                      lda_model* model, lda_suffstats* ss)
    {
        double likelihood;
        int n, k;
    	double gamma_sum;
        // posterior inference
    	
        likelihood = lda_inference(doc, model, gamma, phi);
    
        // update sufficient statistics
    
    	//这里更新alpha的 充分统计量
    	//alpha_suffstats = sum(digamma(gamma)) - K*digamma(gamm_sum)
        gamma_sum = 0;
        for (k = 0; k < model->num_topics; k++)
        {
            gamma_sum += gamma[k];
            ss->alpha_suffstats += digamma(gamma[k]); //log gamma函数的一阶导数
        }
        ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);
    
        for (n = 0; n < doc->length; n++)
        {
            for (k = 0; k < model->num_topics; k++)
            {
    			//phi[n][k]是第n个word由第k个主题生成的概率。在log space
                ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
                ss->class_total[k] += doc->counts[n]*phi[n][k];
            }
        }
    	//增加充分统计量的文档数
        ss->num_docs = ss->num_docs + 1;
    
        return(likelihood);
    }
    
    
    /*
     * writes the word assignments line for a document to a file
     *
     */
    
    int write_word_assignment(FILE* result, FILE* f, document* doc, double** phi, lda_model* model)
    {
    	int n;
    	//f中保存phi, result中保存结果:[wordID:概率最大的topicID]
    	fprintf(result, "%03d", doc->length); 
    	for (n = 0; n < doc->length; n++)
    	{
    		int k;
    		for (k=0;k< model->num_topics;k++) 
    			fprintf(f, "%f	",phi[n][k]); //一行相应一个word由每个topic生成的概率
    		fprintf(f, "
    ");
    
    		fprintf(result, " %04d:%02d",
    			doc->words[n], argmax(phi[n], model->num_topics));//argmax 找出phi[n]中最大的元素相应的索引位置,也就是topicID
    	}
    	fprintf(result, "
    ");  //一行相应一个文档的 每个word相应的概率最大的topic
    	fflush(f);
    	fflush(result);
    	return 0;
    	
    }
    
    /*
     * saves the gamma parameters of the current dataset
     *
     */
    
    void save_gamma(char* filename, double** gamma, int num_docs, int num_topics)
    {
        FILE* fileptr;
        int d, k;
        fileptr = fopen(filename, "w");
    
        for (d = 0; d < num_docs; d++)
        {
    	fprintf(fileptr, "%5.10f", gamma[d][0]);
    	for (k = 1; k < num_topics; k++)
    	{
    	    fprintf(fileptr, " %5.10f", gamma[d][k]);
    	}
    	fprintf(fileptr, "
    ");
        }
        fclose(fileptr);
    }
    
    
    /*
     * run_em
     *
     */
    
    long double  run_em(char* start, char* directory, corpus* corpus)
    {
    	clock_t start_EM, finish_EM;
    	double * theta;
    	FILE * thetaFile;
    
        int d, n;
        lda_model *model = NULL;
        double **var_gamma, **phi;
    	FILE* likelihood_file;
    	int max_length;
    	char filename[500];
    	char filename1[500];
    	int i;
    	double likelihood, likelihood_old, converged;
    	lda_suffstats* ss;
    	FILE* w_asgn_file;
    	FILE* result;
    
        // allocate variational parameters 
    	        //为变分參数gamma分配空间,维度:文档数*主题数 
        var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
        for (d = 0; d < corpus->num_docs; d++)
    		var_gamma[d] = malloc(sizeof(double) * NTOPICS);
    	        //为变分參数phi分配空间。维度:文档集中文档的最大单词数(去重) * 主题数
        max_length = max_corpus_length(corpus);
        phi = malloc(sizeof(double*)*max_length);
        for (n = 0; n < max_length; n++)
    		phi[n] = malloc(sizeof(double) * NTOPICS);
    
        // initialize model
        ss = NULL;
        if (strcmp(start, "seeded")==0)
        {
            model = new_lda_model(corpus->num_terms, NTOPICS);
            ss = new_lda_suffstats(model);
            corpus_initialize_ss(ss, model, corpus);  //初始化tw分布
            lda_mle(model, ss, 0);  //compute MLE lda model from sufficient statistics
            model->alpha = INITIAL_ALPHA;
        }
        else if (strcmp(start, "random")==0)
        {
            model = new_lda_model(corpus->num_terms, NTOPICS);
            ss = new_lda_suffstats(model);
            random_initialize_ss(ss, model);
            lda_mle(model, ss, 0);
            model->alpha = INITIAL_ALPHA;
        }
        else
        {
            model = load_lda_model(start);
            ss = new_lda_suffstats(model);
        }
    
        sprintf(filename,"%s/000",directory);
        save_lda_model(model, filename);
    
        // run expectation maximization
    
        i = 0;
        likelihood_old = 0;
    	converged = 1;
        sprintf(filename, "%s/likelihood.dat", directory);
        likelihood_file = fopen(filename, "w");
    
    	start_EM = clock();
    	//em迭代继续运行条件:下面1和2同一时候满足
    	//1,
    	//converaged<0 也就是新值比旧值好
    	//或converaged>EM_CONVERGED 新值和旧值还不够近似
    	//或迭代步骤运行太少(<=2)
    	//2,
    	//当前迭代step数在规定的最大迭代步数以内
    	//或者没有指定最大迭代步数(-1)
        while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) &&  ((i <= EM_MAX_ITER) || (EM_MAX_ITER == -1))   )
        {
            i++; printf("**** em iteration %d ****
    ", i);
            likelihood = 0;
            zero_initialize_ss(ss, model); //把统计量的值都赋为0
    
            // e-step
    	//固定alpha和beta。对每一篇文档找到优化的gamma和phi,更新充分统计量,计算似然
            for (d = 0; d < corpus->num_docs; d++)
            {
                if ((d % 10) == 0) printf("document %d in %d EM iteration
    ",d, i);
                likelihood += doc_e_step(&(corpus->docs[d]),
                                         var_gamma[d],
                                         phi,
                                         model,
                                         ss);
            }
    
            // m-step
    		
    	//依据当前的充分统计量。更新模型參数alpha,beta
            lda_mle(model, ss, ESTIMATE_ALPHA);
    
            // check for convergence
    
            converged = (likelihood_old - likelihood) / (likelihood_old);
            if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
            likelihood_old = likelihood;
    
            // output model and likelihood
    
            fprintf(likelihood_file, "%10.10f	%5.5e
    ", likelihood, converged);
            fflush(likelihood_file);
            if ((i % LAG) == 0)
            {
                sprintf(filename,"%s/%03d",directory, i);
                save_lda_model(model, filename);
                sprintf(filename,"%s/%03d.gamma",directory, i);
                save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
            }
        }
    	//EM迭代结束
    	finish_EM = clock();
    	printf("nTopic = %d, nTerm = %d estimation time: 
    ", model->num_topics, model->num_terms);
    	printf("  EM iteration takes %f seconds(this is %d miniutes)
    ", (double)(finish_EM-start_EM)/CLOCKS_PER_SEC, (finish_EM-start_EM)/CLOCKS_PER_SEC/60);
    
        // output the final model
        sprintf(filename,"%s/final",directory);
        save_lda_model(model, filename);   //此函数中保存了beta到final.beta文件  , 还有.other文件
        sprintf(filename,"%s/final.gamma",directory);
        save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
    
        
    	// output theta
    	theta = (double*)malloc(sizeof(double)*model->num_topics);
    	sprintf(filename1, "%s/final.theta", directory);  
    	thetaFile = fopen(filename1, "w");
    	// output the word assignments (for visualization)
    	sprintf(filename1, "%s/result-doc-assgn.dat", directory);  
    	result = fopen(filename1, "w");
        for (d = 0; d < corpus->num_docs; d++)
        {
    	sprintf(filename, "%s/result_%d_phi.dat", directory,d);          //调试这一部分有越界的错误,完成,filename数组空间太小。
    	w_asgn_file = fopen(filename, "w");
            printf("final e step document %d
    ",d);
            likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
    	write_word_assignment(result, w_asgn_file, &(corpus->docs[d]), phi, model);    
    	computeTheta(  thetaFile,  &(corpus->docs[d]), phi,  model,  theta);
    	fclose(w_asgn_file);
        }
        fclose(result);
    	fclose(thetaFile);
        fclose(likelihood_file);
    	//释放空间
    	free(theta);
    	for (d = 0; d < corpus->num_docs; d++)
    		free(var_gamma[d]);
    	free(var_gamma); 
    	for (n = 0; n < max_length; n++)
    		free(phi[n]);
    	free(phi);
    	free_lda_ss( ss,  model);
    	free_lda_model(model);
    
    	return (long double)(finish_EM-start_EM)/CLOCKS_PER_SEC/60;
    }
    
    void computeTheta( FILE* thetaFile, document* doc, double** phi, lda_model* model, double * theta)
    {
    	int n;
    
    	for (n=0; n< model->num_topics; n++)
    		theta[n] = 0;
    	for (n = 0;  n<doc->length; n++)
    	{
    		int topicIndex = argmax(phi[n], model->num_topics);
    		theta[  topicIndex  ] = theta[  topicIndex  ] + doc->counts[ n ];
    	}
    
    	for (n=0; n<model->num_topics; n++)
    	{
    		theta[n] = theta[n]/doc->total;
    		fprintf(thetaFile, "%f	", theta[n]);
    	}
    	fprintf(thetaFile, "
    ");
    	fflush(thetaFile);
    
    }
    
    /*
     * read settings.
     *
     */
    
    void read_settings(char* filename)
    {
        FILE* fileptr;
        char alpha_action[100];
        fileptr = fopen(filename, "r");
        fscanf(fileptr, "var max iter %d
    ", &VAR_MAX_ITER);
        fscanf(fileptr, "var convergence %f
    ", &VAR_CONVERGED);
        fscanf(fileptr, "em max iter %d
    ", &EM_MAX_ITER);
        fscanf(fileptr, "em convergence %f
    ", &EM_CONVERGED);
        fscanf(fileptr, "alpha %s", alpha_action);
        if (strcmp(alpha_action, "fixed")==0)
        {
    	ESTIMATE_ALPHA = 0;
        }
        else
        {
    	ESTIMATE_ALPHA = 1;
        }
        fclose(fileptr);
    }
    
    
    /*
     * inference only
     *
     */
    
    void infer(char* model_root, char* save, corpus* corpus)
    {
        FILE* fileptr;
    	FILE* result;
    	FILE* w_asgn_file;
        char filename[100]; 
    	char filename1[200];
        int i, d, n;
        lda_model *model;
        double **var_gamma, likelihood, **phi;
        document* doc;
    
    	/*double ***corpusPhi;
    	corpusPhi = (double***)malloc(sizeof(double**)*(corpus->num_docs));
    	for (i=0;i<corpus.num_docs;j++)
    	{
    	corpusPhi[i] = (double**)malloc(sizeof(double*)*)
    	}*/
    	sprintf(filename1, "%s/result-doc-assgn.dat", save);  
    	result = fopen(filename1, "w");
    
        model = load_lda_model(model_root);
        var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
        for (i = 0; i < corpus->num_docs; i++)
    		var_gamma[i] = (double*)malloc(sizeof(double)*model->num_topics);
    
    	//int max_length = max_corpus_length(corpus);  
    
        sprintf(filename, "%s-lda-lhood.dat", save);
        fileptr = fopen(filename, "w");
    
        for (d = 0; d < corpus->num_docs; d++)
        {
    		if (((d % 100) == 0) && (d>0)) printf("document %d
    ",d);
    
    		doc = &(corpus->docs[d]);
    		phi = (double**) malloc(sizeof(double*) * doc->length);
    		//phi = (double**) malloc(sizeof(double*) * max_length);  
    		for (n = 0; n < doc->length; n++)
    		//for (n = 0; n < max_length; n++)                            
    			phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
    		likelihood = lda_inference(doc, model, var_gamma[d], phi);
    
    		fprintf(fileptr, "%5.5f
    ", likelihood);
    
    		//输出每个文档的phi到文件result_%d_phi.dat中   另外每个word相应的概率最大的topic保存在文件result-doc-assgn.dat中  一行相应一个文档
    		sprintf(filename, "%s/result_%d_phi.dat", save,d);                                               
    		w_asgn_file = fopen(filename, "w");
    		printf("final e step document %d
    ",d);
    		write_word_assignment(result,w_asgn_file, &(corpus->docs[d]), phi, model);
    		fclose(w_asgn_file);
    		for (n = 0; n < doc->length; n++)
    			free(phi[n]);
    		free(phi);
        }
        fclose(fileptr);
        sprintf(filename, "%s-gamma.dat", save);
        save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
    
    	fclose(result);
    	for (d = 0; d < corpus->num_docs; d++)
    		free(var_gamma[d]);
    	free(var_gamma); 
    
    	free_lda_model(model);
    }
    
    
    

    3.4 lda-inference.c

    当中包括变分參数求解相关的函数


    #include "lda-inference.h"
    
    /*
     * variational inference
     *
     */
    int lisnan(double x) { 
    	return x != x; 
    }
    double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
    {
        double converged = 1;
        double phisum = 0, likelihood = 0;
        double likelihood_old = 0,  *oldphi=(double *)malloc(sizeof(double)*(model->num_topics));
        
        int k, n, var_iter;
       double *digamma_gam=(double *)malloc(sizeof(double)*(model->num_topics));
    
        // compute posterior dirichlet
    
        for (k = 0; k < model->num_topics; k++)
        {
    	//初始化变分參数gamma=alpha + 当前文档中单词个数(不去重) N / 主题个数 k
            var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
            //log gamma函数的一阶导数
    	digamma_gam[k] = digamma(var_gamma[k]);
            //初始化变分參数phi=1/k
    	for (n = 0; n < doc->length; n++)
                phi[n][k] = 1.0/model->num_topics;
        }
        var_iter = 0;
    	//開始迭代
        while ((converged > VAR_CONVERGED) &&
               ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
        {
    	var_iter++;
    	for (n = 0; n < doc->length; n++)
    	{
                phisum = 0;
                for (k = 0; k < model->num_topics; k++)
                {
                    oldphi[k] = phi[n][k];
    		//更新变分參数 phi
    		//就是论文中变分判断算法的式子 phi(n,i) = b(i,wn) * exp(digamma(gamma(i)))
    		//这里由于有exp所以在log空间计算,算得的phi也是log space的
                    phi[n][k] =
                        digamma_gam[k] +
                        model->log_prob_w[k][doc->words[n]];
    
                    if (k > 0)
                        phisum = log_sum(phisum, phi[n][k]);//在log space对phi求和
                    else
                        phisum = phi[n][k]; // note, phi is in log space
                }
    
                for (k = 0; k < model->num_topics; k++)
                {
    		//归一化,使phi(n)和为1
                    phi[n][k] = exp(phi[n][k] - phisum);
    		//更新变分參数 gamma
                    var_gamma[k] =
                        var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                    // !!! a lot of extra digamma's here because of how we're computing it
                    // !!! but its more automatically updated too.
                    digamma_gam[k] = digamma(var_gamma[k]);
                }
            }
    
            likelihood = compute_likelihood(doc, model, phi, var_gamma);
            assert(!isnan(likelihood));
            converged = (likelihood_old - likelihood) / likelihood_old;
            likelihood_old = likelihood;
    
            // printf("[LDA INF] %8.5f %1.3e
    ", likelihood, converged);
        }//迭代结束
        return(likelihood);
    }
    
    
    /*
     * compute likelihood bound
     *
     */
    //依照论文附录(15)式计算L(gamma,phi;alpha,beta)
    double
    compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma)
    {
        double likelihood = 0, digsum = 0, var_gamma_sum = 0, *dig=(double *)malloc(sizeof(double)*(model->num_topics));
        int k, n;
    
        for (k = 0; k < model->num_topics; k++)
        {
    	dig[k] = digamma(var_gamma[k]);
    	var_gamma_sum += var_gamma[k];
        }
        digsum = digamma(var_gamma_sum);
    	//论文(14)式中的Eq,第1个和第4个是合在一起再拆分算的,第2,3,5个是合在一起算的
    	//Eq[logp(theta|alpha)]中的前两个部分 和 Eq[logq(theta)]中第一部分 
        likelihood =
    	log_gamma(model->alpha * model -> num_topics)
    	- model -> num_topics * log_gamma(model->alpha)
    	- (log_gamma(var_gamma_sum));
    
        for (k = 0; k < model->num_topics; k++)
        {
        //Eq[logp(theta|alpha)]中的第三个部分 和 Eq[logq(theta)]中剩余的 
    	likelihood +=
    	    (model->alpha - 1)*(dig[k] - digsum) + log_gamma(var_gamma[k])
    	    - (var_gamma[k] - 1)*(dig[k] - digsum);
    	//Eq[logp(z|theta)] + Eq[logp(w|z,beta)] - Eq[logq(z)]
    	for (n = 0; n < doc->length; n++)
    	{
                if (phi[n][k] > 0)
                {
                    likelihood += doc->counts[n]*
                        (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                    + model->log_prob_w[k][doc->words[n]]));
                }
            }
        }
        return(likelihood);
    }
    


    3.5 lda-data.c

    数据集读入

    #include "lda-data.h"
    
    corpus* read_data(char* data_filename)
    {
        FILE *fileptr;
        int length, count, word, n, nd, nw;
        corpus* c;
    
        printf("reading data from %s
    ", data_filename);
        c = malloc(sizeof(corpus));
        c->docs = 0;
        c->num_terms = 0;
        c->num_docs = 0;
        fileptr = fopen(data_filename, "r");
        nd = 0; nw = 0;
        while ((fscanf_s(fileptr, "%10d", &length) != EOF)) //读入每行数据的第一个数字,是文档的字典大小(文档中单词去重的个数)
        {
    	//对于第nd个文档
    	c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); //(数据类型*)realloc(要改变内存大小的指针名,新的大小)  新的大小一定要大于原来的大小。不然的话会导致数据丢失!

    c->docs[nd].length = length; //文档中出现过的单词的个数。也就是文档字典大小,是去重的 c->docs[nd].total = 0; //文档中总单词个数。不去重,是对counts的求和。 c->docs[nd].words = malloc(sizeof(int)*length); //文档中的word在文档集字典中的ID c->docs[nd].counts = malloc(sizeof(int)*length); //文档中word出现次数 for (n = 0; n < length; n++)//读入每行数据剩下的数据。词频统计 { fscanf_s(fileptr, "%10d:%10d", &word, &count); //读入每一个 [wordID:word出现次数] word = word - OFFSET; c->docs[nd].words[n] = word; c->docs[nd].counts[n] = count; c->docs[nd].total += count; if (word >= nw) { nw = word + 1; } //nw记录文档集最大的那个word ID。也就是文档集字典中的单词个数 //if (word >= nw) { nw = word; } } nd++; } fclose(fileptr); c->num_docs = nd; c->num_terms = nw; printf("number of docs : %d ", nd); printf("number of terms : %d ", nw); return(c); } int max_corpus_length(corpus* c)//输出数据集中单词数(去重后)最多的文档的单词数。这个length是去重后的长度 { int n, max = 0; for (n = 0; n < c->num_docs; n++) if (c->docs[n].length > max) max = c->docs[n].length; return(max); }


    3.6 lda-alpha.c

    牛顿法计算模型參数alpha

    #include "lda-alpha.h"
    #include "lda-inference.h"
    /*
     * objective function and its derivatives
     *
     */
    
    double alhood(double a, double ss, int D, int K)
    { return(D * (log_gamma(K * a) - K * log_gamma(a)) + (a - 1) * ss); }
    
    double d_alhood(double a, double ss, int D, int K)
    { return(D * (K * digamma(K * a) - K * digamma(a)) + ss); }
    
    double d2_alhood(double a, int D, int K)
    { return(D * (K * K * trigamma(K * a) - K * trigamma(a))); }
    
    
    /*
     * newtons method
     *
     */
    
    double opt_alpha(double ss, int D, int K)
    {
        double a, log_a, init_a = 100;
        double f, df, d2f;
        int iter = 0;
    
        log_a = log(init_a);
        do
        {
            iter++;
            a = exp(log_a);
            if (isnan(a))
            {
                init_a = init_a * 10;
                printf("warning : alpha is nan; new init = %5.5f
    ", init_a);
                a = init_a;
                log_a = log(a);
            }
            f = alhood(a, ss, D, K); //附录A4.2中的L(a)
            df = d_alhood(a, ss, D, K); //L对a的一阶偏导
            d2f = d2_alhood(a, D, K); //二阶偏导
            log_a = log_a - df/(d2f * a + df);//迭代公式
            printf("alpha maximization : %5.5f   %5.5f
    ", f, df);
        }
        while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
        return(exp(log_a));
    }
    


    还有cokus.c 和 utils.c 中是一些数学计算的函数。






  • 相关阅读:
    正则化方法:L1和L2 regularization、数据集扩增、dropout
    xgboost原理及应用
    机器学习系列------1. GBDT算法的原理
    c++ stl容器set成员函数介绍及set集合插入,遍历等用法举例
    STL中的set容器的一点总结
    2016-12-17 新浪博客服务器挂掉了,所有博客页面都无法打开
    Centos 6.5 下php5.6.2 的编译安装
    Docker的基本组成
    Docker简介
    基于Dubbo框架构建分布式服务(集群容错&负载均衡)
  • 原文地址:https://www.cnblogs.com/zsychanpin/p/6779998.html
Copyright © 2020-2023  润新知