1。说明
本文对LDA原始论文的作者所提供的C代码中LDA的主要逻辑部分做凝视,原代码可在这里下载到:https://github.com/Blei-Lab/lda-c
这份代码实现论文《Latent Dirichlet Allocation》中介绍的LDA模型。用变分EM算法求解參数。
为了使代码在vs2013中执行。做了一些微小修改,但不影响原代码的逻辑。
vs2013project可在我的资源中下载:
http://download.csdn.net/detail/happyer88/8861773
----------------------------------------------------------------------2。准备知识
2.1,LDA原理及推导
《Latent Dirichlet Allocation》论文
我的LDA学习笔记1-4系列
2.2,充分统计量
https://en.wikipedia.org/wiki/Sufficient_statistic
----------------------------------------------------------------------
3,代码凝视
3.1 main.c
原代码中main函数在lda-estimate.c中,创建vsproject时把它挪到了main.c中。
<span style="font-size:14px;">#include <stdio.h> #include <stdlib.h> #include <io.h> #include<time.h> #include "cokus.h" #include "lda-alpha.h" #include"lda-data.h" #include"lda-estimate.h" #include"lda-inference.h" #include"lda.h" #include"utils.h" char * datasetName = "scene8"; //数据集名字,必须与目录名字同样 int expec = 1; // expec==1,expect , inf int vocabularySize_global = 512; // 字典大小 int k = 100; //topic的数目 char* params ="../settings.txt"; //预计:预计过程须要的參数 char* params1 ="../inf-settings.txt"; //判断:判断过程须要的參数 char dataset_train[500]; //预计:预计參数的数据文件 char dataset_test[500]; //判断:判断的数据文件 char dir_trainData[500]; //预计:预计的中间数据和终于数据目录路径 char dir_testData[500]; //判断:判断的中间数据和终于数据目录路径 char model_pre[500]; void assignParameter(); int main() { corpus* corpus; clock_t start,finish; double totaltime; long double totaltime_EMiteration; assignParameter(); //myCreateDirectory(); start=clock(); if(expec) { INITIAL_ALPHA = 1; //狄利克雷分布的參数alpha NTOPICS =k; //主题个数 read_settings(params); //读取參数。。。
最大迭代次数。收敛条件阈值;EM的最大迭代次数、收敛条件阈值; corpus = read_data(dataset_train); //读取数据。
。。
数据格式:(每一行)在一个文档中出现的word总数目(去掉次数=0的)index_word1:counts index_word2:counts ........... totaltime_EMiteration = run_em("seeded", dir_trainData, corpus); //求解參数。。。EM过程求解參数--输入:中间数据和终于数据存放目录、语料库 printf("inferencing test images! "); read_settings(params1); corpus = read_data(dataset_test); infer(model_pre, dir_testData, corpus); //用完開始释放 } else { read_settings(params1); corpus = read_data(dataset_test); infer(model_pre, dir_testData, corpus); } finish=clock(); totaltime=(double)(finish-start)/CLOCKS_PER_SEC; printf("nTopic = %d, nTerm = %d estimation time: ", k, vocabularySize_global); printf(" EM iteration takes %f seconds(this is %f miniutes) ", totaltime_EMiteration*60, totaltime_EMiteration); printf("Running Time(--estimate trainData and inference trainData and testData--):%f ",totaltime); printf(" train--- final data are saved to: %s ", dir_trainData); printf("test---- final data are saved to: %s ", dir_testData); getch(); return(0); } void assignParameter() { sprintf(dataset_train,"../train.txt"); sprintf(dataset_test,"../test.txt"); sprintf(dir_trainData, "../ResultData"); sprintf(dir_testData, "../ResultData"); sprintf(model_pre, "../ResultData/final"); } </span>
3.2 lda.h
自己定义数据结构
#ifndef LDA_H #define LDA_H typedef struct { int* words; //文档中的单词,这里存的是该单词在文档集字典中的ID int* counts; //每一个单词文档中出现次数 int length; //文档中出现的单词个数,去重的。也就是反复出现的单词不计 int total; //文档中总单词数,不去重 } document; typedef struct { document* docs; int num_terms; //文档集中出现的单词个数。去重的。也就是文档集字典大小 int num_docs; //文档集中文档个数 } corpus; typedef struct { double alpha; //论文中的模型參数alpha,本来应该是k维。程序中实现的是对称分布的Dirichlet,k维的值是同样的 double** log_prob_w; //论文中的模型參数beta。每一行存一个主题的词分布,维<span style="font-family: Arial, Helvetica, sans-serif;">度k*V</span> int num_topics; //主题个数 int num_terms; } lda_model; typedef struct { double** class_word;//模型參数beta的充分统计量。维度:主题个数*文档集字典大小(K*V) double* class_total;//存主题分布z的 充分统计量,维度:主题个数K double alpha_suffstats; //模型參数alpha的充分统计量 int num_docs; } lda_suffstats; #endif
3.3 lda-model.c
主要是初始化lda模型(有三种方法),一种是全部值都为0,'random'是用随机数,'seeded'是随机挑选一些文档来初始化模型
还有计算模型參数alpha , beta (lda_mle)
#include "lda-model.h" /* * compute MLE lda model from sufficient statistics * */ void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) { int k; int w; for (k = 0; k < model->num_topics; k++) { for (w = 0; w < model->num_terms; w++) { if (ss->class_word[k][w] > 0) { //log_prob_w是模型參数beta,主题-词分布 //class_word和class_total都是充分统计量(sufficient statistic) //所以log相减是在做归一化,beta中的值是概率。要在0-1之间 model->log_prob_w[k][w] = log(ss->class_word[k][w]) - log(ss->class_total[k]); } else model->log_prob_w[k][w] = -100; } } if (estimate_alpha == 1) { //用牛顿方法优化得到alpha //注意这里alpha_suffstats的值 model->alpha = opt_alpha(ss->alpha_suffstats, ss->num_docs, model->num_topics); printf("new alpha = %5.5f ", model->alpha); } } /* * allocate sufficient statistics * */ lda_suffstats* new_lda_suffstats(lda_model* model) { int num_topics = model->num_topics; int num_terms = model->num_terms; int i,j; lda_suffstats* ss = malloc(sizeof(lda_suffstats)); ss->class_total = malloc(sizeof(double)*num_topics); ss->class_word = malloc(sizeof(double*)*num_topics); for (i = 0; i < num_topics; i++) { ss->class_total[i] = 0; ss->class_word[i] = malloc(sizeof(double)*num_terms); for (j = 0; j < num_terms; j++) { ss->class_word[i][j] = 0; } } return(ss); } void free_lda_ss(lda_suffstats* ss, lda_model* model) { int i=0; for (i=0; i < model->num_topics; i++) free(ss->class_word[i]); free(ss->class_word); free(ss->class_total); free(ss); } /* * various intializations for the sufficient statistics * */ void zero_initialize_ss(lda_suffstats* ss, lda_model* model) { int k, w; for (k = 0; k < model->num_topics; k++) { ss->class_total[k] = 0; for (w = 0; w < model->num_terms; w++) { ss->class_word[k][w] = 0; } } ss->num_docs = 0; ss->alpha_suffstats = 0; } void random_initialize_ss(lda_suffstats* ss, lda_model* model) { int num_topics = model->num_topics; int num_terms = model->num_terms; int k, n; for (k = 0; k < num_topics; k++) { for (n = 0; n < num_terms; n++) { ss->class_word[k][n] += 1.0/num_terms + myrand(); ss->class_total[k] += ss->class_word[k][n]; } } } void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c) { int num_topics = model->num_topics; int i, k, d, n; document* doc; for (k = 0; k < num_topics; k++)//每一个主题用一些文档的来初始化其主题-词 分布 的充分统计量 { for (i = 0; i < NUM_INIT; i++)//在文档集中随机挑选NUM_INIT=1个文档 { d = floor(myrand() * c->num_docs); //随机挑选 printf("initialized with document %d ", d); doc = &(c->docs[d]); for (n = 0; n < doc->length; n++) { //将NUM_INIT个文档的词频统计,作为第k个主题的词分布的统计量 ss->class_word[k][doc->words[n]] += doc->counts[n]; } } for (n = 0; n < model->num_terms; n++) { ss->class_word[k][n] += 1.0;//由于后面要对它求log,所以值必须大于0 //是对class_word按行求和的结果,是主题k被选中的次数,也就是该主题下的词出现次数的和 ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; } //这样用文档的词频信息初始化,total必定不为0 //if (ss->class_total[k] == 0) // ss->class_total[k] = 1; } } /* * allocate new lda model * */ lda_model* new_lda_model(int num_terms, int num_topics) { int i,j; lda_model* model; model = malloc(sizeof(lda_model)); model->num_topics = num_topics; model->num_terms = num_terms; model->alpha = 1.0; model->log_prob_w = malloc(sizeof(double*)*num_topics); for (i = 0; i < num_topics; i++) { model->log_prob_w[i] = malloc(sizeof(double)*num_terms); for (j = 0; j < num_terms; j++) model->log_prob_w[i][j] = 0; } return(model); } /* * deallocate new lda model * */ void free_lda_model(lda_model* model) { int i; for (i = 0; i < model->num_topics; i++) { free(model->log_prob_w[i]); } free(model->log_prob_w); free(model); } /* * save an lda model * */ void save_lda_model(lda_model* model, char* model_root) { char filename[100]; FILE* fileptr; int i, j; sprintf(filename, "%s.beta", model_root); fileptr = fopen(filename, "w"); for (i = 0; i < model->num_topics; i++) { for (j = 0; j < model->num_terms; j++) { fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]); } fprintf(fileptr, " "); } fclose(fileptr); sprintf(filename, "%s.other", model_root); fileptr = fopen(filename, "w"); fprintf(fileptr, "num_topics %d ", model->num_topics); fprintf(fileptr, "num_terms %d ", model->num_terms); fprintf(fileptr, "alpha %5.10f ", model->alpha); fclose(fileptr); } lda_model* load_lda_model(char* model_root) { char filename[100]; FILE* fileptr; int i, j, num_terms, num_topics; float x, alpha; lda_model* model; sprintf(filename, "%s.other", model_root); printf("loading %s ", filename); fileptr = fopen(filename, "r"); fscanf(fileptr, "num_topics %d ", &num_topics); fscanf(fileptr, "num_terms %d ", &num_terms); fscanf(fileptr, "alpha %f ", &alpha); fclose(fileptr); model = new_lda_model(num_terms, num_topics); model->alpha = alpha; sprintf(filename, "%s.beta", model_root); printf("loading %s ", filename); fileptr = fopen(filename, "r"); for (i = 0; i < num_topics; i++) { for (j = 0; j < num_terms; j++) { fscanf(fileptr, "%f", &x); model->log_prob_w[i][j] = x; } } fclose(fileptr); return(model); }
3.3 lda-estimate.c
当中包括和模型求解相关的函数,em算法(run_em)和e-step(doc_e_step)
#include "lda-estimate.h" /* * perform inference on a document and update sufficient statistics * */ int LAG=5; double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) { double likelihood; int n, k; double gamma_sum; // posterior inference likelihood = lda_inference(doc, model, gamma, phi); // update sufficient statistics //这里更新alpha的 充分统计量 //alpha_suffstats = sum(digamma(gamma)) - K*digamma(gamm_sum) gamma_sum = 0; for (k = 0; k < model->num_topics; k++) { gamma_sum += gamma[k]; ss->alpha_suffstats += digamma(gamma[k]); //log gamma函数的一阶导数 } ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum); for (n = 0; n < doc->length; n++) { for (k = 0; k < model->num_topics; k++) { //phi[n][k]是第n个word由第k个主题生成的概率。在log space ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k]; ss->class_total[k] += doc->counts[n]*phi[n][k]; } } //增加充分统计量的文档数 ss->num_docs = ss->num_docs + 1; return(likelihood); } /* * writes the word assignments line for a document to a file * */ int write_word_assignment(FILE* result, FILE* f, document* doc, double** phi, lda_model* model) { int n; //f中保存phi, result中保存结果:[wordID:概率最大的topicID] fprintf(result, "%03d", doc->length); for (n = 0; n < doc->length; n++) { int k; for (k=0;k< model->num_topics;k++) fprintf(f, "%f ",phi[n][k]); //一行相应一个word由每个topic生成的概率 fprintf(f, " "); fprintf(result, " %04d:%02d", doc->words[n], argmax(phi[n], model->num_topics));//argmax 找出phi[n]中最大的元素相应的索引位置,也就是topicID } fprintf(result, " "); //一行相应一个文档的 每个word相应的概率最大的topic fflush(f); fflush(result); return 0; } /* * saves the gamma parameters of the current dataset * */ void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) { FILE* fileptr; int d, k; fileptr = fopen(filename, "w"); for (d = 0; d < num_docs; d++) { fprintf(fileptr, "%5.10f", gamma[d][0]); for (k = 1; k < num_topics; k++) { fprintf(fileptr, " %5.10f", gamma[d][k]); } fprintf(fileptr, " "); } fclose(fileptr); } /* * run_em * */ long double run_em(char* start, char* directory, corpus* corpus) { clock_t start_EM, finish_EM; double * theta; FILE * thetaFile; int d, n; lda_model *model = NULL; double **var_gamma, **phi; FILE* likelihood_file; int max_length; char filename[500]; char filename1[500]; int i; double likelihood, likelihood_old, converged; lda_suffstats* ss; FILE* w_asgn_file; FILE* result; // allocate variational parameters //为变分參数gamma分配空间,维度:文档数*主题数 var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); for (d = 0; d < corpus->num_docs; d++) var_gamma[d] = malloc(sizeof(double) * NTOPICS); //为变分參数phi分配空间。维度:文档集中文档的最大单词数(去重) * 主题数 max_length = max_corpus_length(corpus); phi = malloc(sizeof(double*)*max_length); for (n = 0; n < max_length; n++) phi[n] = malloc(sizeof(double) * NTOPICS); // initialize model ss = NULL; if (strcmp(start, "seeded")==0) { model = new_lda_model(corpus->num_terms, NTOPICS); ss = new_lda_suffstats(model); corpus_initialize_ss(ss, model, corpus); //初始化tw分布 lda_mle(model, ss, 0); //compute MLE lda model from sufficient statistics model->alpha = INITIAL_ALPHA; } else if (strcmp(start, "random")==0) { model = new_lda_model(corpus->num_terms, NTOPICS); ss = new_lda_suffstats(model); random_initialize_ss(ss, model); lda_mle(model, ss, 0); model->alpha = INITIAL_ALPHA; } else { model = load_lda_model(start); ss = new_lda_suffstats(model); } sprintf(filename,"%s/000",directory); save_lda_model(model, filename); // run expectation maximization i = 0; likelihood_old = 0; converged = 1; sprintf(filename, "%s/likelihood.dat", directory); likelihood_file = fopen(filename, "w"); start_EM = clock(); //em迭代继续运行条件:下面1和2同一时候满足 //1, //converaged<0 也就是新值比旧值好 //或converaged>EM_CONVERGED 新值和旧值还不够近似 //或迭代步骤运行太少(<=2) //2, //当前迭代step数在规定的最大迭代步数以内 //或者没有指定最大迭代步数(-1) while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && ((i <= EM_MAX_ITER) || (EM_MAX_ITER == -1)) ) { i++; printf("**** em iteration %d **** ", i); likelihood = 0; zero_initialize_ss(ss, model); //把统计量的值都赋为0 // e-step //固定alpha和beta。对每一篇文档找到优化的gamma和phi,更新充分统计量,计算似然 for (d = 0; d < corpus->num_docs; d++) { if ((d % 10) == 0) printf("document %d in %d EM iteration ",d, i); likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); } // m-step //依据当前的充分统计量。更新模型參数alpha,beta lda_mle(model, ss, ESTIMATE_ALPHA); // check for convergence converged = (likelihood_old - likelihood) / (likelihood_old); if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; likelihood_old = likelihood; // output model and likelihood fprintf(likelihood_file, "%10.10f %5.5e ", likelihood, converged); fflush(likelihood_file); if ((i % LAG) == 0) { sprintf(filename,"%s/%03d",directory, i); save_lda_model(model, filename); sprintf(filename,"%s/%03d.gamma",directory, i); save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); } } //EM迭代结束 finish_EM = clock(); printf("nTopic = %d, nTerm = %d estimation time: ", model->num_topics, model->num_terms); printf(" EM iteration takes %f seconds(this is %d miniutes) ", (double)(finish_EM-start_EM)/CLOCKS_PER_SEC, (finish_EM-start_EM)/CLOCKS_PER_SEC/60); // output the final model sprintf(filename,"%s/final",directory); save_lda_model(model, filename); //此函数中保存了beta到final.beta文件 , 还有.other文件 sprintf(filename,"%s/final.gamma",directory); save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); // output theta theta = (double*)malloc(sizeof(double)*model->num_topics); sprintf(filename1, "%s/final.theta", directory); thetaFile = fopen(filename1, "w"); // output the word assignments (for visualization) sprintf(filename1, "%s/result-doc-assgn.dat", directory); result = fopen(filename1, "w"); for (d = 0; d < corpus->num_docs; d++) { sprintf(filename, "%s/result_%d_phi.dat", directory,d); //调试这一部分有越界的错误,完成,filename数组空间太小。 w_asgn_file = fopen(filename, "w"); printf("final e step document %d ",d); likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi); write_word_assignment(result, w_asgn_file, &(corpus->docs[d]), phi, model); computeTheta( thetaFile, &(corpus->docs[d]), phi, model, theta); fclose(w_asgn_file); } fclose(result); fclose(thetaFile); fclose(likelihood_file); //释放空间 free(theta); for (d = 0; d < corpus->num_docs; d++) free(var_gamma[d]); free(var_gamma); for (n = 0; n < max_length; n++) free(phi[n]); free(phi); free_lda_ss( ss, model); free_lda_model(model); return (long double)(finish_EM-start_EM)/CLOCKS_PER_SEC/60; } void computeTheta( FILE* thetaFile, document* doc, double** phi, lda_model* model, double * theta) { int n; for (n=0; n< model->num_topics; n++) theta[n] = 0; for (n = 0; n<doc->length; n++) { int topicIndex = argmax(phi[n], model->num_topics); theta[ topicIndex ] = theta[ topicIndex ] + doc->counts[ n ]; } for (n=0; n<model->num_topics; n++) { theta[n] = theta[n]/doc->total; fprintf(thetaFile, "%f ", theta[n]); } fprintf(thetaFile, " "); fflush(thetaFile); } /* * read settings. * */ void read_settings(char* filename) { FILE* fileptr; char alpha_action[100]; fileptr = fopen(filename, "r"); fscanf(fileptr, "var max iter %d ", &VAR_MAX_ITER); fscanf(fileptr, "var convergence %f ", &VAR_CONVERGED); fscanf(fileptr, "em max iter %d ", &EM_MAX_ITER); fscanf(fileptr, "em convergence %f ", &EM_CONVERGED); fscanf(fileptr, "alpha %s", alpha_action); if (strcmp(alpha_action, "fixed")==0) { ESTIMATE_ALPHA = 0; } else { ESTIMATE_ALPHA = 1; } fclose(fileptr); } /* * inference only * */ void infer(char* model_root, char* save, corpus* corpus) { FILE* fileptr; FILE* result; FILE* w_asgn_file; char filename[100]; char filename1[200]; int i, d, n; lda_model *model; double **var_gamma, likelihood, **phi; document* doc; /*double ***corpusPhi; corpusPhi = (double***)malloc(sizeof(double**)*(corpus->num_docs)); for (i=0;i<corpus.num_docs;j++) { corpusPhi[i] = (double**)malloc(sizeof(double*)*) }*/ sprintf(filename1, "%s/result-doc-assgn.dat", save); result = fopen(filename1, "w"); model = load_lda_model(model_root); var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs)); for (i = 0; i < corpus->num_docs; i++) var_gamma[i] = (double*)malloc(sizeof(double)*model->num_topics); //int max_length = max_corpus_length(corpus); sprintf(filename, "%s-lda-lhood.dat", save); fileptr = fopen(filename, "w"); for (d = 0; d < corpus->num_docs; d++) { if (((d % 100) == 0) && (d>0)) printf("document %d ",d); doc = &(corpus->docs[d]); phi = (double**) malloc(sizeof(double*) * doc->length); //phi = (double**) malloc(sizeof(double*) * max_length); for (n = 0; n < doc->length; n++) //for (n = 0; n < max_length; n++) phi[n] = (double*) malloc(sizeof(double) * model->num_topics); likelihood = lda_inference(doc, model, var_gamma[d], phi); fprintf(fileptr, "%5.5f ", likelihood); //输出每个文档的phi到文件result_%d_phi.dat中 另外每个word相应的概率最大的topic保存在文件result-doc-assgn.dat中 一行相应一个文档 sprintf(filename, "%s/result_%d_phi.dat", save,d); w_asgn_file = fopen(filename, "w"); printf("final e step document %d ",d); write_word_assignment(result,w_asgn_file, &(corpus->docs[d]), phi, model); fclose(w_asgn_file); for (n = 0; n < doc->length; n++) free(phi[n]); free(phi); } fclose(fileptr); sprintf(filename, "%s-gamma.dat", save); save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); fclose(result); for (d = 0; d < corpus->num_docs; d++) free(var_gamma[d]); free(var_gamma); free_lda_model(model); }
3.4 lda-inference.c
当中包括变分參数求解相关的函数
#include "lda-inference.h" /* * variational inference * */ int lisnan(double x) { return x != x; } double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) { double converged = 1; double phisum = 0, likelihood = 0; double likelihood_old = 0, *oldphi=(double *)malloc(sizeof(double)*(model->num_topics)); int k, n, var_iter; double *digamma_gam=(double *)malloc(sizeof(double)*(model->num_topics)); // compute posterior dirichlet for (k = 0; k < model->num_topics; k++) { //初始化变分參数gamma=alpha + 当前文档中单词个数(不去重) N / 主题个数 k var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics)); //log gamma函数的一阶导数 digamma_gam[k] = digamma(var_gamma[k]); //初始化变分參数phi=1/k for (n = 0; n < doc->length; n++) phi[n][k] = 1.0/model->num_topics; } var_iter = 0; //開始迭代 while ((converged > VAR_CONVERGED) && ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1))) { var_iter++; for (n = 0; n < doc->length; n++) { phisum = 0; for (k = 0; k < model->num_topics; k++) { oldphi[k] = phi[n][k]; //更新变分參数 phi //就是论文中变分判断算法的式子 phi(n,i) = b(i,wn) * exp(digamma(gamma(i))) //这里由于有exp所以在log空间计算,算得的phi也是log space的 phi[n][k] = digamma_gam[k] + model->log_prob_w[k][doc->words[n]]; if (k > 0) phisum = log_sum(phisum, phi[n][k]);//在log space对phi求和 else phisum = phi[n][k]; // note, phi is in log space } for (k = 0; k < model->num_topics; k++) { //归一化,使phi(n)和为1 phi[n][k] = exp(phi[n][k] - phisum); //更新变分參数 gamma var_gamma[k] = var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]); // !!! a lot of extra digamma's here because of how we're computing it // !!! but its more automatically updated too. digamma_gam[k] = digamma(var_gamma[k]); } } likelihood = compute_likelihood(doc, model, phi, var_gamma); assert(!isnan(likelihood)); converged = (likelihood_old - likelihood) / likelihood_old; likelihood_old = likelihood; // printf("[LDA INF] %8.5f %1.3e ", likelihood, converged); }//迭代结束 return(likelihood); } /* * compute likelihood bound * */ //依照论文附录(15)式计算L(gamma,phi;alpha,beta) double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) { double likelihood = 0, digsum = 0, var_gamma_sum = 0, *dig=(double *)malloc(sizeof(double)*(model->num_topics)); int k, n; for (k = 0; k < model->num_topics; k++) { dig[k] = digamma(var_gamma[k]); var_gamma_sum += var_gamma[k]; } digsum = digamma(var_gamma_sum); //论文(14)式中的Eq,第1个和第4个是合在一起再拆分算的,第2,3,5个是合在一起算的 //Eq[logp(theta|alpha)]中的前两个部分 和 Eq[logq(theta)]中第一部分 likelihood = log_gamma(model->alpha * model -> num_topics) - model -> num_topics * log_gamma(model->alpha) - (log_gamma(var_gamma_sum)); for (k = 0; k < model->num_topics; k++) { //Eq[logp(theta|alpha)]中的第三个部分 和 Eq[logq(theta)]中剩余的 likelihood += (model->alpha - 1)*(dig[k] - digsum) + log_gamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum); //Eq[logp(z|theta)] + Eq[logp(w|z,beta)] - Eq[logq(z)] for (n = 0; n < doc->length; n++) { if (phi[n][k] > 0) { likelihood += doc->counts[n]* (phi[n][k]*((dig[k] - digsum) - log(phi[n][k]) + model->log_prob_w[k][doc->words[n]])); } } } return(likelihood); }
3.5 lda-data.c
数据集读入
#include "lda-data.h" corpus* read_data(char* data_filename) { FILE *fileptr; int length, count, word, n, nd, nw; corpus* c; printf("reading data from %s ", data_filename); c = malloc(sizeof(corpus)); c->docs = 0; c->num_terms = 0; c->num_docs = 0; fileptr = fopen(data_filename, "r"); nd = 0; nw = 0; while ((fscanf_s(fileptr, "%10d", &length) != EOF)) //读入每行数据的第一个数字,是文档的字典大小(文档中单词去重的个数) { //对于第nd个文档 c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); //(数据类型*)realloc(要改变内存大小的指针名,新的大小) 新的大小一定要大于原来的大小。不然的话会导致数据丢失!c->docs[nd].length = length; //文档中出现过的单词的个数。也就是文档字典大小,是去重的 c->docs[nd].total = 0; //文档中总单词个数。不去重,是对counts的求和。 c->docs[nd].words = malloc(sizeof(int)*length); //文档中的word在文档集字典中的ID c->docs[nd].counts = malloc(sizeof(int)*length); //文档中word出现次数 for (n = 0; n < length; n++)//读入每行数据剩下的数据。词频统计 { fscanf_s(fileptr, "%10d:%10d", &word, &count); //读入每一个 [wordID:word出现次数] word = word - OFFSET; c->docs[nd].words[n] = word; c->docs[nd].counts[n] = count; c->docs[nd].total += count; if (word >= nw) { nw = word + 1; } //nw记录文档集最大的那个word ID。也就是文档集字典中的单词个数 //if (word >= nw) { nw = word; } } nd++; } fclose(fileptr); c->num_docs = nd; c->num_terms = nw; printf("number of docs : %d ", nd); printf("number of terms : %d ", nw); return(c); } int max_corpus_length(corpus* c)//输出数据集中单词数(去重后)最多的文档的单词数。这个length是去重后的长度 { int n, max = 0; for (n = 0; n < c->num_docs; n++) if (c->docs[n].length > max) max = c->docs[n].length; return(max); }
3.6 lda-alpha.c
牛顿法计算模型參数alpha
#include "lda-alpha.h" #include "lda-inference.h" /* * objective function and its derivatives * */ double alhood(double a, double ss, int D, int K) { return(D * (log_gamma(K * a) - K * log_gamma(a)) + (a - 1) * ss); } double d_alhood(double a, double ss, int D, int K) { return(D * (K * digamma(K * a) - K * digamma(a)) + ss); } double d2_alhood(double a, int D, int K) { return(D * (K * K * trigamma(K * a) - K * trigamma(a))); } /* * newtons method * */ double opt_alpha(double ss, int D, int K) { double a, log_a, init_a = 100; double f, df, d2f; int iter = 0; log_a = log(init_a); do { iter++; a = exp(log_a); if (isnan(a)) { init_a = init_a * 10; printf("warning : alpha is nan; new init = %5.5f ", init_a); a = init_a; log_a = log(a); } f = alhood(a, ss, D, K); //附录A4.2中的L(a) df = d_alhood(a, ss, D, K); //L对a的一阶偏导 d2f = d2_alhood(a, D, K); //二阶偏导 log_a = log_a - df/(d2f * a + df);//迭代公式 printf("alpha maximization : %5.5f %5.5f ", f, df); } while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER)); return(exp(log_a)); }
还有cokus.c 和 utils.c 中是一些数学计算的函数。