一、说明
IG 是 information gain(信息增益)的缩写,是一种非常有效的特征选择方法(尤其适用于使用 SVM 进行分类的场景)。这里不做详细介绍,有兴趣的读者可以自行搜索相关资料。
Chi-square(卡方检验)是一种常用的特征筛选方法,在“种子词扩展”那篇文章中已有详细说明,这里不再赘述。
二、weka中的使用方法
1、特征筛选代码
package com.lvxinjian.alg.models.feature;

import java.nio.charset.Charset;
import java.util.ArrayList;

import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.AttributeEvaluator;
import weka.attributeSelection.Ranker;
import weka.core.Instances;

import com.iminer.tool.common.util.FileTool;

/**
 * Feature selection built on Weka's attribute evaluators (the surrounding
 * article uses it with information gain and chi-square).
 */
public class FeatureSelectorByWeka {

    /**
     * Scores every attribute of {@code data} with {@code eval}, ranks the
     * attributes with Weka's {@link Ranker}, and writes the leading attributes
     * to {@code outputPath} as a lexicon file.
     *
     * <p>Each output line has the form {@code "<rank> <attribute name> 1"},
     * where {@code rank} is the zero-based position in the ranking. Output
     * stops at the first attribute whose score is exactly {@code 0}, or once
     * {@code maxNumberOfAttribute} lines have been collected, whichever comes
     * first.
     *
     * @param eval                 evaluator instance to use; it is built on
     *                             {@code data} by this method and must also
     *                             implement {@link AttributeEvaluator}
     * @param data                 data set to evaluate (ARFF-style instances)
     * @param maxNumberOfAttribute upper bound on the number of attributes written
     * @param outputPath           path of the lexicon file to write
     * @throws Exception if building the evaluator, ranking, or scoring fails
     */
    public void EvalueAndRank(ASEvaluation eval, Instances data, int maxNumberOfAttribute,
            String outputPath) throws Exception {
        Ranker rank = new Ranker();
        eval.buildEvaluator(data);

        // Rank once. The Ranker only orders attributes by the evaluator's
        // per-attribute score; running the search twice repeats that work
        // without changing the result.
        int[] attrIndex = rank.search(eval, data);

        AttributeEvaluator scorer = (AttributeEvaluator) eval;

        // Kept as a concrete ArrayList because the signature of
        // FileTool.SaveListToFile is not visible from this file.
        ArrayList<String> attributeWords = new ArrayList<String>();
        for (int i = 0; i < attrIndex.length && i < maxNumberOfAttribute; i++) {
            // A score of exactly 0 ends the output; the remaining attributes
            // are presumably uninformative as well, since the list is ranked.
            if (scorer.evaluateAttribute(attrIndex[i]) == 0) {
                break;
            }
            attributeWords.add(i + " " + data.attribute(attrIndex[i]).name() + " " + "1");
        }

        FileTool.SaveListToFile(attributeWords, outputPath, false, Charset.forName("utf8"));
    }
}
package com.lvxinjian.alg.models.feature;

import java.io.IOException;

import weka.attributeSelection.ASEvaluation;
import weka.attributeSelection.ChiSquaredAttributeEval;
import weka.attributeSelection.InfoGainAttributeEval;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

import com.iminer.alg.models.generatefile.ParameterUtils;

/**
 * Feature selector that ranks attributes with Weka's information-gain ("IG")
 * or chi-square ("CHI") evaluator and writes the result to a lexicon file.
 */
public class WekaFeatureSelector extends FeatureSelector {

    /** Maximum number of features to keep. */
    private int maxFeatureNum = 10000;

    /** Path of the file the selected features are written to. */
    private String outputPath = null;

    /** Name of the attribute in the data set that is used as the class attribute. */
    private String classname = "CLASS";

    /** Selection method: "IG" (the default) or "CHI". */
    private String selectMethod = "IG";

    /**
     * Reads the selector settings from a space-separated option string.
     *
     * <p>Options looked up: {@code maxFeatureNum}, {@code class},
     * {@code outputPath} (required) and {@code selectMethod}. An option whose
     * looked-up value is empty leaves the current field value in place.
     * NOTE(review): this relies on {@code ParameterUtils.getOption} returning
     * an empty string for an absent option - confirm against that helper.
     *
     * @param options option string, e.g. {@code "-maxFeatureNum 10000 -outputPath lex.txt"}
     * @return {@code true} if the options were parsed and an output path was
     *         given; {@code false} otherwise
     */
    private boolean Initialization(String options) {
        try {
            String[] tokens = options.split(" ");

            // Maximum number of features.
            String maxFeatureNumOption = ParameterUtils.getOption("maxFeatureNum", tokens);
            if (maxFeatureNumOption.length() != 0) {
                this.maxFeatureNum = Integer.parseInt(maxFeatureNumOption);
            }

            // Class attribute name.
            String classnameOption = ParameterUtils.getOption("class", tokens);
            if (classnameOption.length() != 0) {
                this.classname = classnameOption;
            } else {
                System.out.println("use default class name(\"CLASS\")");
            }

            // Output path is mandatory.
            String outputPathOption = ParameterUtils.getOption("outputPath", tokens);
            if (outputPathOption.length() != 0) {
                this.outputPath = outputPathOption;
            } else {
                System.out.println("please initialze output path.");
                return false;
            }

            // Selection method.
            String selectMethodOption = ParameterUtils.getOption("selectMethod", tokens);
            if (selectMethodOption.length() != 0) {
                this.selectMethod = selectMethodOption;
            } else {
                System.out.println("use default select method(IG)");
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Selects features from the given data set and writes them to the
     * configured output path.
     *
     * @param obj     the data set; must be a Weka {@link Instances} object
     * @param options option string, see {@link #Initialization(String)}
     * @return {@code true} if the features were ranked and written;
     *         {@code false} if the options are invalid, {@code obj} is not an
     *         {@code Instances}, the class attribute does not exist, the
     *         selection method is unknown, or the evaluation failed
     * @throws IOException declared for compatibility with the overridden
     *         method; failures inside this implementation are reported via
     *         the return value
     */
    @Override
    public boolean selectFeature(Object obj, String options) throws IOException {
        try {
            if (!Initialization(options)) {
                return false;
            }

            if (!(obj instanceof Instances)) {
                System.out.println("input data must be a weka.core.Instances object.");
                return false;
            }
            Instances data = (Instances) obj;

            // Instances.attribute(String) yields null for an unknown name;
            // check it here instead of failing inside setClass.
            if (data.attribute(this.classname) == null) {
                System.out.println("class attribute not found: " + this.classname);
                return false;
            }
            data.setClass(data.attribute(this.classname));

            ASEvaluation selector;
            if (this.selectMethod.equals("IG")) {
                selector = new InfoGainAttributeEval();
            } else if (this.selectMethod.equals("CHI")) {
                selector = new ChiSquaredAttributeEval();
            } else {
                System.out.println("unsupported select method: " + this.selectMethod);
                return false;
            }

            FeatureSelectorByWeka attributeSelector = new FeatureSelectorByWeka();
            attributeSelector.EvalueAndRank(selector, data, this.maxFeatureNum, this.outputPath);
        } catch (Exception e) {
            // Report the failure to the caller instead of claiming success.
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Example run: loads a training ARFF file and writes the selected
     * features to {@code lex.txt}.
     */
    public static void main(String[] args) throws Exception {
        // Backslashes in a Windows path must be escaped inside a Java string literal.
        String root = "C:\\Users\\Administrator\\Desktop\\12_05\\模型训练\\1219\\";
        WekaFeatureSelector selector = new WekaFeatureSelector();
        Instances data = DataSource.read(root + "train.Bigram.arff");
        String options = "-maxFeatureNum 10000 -outputPath lex.txt";
        selector.selectFeature(data, options);
    }
}
参考: