A news classification system built on the Sogou corpus. The categories are:
classifierMap.put(0, "IT");
classifierMap.put(1, "体育");
classifierMap.put(2, "健康");
classifierMap.put(3, "军事");
classifierMap.put(4, "招聘");
classifierMap.put(5, "教育");
classifierMap.put(6, "文化");
classifierMap.put(7, "旅游");
classifierMap.put(8, "财经");
Tokenizer: either the ICTCLAS segmenter from the Chinese Academy of Sciences or IK. I chose the IK Analyzer: in my tests it was fast, used little memory, and did not lock up my machine while training. The training set is the downloadable Sogou news corpus, and the task is classifying news articles.
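For reference, here is a minimal, self-contained sketch of how the IK segmenter is driven. It uses the same IKSegmenter/Lexeme API that the classifier code below relies on; the class name and sample sentence are just illustrations:

package com.sogou.util;

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmentDemo {
    public static void main(String[] args) throws IOException {
        String sample = "新浪体育讯,北京时间昨晚结束了一场足球比赛";
        // true enables IK's "smart" mode, which merges fine-grained tokens.
        IKSegmenter ik = new IKSegmenter(new StringReader(sample), true);
        Lexeme lexeme;
        while ((lexeme = ik.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}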
Algorithm steps:
1. Download the IK Analyzer, the Sogou news training set, and the Sogou dictionary (the dictionary is POS-tagged; I kept only the nouns, as a trade-off between memory, speed, and accuracy).
2. Segment the training set, processing the news of each category separately; filter out words with frequency below 10, to save memory and speed things up; save the result as text files, one per category, named after the category (see the sketch after this list).
3. Write the Naive Bayes classification function, which classifies the input text by picking the category with the highest probability.
4. The web system uses a JSP + JavaBean + Servlet architecture and is hosted on Sina App Engine at http://naivebayes.sinaapp.com; if the site is unreachable, the server is probably down.
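The post does not include the preprocessing code for step 2, so the following is only a minimal sketch of what it could look like; the class name TrainSetBuilder and the one-raw-file-per-category layout are assumptions. It segments a category's corpus with IK, counts word frequencies, drops words occurring fewer than 10 times, and writes "word count" lines plus the special wordsCount total that loadTrainSet (shown further below) expects:

package com.sogou.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class TrainSetBuilder {

    /** Segments one category's raw corpus and writes its frequency file. */
    public static void buildCategoryFile(File rawCorpus, File outFile) throws IOException {
        Map<String, Integer> freq = new HashMap<String, Integer>();
        BufferedReader br = new BufferedReader(new FileReader(rawCorpus));
        String line;
        while ((line = br.readLine()) != null) {
            IKSegmenter ik = new IKSegmenter(new StringReader(line), true);
            Lexeme lexeme;
            while ((lexeme = ik.next()) != null) {
                String word = lexeme.getLexemeText();
                if (word.length() >= 2) { // same length filter as the classifier
                    Integer c = freq.get(word);
                    freq.put(word, c == null ? 1 : c + 1);
                }
            }
        }
        br.close();

        // Drop rare words (frequency < 10) and total up the rest.
        int total = 0;
        PrintWriter pw = new PrintWriter(new FileWriter(outFile));
        for (Map.Entry<String, Integer> e : freq.entrySet()) {
            if (e.getValue() >= 10) {
                pw.println(e.getKey() + " " + e.getValue());
                total += e.getValue();
            }
        }
        // Special entry read back by BayesUtil.loadTrainSet.
        pw.println("wordsCount " + total);
        pw.close();
    }
}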
Usage: enter some text and click the "classify news" button.
Main program code:
package com.sogou.servlet;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.sogou.util.BayesUtil;

/**
 * Servlet implementation class BayesServlet.
 */
@WebServlet("/bayes.do")
public class BayesServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    public BayesServlet() {
        super();
    }

    protected void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        this.doPost(request, response);
    }

    @SuppressWarnings("unchecked")
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // Re-decode the form parameter: the container parses the POST body as
        // ISO8859-1 by default, so the bytes must be re-read as UTF-8.
        String newsText = request.getParameter("newsText");
        newsText = new String(newsText.getBytes("ISO8859-1"), "utf-8");

        // The training sets and category map are cached in the ServletContext
        // so that the frequency files are loaded only once per application.
        ServletContext st = this.getServletContext();
        List<Map<String, Integer>> trainSets =
                (List<Map<String, Integer>>) st.getAttribute("trainSets");
        Map<Integer, String> classifierMap =
                (Map<Integer, String>) st.getAttribute("classifierMap");
        if (classifierMap == null) {
            classifierMap = new HashMap<Integer, String>();
            classifierMap.put(0, "IT");
            classifierMap.put(1, "体育");
            classifierMap.put(2, "健康");
            classifierMap.put(3, "军事");
            classifierMap.put(4, "招聘");
            classifierMap.put(5, "教育");
            classifierMap.put(6, "文化");
            classifierMap.put(7, "旅游");
            classifierMap.put(8, "财经");
            st.setAttribute("classifierMap", classifierMap);
        }

        BayesUtil bayes = new BayesUtil();
        if (trainSets == null) {
            String dirName = "D:/dataMing/bys";
            trainSets = bayes.loadTrainSet(dirName);
            st.setAttribute("trainSets", trainSets);
        }

        String classifier = bayes.bayesClassifierText(trainSets, newsText, classifierMap);
        System.out.println(classifier); // debug log of the predicted category
        request.setAttribute("classifier", classifier);
        RequestDispatcher rd = request.getRequestDispatcher("./index.jsp");
        rd.forward(request, response);
    }
}
package com.sogou.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class BayesUtil {

    /**
     * Loads the per-category word-frequency files from the training
     * directory. Each file holds one "word count" pair per line, plus a
     * special "wordsCount" entry recording the class's total word count.
     *
     * @param dirName directory containing one frequency file per category
     */
    public List<Map<String, Integer>> loadTrainSet(String dirName) {
        File directory = new File(dirName);
        File[] files = directory.listFiles();
        List<Map<String, Integer>> list = new ArrayList<>(files.length);
        for (int i = 0; i < files.length; i++) {
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(files[i]));
                Map<String, Integer> hashMap = new HashMap<String, Integer>();
                String line = null;
                while ((line = br.readLine()) != null) {
                    String[] values = line.split(" ");
                    hashMap.put(values[0], Integer.parseInt(values[1]));
                }
                list.add(hashMap);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return list;
    }

    /**
     * Classifies the given text: segments it with IK, accumulates the
     * log-probability of the tokens under each class, and returns the name
     * of the highest-scoring class.
     */
    public String bayesClassifierText(List<Map<String, Integer>> trainSets,
            String content, Map<Integer, String> textClassifier) {
        // Segment the input with IK (smart mode), keeping tokens of length >= 2.
        IKSegmenter ik = new IKSegmenter(new StringReader(content), true);
        Lexeme value = null;
        List<String> list = new LinkedList<String>();
        String text = null;
        try {
            while ((value = ik.next()) != null) {
                text = value.getLexemeText();
                if (text.length() >= 2) {
                    list.add(text);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        int length = trainSets.size();
        // Log probabilities are fractional and negative, so the scores must be
        // doubles; a long[] (as originally written) truncates every increment.
        double[] maxCfVal = new double[length];
        int[] wordsCount = new int[length];
        boolean flag = false;
        for (String tt : list) {
            for (int i = 0; i < length; i++) {
                if (!flag) {
                    // Cache each class's total word count on the first pass.
                    wordsCount[i] = trainSets.get(i).get("wordsCount");
                }
                Integer iv = trainSets.get(i).get(tt);
                if (iv != null) {
                    maxCfVal[i] += Math.log((double) iv / wordsCount[i]);
                } else {
                    // Crude smoothing for unseen words: pretend a count of 1.
                    maxCfVal[i] += Math.log(1.0 / wordsCount[i]);
                }
            }
            flag = true;
        }

        // Pick the class with the highest accumulated log probability.
        double maxValue = maxCfVal[0];
        int index = 0;
        for (int i = 1; i < length; i++) {
            if (maxCfVal[i] > maxValue) {
                index = i;
                maxValue = maxCfVal[i];
            }
        }
        return textClassifier.get(index);
    }

    /**
     * Classifies a text file. Not yet implemented.
     *
     * @param fileName
     */
    public void bayesClassifierFile(String fileName) {
    }
}
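For clarity, the score that bayesClassifierText accumulates for each class $c$ is the (unnormalized) Naive Bayes log likelihood of the document's tokens:

\[
\mathrm{score}(c) = \sum_{w \in d} \log P(w \mid c), \qquad
P(w \mid c) = \frac{\mathrm{count}(w, c)}{N_c},
\]

where $N_c$ is the wordsCount total of class $c$, and an unseen word is smoothed as $P(w \mid c) = 1/N_c$. The returned category is $\arg\max_c \mathrm{score}(c)$. Note that the class prior $\log P(c)$ is omitted, which amounts to assuming uniform priors over the nine categories.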