• Text clustering in Java: an implementation (part 3)


    Because carrot2 handles Chinese text quite unreliably, I drew on material found around the web; all of the resulting code is shared here.

    The idea behind the code is to count how often each character or word occurs, score the terms, and then pick out the important vocabulary by occurrence count and weight. Usable code is given below.
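    As a rough illustration of that scoring idea, here is a minimal sketch first. The class and method names are mine, and weighting frequency by term length is an assumption for illustration, not the logic of the listings that follow.

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // A minimal sketch of "count, score, keep the top terms".
    // Assumes the text has already been segmented into words.
    public class TermScoreSketch {
        public static List<String> topTerms(List<String> words, int k) {
            Map<String, Integer> freq = new HashMap<>();
            for (String w : words)
                freq.merge(w, 1, Integer::sum);
            // Score = frequency weighted by term length (an assumed heuristic:
            // longer terms are treated as more important).
            List<String> terms = new ArrayList<>(freq.keySet());
            terms.sort((a, b) -> freq.get(b) * b.length() - freq.get(a) * a.length());
            return terms.subList(0, Math.min(k, terms.size()));
        }
    }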

    ClusterBuilder.java

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;

    /**
     * Cluster builder: groups documents by shared topic tags.
     * @version created 2011-3-8 02:02:36 PM
     */
    public class ClusterBuilder {
        private static final Log LOG;
        private List<DocCluster> clusters;
        private ICTHit[] docs;
        private int maxLevels;
        private ClusteringOptions[] options;
        private boolean useTagsAsTitle;
        private String wordsExcluded;
        private static short[] bit1Table;

        static {
            LOG = LogFactory.getLog(ClusterBuilder.class.getName());
            // Precompute the number of 1-bits for every 16-bit value.
            bit1Table = new short[65536];
            for (int n = 0; n < bit1Table.length; n++) {
                String s = Integer.toBinaryString(n);
                short m = 0;
                for (int k = 0; k < s.length(); k++) {
                    if (s.charAt(k) == '1') {
                        m = (short) (m + 1);
                    }
                }
                bit1Table[n] = m;
            }
        }

        // Count the 1-bits of a 64-bit value using the 16-bit lookup table.
        // Only bits 0..61 are ever set (see TagHitMatrix.AddDocHit), so the
        // value is non-negative and the % 65536 arithmetic is safe.
        private static int getValidBitCount(long n) {
            int i3 = (int) (n % 65536L);
            n /= 65536L;
            int i2 = (int) (n % 65536L);
            n /= 65536L;
            int i1 = (int) (n % 65536L);
            n /= 65536L;
            int i0 = (int) (n % 65536L);
            return bit1Table[i0] + bit1Table[i1] + bit1Table[i2] + bit1Table[i3];
        }

        // Total number of documents marked in a hit bitmap.
        private static int getDocHitCount(long[] hits) {
            assert (hits != null);
            if (hits == null)
                return 0;
            int n0 = 0;
            for (int i = 0; i < hits.length; i++) {
                n0 += getValidBitCount(hits[i]);
            }
            return n0;
        }

        public ClusterBuilder() {
            // The static initializer has already filled bit1Table;
            // rebuilding it here is redundant but harmless.
            for (int n = 0; n < bit1Table.length; n++) {
                String s = Integer.toBinaryString(n);
                short m = 0;
                for (int k = 0; k < s.length(); k++) {
                    if (s.charAt(k) == '1') {
                        m = (short) (m + 1);
                    }
                }
                bit1Table[n] = m;
            }
        }

        /**
         * @param docsToCluster  the records to cluster
         * @param exWords        words that must not be used as topic tags,
         *                       separated by ASCII commas
         * @param maxLevels      maximum number of clustering levels
         * @param useTagsAsTitle whether to use the topic tags as the cluster title;
         *                       if false, titles are generated from document titles
         */
        public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {
            this.useTagsAsTitle = useTagsAsTitle;
            this.wordsExcluded = exWords;
            this.maxLevels = maxLevels;
            this.docs = docsToCluster;
            this.options = new ClusteringOptions[3];
            this.options[0] = new ClusteringOptions();
            this.options[0].setDocMaxTagCount(10);
            this.options[0].setMinTagRelevance(60);
            this.options[0].setMinSameDocPercent(80);
            this.options[1] = new ClusteringOptions();
            this.options[1].setDocMaxTagCount(8);
            this.options[1].setMinTagRelevance(85);
            this.options[1].setMinSameDocPercent(70);
            this.options[1].setTagMinDocCount(2);
            this.options[1].setMinSameDocs(2);
            this.options[2] = new ClusteringOptions();
            this.options[2].setDocMaxTagCount(8);
            this.options[2].setMinTagRelevance(50);
            this.options[2].setMinSameDocPercent(70);
            this.options[2].setTagMinDocCount(2);
            this.options[2].setMinSameDocs(2);
        }

        /**
         * Cluster the records in docs; the result is stored in clusters.
         */
        public void cluster() {
            this.clusters = createLevelClusters(docs, 0, options[0]);
            List<DocCluster> subs;
            if (this.maxLevels <= 1) {
                return;
            }
            for (DocCluster dc : this.clusters) {
                // "其他" = "Other", the fallback cluster.
                // The original compared strings with ==; equals() is the fix.
                if ((dc.getDocList().length < options[0].getMinDocsToCluster())
                        || ("其他".equals(dc.getTags())))
                    continue;
                subs = createLevelClusters(dc.getDocList(), 1, options[1]);
                if (subs.size() > 1)
                    dc.setSubclusters(subs);
            }
        }

        /**
         * Build the clusters for one level.
         * @param docs     document list
         * @param level    level number
         * @param levelOpt clustering options for this level
         */
        private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {
            TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());
            int i;
            int docCount = 0;
            // Scan the documents and fill the tag-to-document matrix
            // from each document's tag list.
            for (i = 0; i < docs.length; i++) {
                ICTHit d = docs[i];
                int validTagCount = 0;
                if (d.getTagList() != null) {
                    String[] tagList = d.getTagList();
                    for (int tagIdx = 0; (tagIdx < tagList.length)
                            && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {
                        String tag = tagList[tagIdx].trim();
                        // Discard empty or overly long tags, and tags on the exclusion list.
                        if ((tag.length() <= 0) || (tag.length() > 20)
                                || ((this.wordsExcluded.length() != 0)
                                    && ((this.wordsExcluded.contains(tag)) || (tag.contains(this.wordsExcluded)))))
                            continue;
                        matrix.AddDocHit(tag, i);
                        validTagCount++;
                    }
                }
            }
            int maxKwDocCount = 0;
            List<String> entryListToRemove = new ArrayList<String>();
            String kwWithMaxDocCount = "";
            LOG.debug("Valid keywords:");
            for (Map.Entry<String, long[]> entry : matrix.entrySet()) {
                // Count the documents hit by this tag;
                // tags below the threshold will be removed.
                int n = getDocHitCount(entry.getValue());
                if (n < levelOpt.getTagMinDocCount()) {
                    entryListToRemove.add(entry.getKey());
                } else {
                    LOG.debug(entry.getKey() + "(" + n + "), ");
                    docCount += n;
                }
                if (n > maxKwDocCount) {
                    maxKwDocCount = n;
                    kwWithMaxDocCount = entry.getKey();
                }
            }
            LOG.debug("");
            LOG.debug("Ignored keywords:");
            for (i = 0; i < entryListToRemove.size(); i++) {
                LOG.debug(entryListToRemove.get(i) + ", ");
                matrix.remove(entryListToRemove.get(i));
            }
            LOG.debug("");
            LOG.debug(entryListToRemove.size() + " keywords ignored; " + matrix.size() + " keywords left.");
            LOG.debug("Keyword with most documents: " + kwWithMaxDocCount + ", document count: " + maxKwDocCount + ".");
            // Cast to double to avoid the integer division in the original.
            double docCountPerTag = matrix.size() > 0 ? docCount / (double) matrix.size() : 0.0D;
            LOG.debug("Average documents per keyword: " + docCountPerTag);
            levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));
            if (levelOpt.getMinSameDocs() < 1) {
                levelOpt.setMinSameDocs(1);
            }
            // mergeClusters does all its merging in one pass and returns 0,
            // so this loop runs exactly once.
            while (mergeClusters(matrix, levelOpt) > 0) {
            }
            return createResult(matrix, docs, level, levelOpt);
        }

        private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {
            if (matrix.size() == 0)
                return 0;
            long[] docHitsMerged = null;
            long[] maxDocHitsMerged = null;
            String word1ToMerge = "";
            String word2ToMerge = "";
            int i, j;
            int sameDocs = 0;
            // Relevance buckets: scores 0..100, 101 buckets in total.
            List<List<IdPair>> rankMatrix = new ArrayList<List<IdPair>>();
            for (i = 0; i < 101; i++) {
                rankMatrix.add(new ArrayList<IdPair>());
            }
            List<Map.Entry<String, long[]>> matrix2List =
                    new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());
            // Compare every pair of tags in the tag-to-document matrix.
            for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
                Map.Entry<String, long[]> hits1 = matrix2List.get(i1);
                for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
                    Map.Entry<String, long[]> hits2 = matrix2List.get(i2);
                    // Compute the relevance of the two tags; the merged document
                    // bitmap and the shared-document count come back via re[].
                    Object[] re = getWordsRelevance(mapEntry2TagHitEntry(hits1), mapEntry2TagHitEntry(hits2),
                            docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
                    int nRank = ((Integer) re[0]).intValue();
                    docHitsMerged = (long[]) re[1];
                    sameDocs = ((Integer) re[2]).intValue();
                    // Pairs below the relevance threshold are ignored.
                    if (nRank >= opt.getMinTagRelevance()) {
                        rankMatrix.get(nRank).add(new IdPair(i1, i2));
                    }
                }
            }
            List<String> tagListToRemove = new ArrayList<String>();
            List<TagHitEntry> entryListMerged = new ArrayList<TagHitEntry>();
            // Slot 0 is a placeholder; merged entries are addressed by negative ids.
            entryListMerged.add(new TagHitEntry("", null));
            HashSet<IdPair> idPairTable = new HashSet<IdPair>();
            TagHitEntry entryToMerge1;
            while (true) {
                // Find the pair of tags with the highest relevance.
                for (i = 100; (i >= opt.getMinTagRelevance()) && (rankMatrix.get(i).size() == 0); i--) {
                }
                if (i < opt.getMinTagRelevance()) {
                    break;
                }
                IdPair ip = rankMatrix.get(i).get(0);
                // Merge the two clusters.
                rankMatrix.get(i).remove(0);
                entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(ip.Id1))
                        : entryListMerged.get(-ip.Id1);
                TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(ip.Id2))
                        : entryListMerged.get(-ip.Id2);
                word1ToMerge = entryToMerge1.key;
                word2ToMerge = entryToMerge2.key;
                assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));
                String wordsMerged = word1ToMerge + "," + word2ToMerge;
                long[] lDocs0 = entryToMerge1.value;
                long[] lDocs1 = entryToMerge2.value;
                maxDocHitsMerged = new long[matrix.hitsItemCount];
                for (i = 0; i < lDocs0.length; i++) {
                    // Union of the two document sets. The original wrote the union
                    // into lDocs0 and left maxDocHitsMerged all zero, which looks
                    // like a bug; the merged entry should carry the union.
                    maxDocHitsMerged[i] = lDocs0[i] | lDocs1[i];
                }
                if (ip.Id1 >= 0)
                    tagListToRemove.add(word1ToMerge);
                else
                    entryListMerged.set(-ip.Id1, new TagHitEntry("", null));
                if (ip.Id2 >= 0)
                    tagListToRemove.add(word2ToMerge);
                else {
                    entryListMerged.set(-ip.Id2, new TagHitEntry("", null));
                }
                entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));
                // Re-score every pending pair that involved one of the merged tags.
                int idMerged = -(entryListMerged.size() - 1);
                int id2 = 0;
                boolean canDelete;
                for (i = 0; i <= 100; i++) {
                    int listCount = rankMatrix.get(i).size();
                    if (listCount == 0) {
                        continue;
                    }
                    for (j = 0; j < listCount; j++) {
                        IdPair p = rankMatrix.get(i).get(j);
                        canDelete = false;
                        if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {
                            id2 = p.Id2;
                            canDelete = true;
                        } else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {
                            id2 = p.Id1;
                            canDelete = true;
                        }
                        if (!canDelete)
                            continue;
                        if (idMerged == id2) {
                            continue;
                        }
                        rankMatrix.get(i).remove(j);
                        j--;
                        listCount--;
                        IdPair pairMerged = new IdPair(idMerged, id2);
                        if (idPairTable.contains(pairMerged)) {
                            continue;
                        }
                        TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(id2))
                                : entryListMerged.get(-id2);
                        assert ((e2.key.length() != 0) && (!e2.key.equals(wordsMerged)));
                        Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2,
                                docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
                        int rank = ((Integer) re[0]).intValue();
                        docHitsMerged = (long[]) re[1];
                        sameDocs = ((Integer) re[2]).intValue();
                        if (rank <= opt.getMinTagRelevance())
                            continue;
                        rankMatrix.get(rank).add(pairMerged);
                        idPairTable.add(pairMerged);
                    }
                }
            }
            // Remove the tags that were merged away.
            for (String w : tagListToRemove) {
                matrix.remove(w);
            }
            // Add the merged tags (skipping the cleared placeholder entries).
            for (TagHitEntry e : entryListMerged) {
                if (e.getKey().length() > 0)
                    matrix.put(e.getKey(), e.getValue());
            }
            return 0;
        }

        private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {
            if (matrix.size() == 0)
                return 0;
            long[] docHitsMerged = null;
            long[] maxDocHitsMerged = null;
            int nMaxRank = 0;
            String word1;
            String word2;
            String word1ToMerge = "";
            String word2ToMerge = "";
            int sameDocs = 0;
            List<Map.Entry<String, long[]>> matrix2List =
                    new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());
            // Find the single most relevant pair of tags.
            for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
                TagHitEntry hits1 = mapEntry2TagHitEntry(matrix2List.get(i1));
                word1 = hits1.getKey();
                for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
                    TagHitEntry hits2 = mapEntry2TagHitEntry(matrix2List.get(i2));
                    word2 = hits2.getKey();
                    Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt,
                            matrix.hitsItemCount);
                    int nRank = ((Integer) re[0]).intValue();
                    docHitsMerged = (long[]) re[1];
                    sameDocs = ((Integer) re[2]).intValue();
                    if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance()))
                        continue;
                    nMaxRank = nRank;
                    maxDocHitsMerged = docHitsMerged;
                    word1ToMerge = word1;
                    word2ToMerge = word2;
                }
            }
            if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {
                return 0;
            }
            String wordsMerged = word1ToMerge + "," + word2ToMerge;
            if ((nMaxRank > opt.getMinTagRelevance()) && (!wordsMerged.isEmpty())) {
                matrix.remove(word1ToMerge);
                matrix.remove(word2ToMerge);
                matrix.put(wordsMerged, maxDocHitsMerged);
                LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");
                return 1;
            }
            return 0;
        }

        // Relevance of two tags based on their document bitmaps. The in-parameters
        // docHitsMerged and sameDocCount are overwritten; results come back via
        // the returned array: re[0] = rank, re[1] = merged bitmap, re[2] = shared docs.
        private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged,
                int sameDocCount, ClusteringOptions opt, int hitsItemCount) {
            Object[] re = new Object[3];
            docHitsMerged = new long[hitsItemCount];
            sameDocCount = 0;
            String tag1 = entry1.getKey();
            String tag2 = entry2.getKey();
            assert (!tag2.equals(tag1));
            long[] lDocs0 = entry1.getValue();
            long[] lDocs1 = entry2.getValue();
            int n0 = getDocHitCount(lDocs0);
            int n1 = getDocHitCount(lDocs1);
            int docCountMin = Math.min(n0, n1);
            int docCountMerged = 0;
            long sameDocBits;
            long diffDocBits;
            int diffDocCount = 0;
            for (int i = 0; i < lDocs0.length; i++) {
                docHitsMerged[i] = lDocs0[i] | lDocs1[i]; // union of the document sets
                docCountMerged += getValidBitCount(docHitsMerged[i]);
                diffDocBits = lDocs0[i] ^ lDocs1[i]; // symmetric difference
                diffDocCount += getValidBitCount(diffDocBits);
                sameDocBits = lDocs0[i] & lDocs1[i]; // intersection
                sameDocCount += getValidBitCount(sameDocBits);
            }
            boolean isSubstring = false;
            // If one tag is a substring of the other, the pair scores higher.
            if ((tag2.contains(tag1)) || (tag1.contains(tag2))) {
                isSubstring = true;
                docCountMin += opt.getTagMinDocCount();
            }
            if ((sameDocCount == 0) && (!isSubstring)) {
                re[0] = Integer.valueOf(0);
                re[1] = docHitsMerged;
                re[2] = Integer.valueOf(sameDocCount);
                return re;
            }
            if (docCountMin < opt.getTagMinDocCount()) {
                re[0] = Integer.valueOf(0);
                re[1] = docHitsMerged;
                re[2] = Integer.valueOf(sameDocCount);
                return re;
            }
            int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged);
            int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin);
            int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged);
            LOG.debug("Relevance: " + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2);
            LOG.debug(", SamePercent=" + samePercent);
            LOG.debug(", SamePercentMin=" + samePercentMin);
            LOG.debug(", DiffPercent=" + diffPercent);
            int nRank;
            if ((sameDocCount >= opt.getMinSameDocs())
                    && ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) {
                nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D);
            } else {
                nRank = 0;
            }
            if (isSubstring)
                nRank += 80;
            LOG.debug(", Rank=" + nRank);
            re[0] = Integer.valueOf(Math.min(nRank, 100));
            re[1] = docHitsMerged;
            re[2] = Integer.valueOf(sameDocCount);
            return re;
        }

        private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {
            return new TagHitEntry(e.getKey(), e.getValue());
        }

        private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level,
                ClusteringOptions opt) {
            int i, j;
            Map<String, DocValue> clsIdList = new HashMap<String, DocValue>();
            for (Map.Entry<String, long[]> de : matrix.entrySet()) {
                clsIdList.put(de.getKey(), new DocValue());
            }
            List<Integer> otherIdList = new ArrayList<Integer>();
            TagHitEntry maxTagHitEntry = new TagHitEntry();
            String tag;
            // Decide which cluster each document belongs to.
            for (i = 0; i < docs.length; i++) {
                TagHitMatrix.ClusterDocInfo di = matrix.docs[i];
                assert (docs[i] != null);
                int maxTagHit = 0;
                for (Map.Entry<String, long[]> hits : matrix.entrySet()) {
                    int tagHitCount = 0;
                    int score;
                    String clsWordListStr = "," + hits.getKey() + ",";
                    // The document goes to the cluster that contains most of its tags.
                    for (j = 0; j < di.TagCount; j++) {
                        tag = di.TagList[j];
                        score = j < 3 ? 2 : 1; // the first three tags weigh more
                        assert (tag.length() > 0);
                        if (!clsWordListStr.contains("," + tag + ","))
                            continue;
                        tagHitCount += score;
                    }
                    if (maxTagHit >= tagHitCount)
                        continue;
                    maxTagHit = tagHitCount;
                    maxTagHitEntry = mapEntry2TagHitEntry(hits);
                }
                if (maxTagHit > 0) {
                    DocValue dv = clsIdList.get(maxTagHitEntry.getKey());
                    dv.idList.add(Integer.valueOf(i));
                } else {
                    otherIdList.add(Integer.valueOf(i));
                }
            }
            // Build the cluster list, kept sorted by descending size.
            List<DocCluster> clusterList = new ArrayList<DocCluster>();
            for (Map.Entry<String, DocValue> kv : clsIdList.entrySet()) {
                DocValue dv = kv.getValue();
                if (dv.idList.size() <= 0)
                    continue;
                if (dv.idList.size() == 1) {
                    // Singleton clusters fall back to the "other" bucket.
                    otherIdList.add(dv.idList.get(0));
                } else {
                    DocCluster dc = new DocCluster();
                    dc.setDocIdList(new String[dv.idList.size()]);
                    dc.setDocList(new ICTHit[dv.idList.size()]);
                    for (i = 0; i < dv.idList.size(); i++) {
                        dc.getDocIdList()[i] = docs[dv.idList.get(i).intValue()].getDocId();
                        dc.getDocList()[i] = docs[dv.idList.get(i).intValue()];
                    }
                    dc.setLevel(level);
                    dc.setTags(kv.getKey());
                    for (i = 0; (i < clusterList.size())
                            && (dc.getDocIdList().length <= clusterList.get(i).getDocIdList().length); ) {
                        i++;
                    }
                    clusterList.add(i, dc);
                }
            }
            // Clusters beyond the maximum count are folded into the "other" bucket.
            for (i = opt.getMaxClusterCount(); i < clusterList.size(); ) {
                DocCluster c = clusterList.get(i);
                List<Integer> idList = clsIdList.get(c.getTags()).idList;
                for (Integer idx : idList) {
                    otherIdList.add(idx);
                }
                clusterList.remove(i);
            }
            int i1;
            for (i = 0; i < clusterList.size(); i++) {
                DocCluster dc1 = clusterList.get(i);
                // Deduplicate the cluster's tags.
                String[] tagList = dc1.getTags().split(",");
                String newTags = "";
                for (j = 0; j < tagList.length; j++) {
                    i1 = dc1.getTags().indexOf(tagList[j]);
                    int i2 = dc1.getTags().lastIndexOf(tagList[j]);
                    if (i1 == i2)
                        newTags = newTags + tagList[j] + ",";
                }
                if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {
                    newTags = newTags.substring(0, newTags.length() - 1);
                }
                dc1.setTags(newTags);
                dc1.setTitle("");
                if (this.useTagsAsTitle) {
                    // Build the title from the tags, up to 16 characters, skipping
                    // tags that overlap another cluster's title.
                    tagList = dc1.getTags().split(",");
                    for (j = 0; (tagList != null) && (j < tagList.length); j++) {
                        if ((dc1.getTitle() + tagList[j]).length() > 16)
                            break;
                        boolean isSubstr = false;
                        for (DocCluster c : clusterList) {
                            if ((c.getTitle().length() <= 0)
                                    || ((!c.getTitle().contains(tagList[j]))
                                        && (!tagList[j].contains(c.getTitle()))))
                                continue;
                            isSubstr = true;
                            break;
                        }
                        if (!isSubstr)
                            dc1.setTitle(dc1.getTitle() + tagList[j] + ",");
                    }
                    if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {
                        dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));
                    }
                }
                if (!dc1.getTitle().isEmpty())
                    continue;
                // Fall back to the tag list as title, truncated at 16 characters.
                dc1.setTitle(dc1.getTags());
                if (dc1.getTitle().length() <= 16)
                    continue;
                String s = dc1.getTitle().substring(0, 16);
                int li = s.lastIndexOf(',');
                if (li > 0) {
                    dc1.setTitle(s.substring(0, li));
                }
            }
            if (otherIdList.size() > 0) {
                // Collect all unclustered documents into the "other" cluster.
                DocCluster clusterOther = new DocCluster();
                clusterOther.setDocIdList(new String[otherIdList.size()]);
                clusterOther.setDocList(new ICTHit[otherIdList.size()]);
                clusterOther.setLevel(level);
                clusterOther.setTitle("其他"); // "其他" = "Other"
                clusterOther.setTags("其他");
                for (int k = 0; k < otherIdList.size(); k++) {
                    int idx = otherIdList.get(k);
                    clusterOther.getDocIdList()[k] = docs[idx].getDocId();
                    clusterOther.getDocList()[k] = docs[idx];
                }
                clusterList.add(clusterOther);
            }
            return clusterList;
        }

        public List<DocCluster> getClusters() {
            return this.clusters;
        }

        public void setClusters(List<DocCluster> clusters) {
            this.clusters = clusters;
        }

        public ICTHit[] getDocs() {
            return this.docs;
        }

        public void setDocs(ICTHit[] docs) {
            this.docs = docs;
        }

        public int getMaxLevels() {
            return this.maxLevels;
        }

        public void setMaxLevels(int maxLevels) {
            this.maxLevels = maxLevels;
        }

        public ClusteringOptions[] getOptions() {
            return this.options;
        }

        public void setOptions(ClusteringOptions[] options) {
            this.options = options;
        }

        public boolean isUseTagsAsTitle() {
            return this.useTagsAsTitle;
        }

        public void setUseTagsAsTitle(boolean useTagsAsTitle) {
            this.useTagsAsTitle = useTagsAsTitle;
        }

        public String getWordsExcluded() {
            return this.wordsExcluded;
        }

        public void setWordsExcluded(String wordsExcluded) {
            this.wordsExcluded = wordsExcluded;
        }

        private class DocValue {
            public List<Integer> idList = new ArrayList<Integer>();
            public String titleListStr = "";

            private DocValue() {
            }
        }

        /**
         * A pair of tag ids; a tag id is the tag's position in the
         * tag-to-document matrix.
         * @version created 2011-3-9 02:52:44 PM
         */
        private class IdPair {
            public int Id1;
            public int Id2;

            public IdPair(int id1, int id2) {
                assert (id1 != id2);
                if (id1 < id2) {
                    this.Id1 = id1;
                    this.Id2 = id2;
                } else {
                    this.Id1 = id2;
                    this.Id2 = id1;
                }
            }

            // A constant hash forces HashSet to fall back on equals();
            // correct, but every pair lands in the same bucket.
            public int hashCode() {
                return -1;
            }

            public boolean equals(Object o) {
                return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2);
            }
        }

        public static class TagHitEntry {
            public String key;
            public long[] value;

            public TagHitEntry() {
            }

            public TagHitEntry(String k, long[] v) {
                this.key = k;
                this.value = v;
            }

            public String getKey() {
                return this.key;
            }

            public long[] getValue() {
                return this.value;
            }
        }
    }

     ClusteringOptions.java

    /**
     * Clustering options.
     * @version created 2011-3-8 10:23:27 AM
     */
    public class ClusteringOptions {
        public static int DefMaxClusterCount = 20;
        public static int DefMaxKeywordCount = 6;
        public static int DefMinWordsRelevance = 10;
        public static int DefTagMinDocCount = 3;
        public static int DefIgnoreSameDocs = 2;
        public static int DefSameDocPercent = 50;
        public static int DefMinDocsToCluster = 8;
        private int docMaxTagCount;
        private int maxClusterCount;
        private int minDocsToCluster;
        private int minSameDocPercent;
        private int minSameDocs;
        private int minTagRelevance;
        private int tagMinDocCount;
     
        public ClusteringOptions() {
            this.maxClusterCount = DefMaxClusterCount;
            this.minTagRelevance = DefMinWordsRelevance;
            this.tagMinDocCount = DefTagMinDocCount;
            this.minSameDocs = DefIgnoreSameDocs;
            this.minSameDocPercent = DefSameDocPercent;
            this.docMaxTagCount = DefMaxKeywordCount;
            this.minDocsToCluster = DefMinDocsToCluster;
        }
     
        public int getDocMaxTagCount() {
            return this.docMaxTagCount;
        }
     
        public void setDocMaxTagCount(int docMaxTagCount) {
            this.docMaxTagCount = docMaxTagCount;
        }
     
        public int getMaxClusterCount() {
            return this.maxClusterCount;
        }
     
        public void setMaxClusterCount(int maxClusterCount) {
            this.maxClusterCount = maxClusterCount;
        }
     
        public int getMinDocsToCluster() {
            return this.minDocsToCluster;
        }
     
        public void setMinDocsToCluster(int minDocsToCluster) {
            this.minDocsToCluster = minDocsToCluster;
        }
     
        public int getMinSameDocPercent() {
            return this.minSameDocPercent;
        }
     
        public void setMinSameDocPercent(int minSameDocPercent) {
            this.minSameDocPercent = minSameDocPercent;
        }
     
        public int getMinSameDocs() {
            return this.minSameDocs;
        }
     
        public void setMinSameDocs(int minSameDocs) {
            this.minSameDocs = minSameDocs;
        }
     
        public int getMinTagRelevance() {
            return this.minTagRelevance;
        }
     
        public void setMinTagRelevance(int minTagRelevance) {
            this.minTagRelevance = minTagRelevance;
        }
     
        public int getTagMinDocCount() {
            return this.tagMinDocCount;
        }
     
        public void setTagMinDocCount(int tagMinDocCount) {
            this.tagMinDocCount = tagMinDocCount;
        }
    }
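
    The defaults above are the loose level-0 settings; the ClusterBuilder constructor tightens them for deeper levels. Configuring a stricter level by hand looks like this (values copied from the constructor shown earlier):

    ClusteringOptions level1 = new ClusteringOptions();
    level1.setDocMaxTagCount(8);      // fewer tags per document
    level1.setMinTagRelevance(85);    // require much stronger tag relevance
    level1.setMinSameDocPercent(70);
    level1.setTagMinDocCount(2);      // a tag must hit at least 2 documents
    level1.setMinSameDocs(2);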

    DocCluster.java

    import java.util.List;

    /**
     * A cluster of documents at one level of the hierarchy.
     * @version created 2011-3-8 10:23:35 AM
     */
    public class DocCluster {
        private String[] docIdList;
        private ICTHit[] docList;
        private int level;
        private List<DocCluster> subclusters;
        private String tags;
        private String title;
     
        public String[] getDocIdList() {
            return this.docIdList;
        }
     
        public void setDocIdList(String[] docIdList) {
            this.docIdList = docIdList;
        }
     
        public ICTHit[] getDocList() {
            return this.docList;
        }
     
        public void setDocList(ICTHit[] docList) {
            this.docList = docList;
        }
     
        public int getLevel() {
            return level;
        }
     
        public void setLevel(int level) {
            this.level = level;
        }
     
        public List<DocCluster> getSubclusters() {
            return this.subclusters;
        }
     
        public void setSubclusters(List<DocCluster> subclusters) {
            this.subclusters = subclusters;
        }
     
        public String getTags() {
            return this.tags;
        }
     
        public void setTags(String tags) {
            this.tags = tags;
        }
     
        public String getTitle() {
            if (title == null)
                title = "";
            return this.title;
        }
     
        public void setTitle(String title) {
            this.title = title;
        }
    }

     ICTHit.java

    import java.io.Serializable;

    public class ICTHit implements Serializable {
        /*
         * The document's keyword (tag) array.
         */
        private String[] TagList;
        private String docId;
        private String title;
     
        public String[] getTagList() {
            return TagList;
        }
     
        public void setTagList(String[] tagList) {
            TagList = tagList;
        }
     
        public String getDocId() {
            return docId;
        }
     
        public void setDocId(String docId) {
            this.docId = docId;
        }
     
        public String getTitle() {
            return title;
        }
     
        public void setTitle(String title) {
            this.title = title;
        }    
        
    }

    TagHitMatrix.java

    import java.util.LinkedHashMap;

    public class TagHitMatrix extends LinkedHashMap<String, long[]> {

        private static final long serialVersionUID = -7511464445378974433L;
        // Per-document tag info, indexed by document position.
        public ClusterDocInfo[] docs;
        // Number of longs needed to hold one bit per document
        // (62 bits are used per long).
        public int hitsItemCount;

        public TagHitMatrix(int DocCount, int MaxTagCount) {
            // Equivalent to ceil(DocCount / 62.0): each long carries 62 document bits.
            this.hitsItemCount = (int) (DocCount / 62.0D + 0.984375D);
            this.docs = new ClusterDocInfo[DocCount];

            for (int i = 0; i < this.docs.length; i++)
                this.docs[i] = new ClusterDocInfo(MaxTagCount);
        }

        // Record that the document at Position carries the tag TagStr.
        public void AddDocHit(String TagStr, int Position) {
            TagStr = TagStr.trim();

            int n = Position / 62;
            int m = Position % 62;
            long[] DocHits = get(TagStr);
            if (DocHits == null) {
                DocHits = new long[this.hitsItemCount];
                put(TagStr, DocHits);
            }
            // Was Math.round(Math.pow(2.0D, m)); a shift is exact and cheaper.
            DocHits[n] |= 1L << m;
            ClusterDocInfo di = this.docs[Position];
            di.TagList[(di.TagCount++)] = TagStr;
        }

        // The tags attached to a single document.
        class ClusterDocInfo {
            public String[] TagList;
            public int TagCount;

            public ClusterDocInfo(int MaxTagCount) {
                this.TagList = new String[MaxTagCount];
                this.TagCount = 0;
            }
        }
    }
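
    To see the encoding concretely, a small usage sketch (the tag and positions are my example values, not from the original): with 3 documents everything fits in a single long, and a tag's bitmap has one bit per document that carries it.

    TagHitMatrix m = new TagHitMatrix(3, 4); // 3 documents, up to 4 tags each
    m.AddDocHit("football", 0);
    m.AddDocHit("football", 2);
    // Bits 0 and 2 of the single long are set: 0b101 == 5
    assert m.get("football")[0] == 5L;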

     Test method:

    public void test(ICTHit[] icthits) throws IOException {
        ClusterBuilder clusterBuilder = new ClusterBuilder();
        // Set the records to cluster.
        clusterBuilder.setDocs(icthits);
        // Set the maximum number of clustering levels.
        clusterBuilder.setMaxLevels(10);
        clusterBuilder.setUseTagsAsTitle(true);
        // The query terms are usually put on the excluded-words list.
        clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");
        clusterBuilder.setOptions(new ClusteringOptions[] { new ClusteringOptions(), new ClusteringOptions() });

        // Run the clustering.
        clusterBuilder.cluster();
        FileWriter fw1 = new FileWriter("c:/today-20110509-cluster.txt", true);
        BufferedWriter bw1 = new BufferedWriter(fw1);

        // Print the result.
        if (clusterBuilder.getClusters() != null) {
            for (DocCluster docCluster : clusterBuilder.getClusters()) {
                System.out.println("tag:" + docCluster.getTags() + "("
                        + docCluster.getDocIdList().length + ")");
                bw1.write(docCluster.getTags() + "(" + docCluster.getDocIdList().length + ")\n");

                if (docCluster.getDocList() != null
                        && docCluster.getDocList().length > 0) {
                    for (ICTHit co : docCluster.getDocList()) {
                        System.out.println("     DocID: " + co.getDocId());
                        bw1.write("Title: " + co.getTitle() + ", ID: " + co.getDocId() + "\n");
                        for (int m = 0; m < co.getTagList().length; m++) {
                            // The original rewrote the title line here,
                            // apparently a copy-paste slip; write the keyword.
                            bw1.write("Key word: " + co.getTagList()[m] + "\n");
                            System.out.println("     Key Word: " + co.getTagList()[m]);
                        }
                        System.out.println("");
                    }
                    System.out.println("");
                } else {
                    bw1.write("      No documents in this category!\n");
                }
                bw1.write("-------------------------------------------------------------------------------\n");
            }
        }
        bw1.close();
        fw1.close();
    }

    The method above works; it is illustrative and has not been used in production. The core methods are all there, and you can pull them into your own project. On Chinese text the results are considerably better than carrot2's standard approach.
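    For orientation, here is a minimal driver along the lines of the test above. The documents and tags are invented for illustration, and with this few documents the default thresholds will push most of them into the fallback cluster:

    ICTHit doc1 = new ICTHit();
    doc1.setDocId("1");
    doc1.setTitle("Team A wins the league");
    doc1.setTagList(new String[] { "football", "league", "champion" });

    ICTHit doc2 = new ICTHit();
    doc2.setDocId("2");
    doc2.setTitle("League final report");
    doc2.setTagList(new String[] { "football", "league", "final" });

    ClusterBuilder builder = new ClusterBuilder(
            new ICTHit[] { doc1, doc2 }, // records to cluster
            "",   // no excluded words
            1,    // a single clustering level
            true  // use tags as cluster titles
    );
    builder.cluster();
    for (DocCluster c : builder.getClusters()) {
        System.out.println(c.getTitle() + " (" + c.getDocList().length + " docs)");
    }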

  • Original post: https://www.cnblogs.com/yanl55555/p/12541279.html