• 原创:中文分词的逆向最大匹配算法


    逆向最大匹配算法,中文分词机械化分词中最基本的算法,也是入门级别的算法。但是,在机械化分词方面的效果,表现却很好。尤其是在大文本的时候,一次取较多词语进行匹配,因为大文本匹配成词的概率远远高于小文本,所以会有很好的表现。IK分词,在中文分词领域里,只能算是皮毛,或者说是一个壳儿而已,根本不算真正的分词。中文分词里面,运用CRF进行消除歧义分词,是主流,在NLP领域,RNN是主要技术手段,截至2016年,RNN已经成功应用到NLP领域中,甚至在计算机视觉中也发挥着重要作用。目前,在open nlp社区里,有一个HanLP分词源码包,里面有极速分词和消歧分词,性能非常优异。下面的代码,来自IK分词的一部分源码包,本人进行了逆向最大匹配算法的改造,闲着没事干,算是入门级别的分词。

    package org.wltea.analyzer.core;

    import java.io.IOException;
    import java.io.Reader;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.LinkedList;
    import java.util.Map;
    import java.util.Set;

    import org.wltea.analyzer.cfg.Configuration;
    import org.wltea.analyzer.dic.Dictionary;
    /**
     * Analysis context for Chinese word segmentation (reverse maximum matching).
     *
     * <p>Holds the character buffer being segmented, per-character type info,
     * the raw lexeme set produced by the segmenters, the disambiguated lexeme
     * paths, and the final result queue handed back to the caller.
     *
     * @author TongXueQiang
     * @date 2016/01/22
     * @since 1.7
     */
    class AnalyzeContext {
    // Size of the segmentation buffer; previously the literal 4096 was
    // repeated throughout the class.
    private static final int BUFF_SIZE = 4096;

    private char[] segmentBuff;          // raw characters under analysis
    private int[] charTypes;             // per-character type codes (see CharacterUtil)
    private int buffOffset;              // offset of segmentBuff[0] within the whole input
    private int cursor;                  // current position inside segmentBuff
    private int available;               // number of valid chars in segmentBuff
    private Set<String> buffLocker;      // names of segmenters currently holding the buffer
    private QuickSortSet orgLexemes;     // raw (un-disambiguated) lexemes
    private Map<Integer, LexemePath> pathMap; // begin-position -> disambiguated path
    private LinkedList<Lexeme> results;  // final output queue
    private Configuration cfg;
    private Integer moveIndex;           // how far moveCursor() should step back

    public AnalyzeContext(Configuration cfg) {
    this.cfg = cfg;
    this.segmentBuff = new char[BUFF_SIZE];
    this.charTypes = new int[BUFF_SIZE];
    this.buffLocker = new HashSet<String>();
    this.orgLexemes = new QuickSortSet();
    this.pathMap = new HashMap<Integer, LexemePath>();
    this.results = new LinkedList<Lexeme>();
    }

    int getCursor() {
    return this.cursor;
    }

    char[] getSegmentBuff() {
    return this.segmentBuff;
    }

    char getCurrentChar() {
    return this.segmentBuff[this.cursor];
    }

    int getCurrentCharType() {
    return this.charTypes[this.cursor];
    }

    int getBufferOffset() {
    return this.buffOffset;
    }

    /**
     * Fills the segmentation buffer from the reader.
     *
     * <p>On the first call ({@code buffOffset == 0}) a whole buffer is read.
     * On refills, the unprocessed tail (from {@code cursor} to
     * {@code available}) is moved to the front and the remainder of the
     * buffer is filled from the reader.
     *
     * @param reader the input source
     * @return number of valid characters now in the buffer
     *         (NOTE(review): if the reader is at EOF, read() returns -1 and
     *         that value flows into the count, matching the original code)
     * @throws IOException propagated from the reader
     */
    int fillBuffer(Reader reader) throws IOException {
    int readCount = 0;
    if (this.buffOffset == 0) {
    readCount = reader.read(this.segmentBuff);
    } else {
    int offset = this.available - this.cursor;
    if (offset > 0) {
    // Keep the not-yet-consumed tail: slide it to the buffer start.
    System.arraycopy(this.segmentBuff, this.cursor,
    this.segmentBuff, 0, offset);
    readCount = offset;
    }

    // BUG FIX: the original passed a negative length (-offset) to
    // Reader.read(char[], int, int), which always throws
    // IndexOutOfBoundsException on the refill path. Fill the free
    // remainder of the buffer instead.
    readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
    }

    this.available = readCount;
    this.cursor = 0;
    return readCount;
    }

    /**
     * Positions the cursor at the LAST character of the buffer (reverse
     * matching starts from the end) and normalizes/types that character.
     */
    void initCursor() {
    this.cursor = this.available - 1;
    // Normalize the character (full-width -> half-width, etc.)
    this.segmentBuff[this.cursor] = CharacterUtil
    .regularize(this.segmentBuff[this.cursor]);
    // Classify the character (e.g. Arabic digit, Latin letter, CJK ...)
    this.charTypes[this.cursor] = CharacterUtil
    .identifyCharType(this.segmentBuff[this.cursor]);
    }

    /**
     * Moves the cursor BACKWARD by {@code moveIndex + 1} positions (reverse
     * matching walks from the end of the buffer toward the start).
     *
     * <p>NOTE(review): {@code moveIndex} is a boxed Integer set via
     * {@link #setMoveIndex}; if a segmenter never calls it before the first
     * move, this unboxes null and throws NPE — verify against callers.
     *
     * @return true if the cursor moved, false if the buffer start was reached
     */
    boolean moveCursor() {
    if ((this.cursor - moveIndex) > 0) {
    this.cursor -= (moveIndex + 1);
    // After moving, normalize the character now under the cursor.
    this.segmentBuff[this.cursor] = CharacterUtil
    .regularize(this.segmentBuff[this.cursor]);
    // And classify it.
    this.charTypes[this.cursor] = CharacterUtil
    .identifyCharType(this.segmentBuff[this.cursor]);
    return true;
    }
    return false;
    }

    /** Registers a segmenter as still working on the current buffer. */
    void lockBuffer(String segmenterName) {
    this.buffLocker.add(segmenterName);
    }

    /** Releases a segmenter's hold on the current buffer. */
    void unlockBuffer(String segmenterName) {
    this.buffLocker.remove(segmenterName);
    }

    /** @return true if any segmenter still holds the buffer */
    boolean isBufferLocked() {
    return (this.buffLocker.size() > 0);
    }

    /** @return true if the cursor has reached the last valid character */
    boolean isBufferConsumed() {
    return (this.cursor == this.available - 1);
    }

    /**
     * @return true when the buffer is full, nearly (within 100 chars of the
     *         end) but not fully consumed, and no segmenter holds a lock —
     *         i.e. it is safe and useful to refill
     */
    boolean needRefillBuffer() {
    return ((this.available == BUFF_SIZE) && (this.cursor < this.available - 1)
    && (this.cursor > this.available - 100) && (!(isBufferLocked())));
    }

    /** Advances the global offset past the characters consumed so far. */
    void markBufferOffset() {
    this.buffOffset += this.cursor;
    }

    /** Adds a raw lexeme produced by a segmenter. */
    void addLexeme(Lexeme lexeme) {
    this.orgLexemes.addLexeme(lexeme);
    }

    /** Stores a disambiguated lexeme path, keyed by its begin position. */
    void addLexemePath(LexemePath path) {
    if (path != null) {
    this.pathMap.put(Integer.valueOf(path.getPathBegin()), path);
    }
    }

    QuickSortSet getOrgLexemes() {
    return this.orgLexemes;
    }

    /**
     * Drains the path map into the result queue. Positions covered by a
     * lexeme path emit that path's lexemes; uncovered positions fall back to
     * single-character output via {@link #outputSingleCJK}.
     */
    void outputToResult() {
    int index = 0;
    while (index <= this.cursor) {
    LexemePath path = (LexemePath) this.pathMap.get(Integer
    .valueOf(index));

    if (path != null) {
    Lexeme l = path.pollFirst();

    if (l != null) {
    this.results.add(l);
    // Jump past the emitted lexeme; the cursor tracks the jump.
    index = l.getBegin() + l.getLength();
    this.cursor = index;
    }
    } else {
    outputSingleCJK(index);
    ++index;
    }

    }
    this.pathMap.clear();
    }

    /**
     * Emits a single character at {@code index} as its own lexeme, but only
     * for char types 4 and 8 (type codes from CharacterUtil; 64 / 8 are the
     * corresponding lexeme-type codes — see Lexeme).
     */
    private void outputSingleCJK(int index) {
    Lexeme singleCharLexeme;
    if (4 == this.charTypes[index]) {
    singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 64);
    this.results.add(singleCharLexeme);
    } else if (8 == this.charTypes[index]) {
    singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 8);
    this.results.add(singleCharLexeme);
    }
    }

    /**
     * Pops the next lexeme from the result queue, merging quantity compounds
     * and skipping stop words, then fills in its text from the buffer.
     *
     * @return the next lexeme, or null when the queue is exhausted
     */
    Lexeme getNextLexeme() {
    Lexeme result = (Lexeme) this.results.pollFirst();
    while (result != null) {
    compound(result); // merge numeral + classifier compounds
    // Drop stop words and keep polling.
    if (Dictionary.getSingleton().isStopWord(this.segmentBuff,
    result.getBegin(), result.getLength())) {
    result = (Lexeme) this.results.pollFirst();
    } else {
    // Materialize the lexeme's text from the buffer slice.
    result.setLexemeText(String.valueOf(this.segmentBuff,
    result.getBegin(), result.getLength()));
    break;
    }
    }
    return result;
    }

    /** Resets all state so the context can process a new input. */
    void reset() {
    this.buffLocker.clear();
    this.orgLexemes = new QuickSortSet();
    this.available = 0;
    this.buffOffset = 0;
    this.charTypes = new int[BUFF_SIZE];
    this.cursor = 0;
    this.results.clear();
    this.segmentBuff = new char[BUFF_SIZE];
    this.pathMap.clear();
    }

    /**
     * Merges a numeral lexeme (type 2) with a following count/classifier
     * lexeme (types 16 / 32) when smart mode is on. Type codes are the
     * Lexeme type constants used throughout this package.
     *
     * @param result the lexeme possibly absorbing its successor in-place
     */
    private void compound(Lexeme result) {
    if (!(this.cfg.useSmart())) {
    return;
    }

    if (this.results.isEmpty()) {
    return;
    }
    Lexeme nextLexeme;
    boolean appendOk;
    if (2 == result.getLexemeType()) {
    nextLexeme = (Lexeme) this.results.peekFirst();
    appendOk = false;
    if (16 == nextLexeme.getLexemeType()) {
    appendOk = result.append(nextLexeme, 16);
    } else if (32 == nextLexeme.getLexemeType()) {
    appendOk = result.append(nextLexeme, 48);
    }
    if (appendOk) {
    // Successor absorbed: remove it from the queue.
    this.results.pollFirst();
    }

    }

    if ((16 == result.getLexemeType()) && (!(this.results.isEmpty()))) {
    nextLexeme = (Lexeme) this.results.peekFirst();
    appendOk = false;
    if (32 == nextLexeme.getLexemeType()) {
    appendOk = result.append(nextLexeme, 48);
    }
    if (!(appendOk)) {
    return;
    }
    this.results.pollFirst();
    }
    }

    public void setMoveIndex(Integer moveIndex) {
    this.moveIndex = moveIndex;

    }

    }

    以下是CJK逆向最大匹配算法:

    package org.wltea.analyzer.core;

    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    /**
     * CJK (Chinese/Japanese/Korean) segmenter using the reverse maximum
     * matching algorithm: try the longest window ending at the cursor first,
     * then shrink one character at a time until the dictionary matches or a
     * single character remains.
     *
     * @author TongXueQiang
     * @date 2016/01/20
     * @since 1.7
     */
    class CJKSegmenter implements ISegmenter {
    static final String SEGMENTER_NAME = "CJK_SEGMENTER";
    // NOTE(review): mutable static state — the match window is shared by all
    // instances and threads; concurrent use of two segmenters will corrupt
    // each other's window. Confirm single-threaded use before relying on it.
    static Integer MATCH_LEN = 7;        // current match-window length (max 7)
    static Integer moveIndex = MATCH_LEN - 1; // chars between window start and cursor

    CJKSegmenter() {

    }

    /*
     * Reverse maximum matching.
     *
     * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.
     * AnalyzeContext)
     */
    public void analyze(AnalyzeContext context) {
    if (context.getCursor() < moveIndex) {
    // Near the buffer start: shrink the window so it fits.
    moveIndex = context.getCursor();
    MATCH_LEN = context.getCursor() + 1;
    }
    Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(
    context.getSegmentBuff(), context.getCursor() - moveIndex,
    MATCH_LEN);
    if (singleCharHit.isMatch() || MATCH_LEN == 1) {
    // Dictionary hit, or window shrunk to one char: emit the lexeme
    // (lexeme type 4) and restore the window to full width.
    Lexeme newLexeme = new Lexeme(context.getBufferOffset(),
    context.getCursor() - moveIndex, MATCH_LEN, 4);
    context.addLexeme(newLexeme);
    context.setMoveIndex(moveIndex);
    init();
    } else {
    // BUG FIX: the original guard
    //   (!singleCharHit.isUnmatch() || singleCharHit.isUnmatch())
    // is a tautology (!A || A) and therefore always true, so the
    // recursion was unconditional. Made that explicit: shrink the
    // window by one character and retry. Terminates because
    // MATCH_LEN reaches 1, which the branch above handles.
    --moveIndex;
    --MATCH_LEN;
    analyze(context);
    }

    }

    /** Restores the match window to its maximum width (7 chars). */
    private void init() {
    moveIndex = 6;
    MATCH_LEN = 7;
    }

    @Override
    public void reset() {
    // Intentionally empty in the original; the static window fields are
    // restored by init() after each emitted lexeme instead.
    }
    }

    革命尚未成功,同志仍需努力!专注机器学习理论的研究,寻求理论的突破,然后转化成代码,苦练最底层的基本功,持之以恒,兼顾理论和编程,成为不可或缺的人才,定能成为一流的高手!

  • 相关阅读:
    guava学习--集合2&Range
    guava学习--集合1
    guava学习--FluentIterable
    guava学习--Supplier Suppliers
    guava--Joiner、Splitter、MapJoinner、MapSplitter
    python_输入一个数,判断是否是素数
    python_33_文件操作2
    python_32_文件操作1
    python_31_集合
    python_输出100:200内的素数
  • 原文地址:https://www.cnblogs.com/txq157/p/5241838.html
Copyright © 2020-2023  润新知