• 基于词典的正向最大匹配中文分词算法,能实现中英文数字混合分词


    基于词典的正向最大匹配中文分词算法,能实现中英文数字混合分词。比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤

    第一次写中文分词程序,欢迎拍砖。

    publicclass MM2
    {
    privatestaticfinal Log log = LogFactory.getLog(MM2.class);

    privatestatic HashMap<String, Integer> dictionary =null;
    privatestaticfinalint WORD_MAX_LENGTH =9;
    private Reader reader;

    static
    {
    loadDictionary();
    }


    public MM2(Reader reader)
    {
    this.reader = reader;
    }


    //切分出由中文、字母、数字组成的句子
    public ArrayList<Sentence> getSentence() throws IOException
    {
    ArrayList
    <Sentence> list=new ArrayList<Sentence>();
    StringBuffer cb
    =new StringBuffer();
    int d=reader.read();
    int offset=0;
    boolean b=false;
    while(d>-1)
    {
    int type=Character.getType(d);
    if(type==2|| type==9|| type==5)
    {
    d
    =toAscii(d);
    cb.append((
    char)d);
    }

    else
    {
    b
    =true;
    }

    d
    =reader.read();
    if(d==-1|| b)
    {
    if(d==-1) offset++;
    b
    =false;
    char[] ioBuffer =newchar[cb.length()];
    cb.getChars(
    0, cb.length(), ioBuffer, 0);
    Sentence sen
    =new Sentence(ioBuffer,offset-cb.length());
    list.add(sen);
    cb.setLength(
    0);
    }

    offset
    ++;
    }

    return list;
    }


    //将句子切分出词
    public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
    ArrayList
    <Token> tokenlist=new ArrayList<Token>();
    for(Sentence sen:list)
    {
    StringBuffer word
    =new StringBuffer();
    int offset=sen.getStartOffset();
    int bufferIndex =0;
    char c;
    boolean b=false;
    while(bufferIndex<sen.getText().length)
    {
    offset
    ++;
    c
    =sen.getText()[bufferIndex++];
    if(word.length()==0)
    word.append(c);
    else
    {
    String temp
    = (word.toString() + c).intern();
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
    word.append(c);
    elseif(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
    word.append(c);
    else
    {
    bufferIndex
    --;
    offset
    --;
    while(word.length()>1&& dictionary.get(word.toString())!=null&& dictionary.get(word.toString())==2)
    {
    word.deleteCharAt(word.length()
    -1);
    bufferIndex
    --;
    offset
    --;
    }

    b
    =true;
    }

    }

    if(b || bufferIndex==sen.getText().length)
    {
    Token token
    =new Token(word.toString(),offset-word.length(),offset,"word");
    word.setLength(
    0);
    tokenlist.add(token);
    b
    =false;
    }

    }

    }

    return tokenlist;
    }


    //将相连的单个英文或数字组合成词
    public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
    ArrayList
    <Token> tokenlist=new ArrayList<Token>();
    Token word
    =null;
    for(int i=0;i<list.size();i++)
    {
    Token t
    =list.get(i);
    if(t.getWord().length()==1&& Character.getType((int)t.getWord().charAt(0))!=5)
    {
    if(word==null)
    word
    =t;
    elseif(word.getEnd()==t.getStart())
    {
    word.setEnd(t.getEnd());
    word.setWord(word.getWord()
    +t.getWord());
    }

    else
    {
    tokenlist.add(word);
    word
    =t;
    }

    }

    elseif(word!=null)
    {
    tokenlist.add(word);
    word
    =null;
    tokenlist.add(t);
    }

    else
    tokenlist.add(t);
    }

    if(word!=null)
    tokenlist.add(word);
    return tokenlist;
    }


    //双角转单角
    publicstaticint toAscii(int codePoint)
    {
    if((codePoint>=65296&& codePoint<=65305) //0-9
    || (codePoint>=65313&& codePoint<=65338) //A-Z
    || (codePoint>=65345&& codePoint<=65370) //a-z
    )
    {
    codePoint
    -=65248;
    }

    return codePoint;
    }


    //加载词典
    publicstaticvoid loadDictionary()
    {
    if (dictionary ==null)
    {
    dictionary
    =new HashMap<String, Integer>();
    InputStream is
    =null;
    BufferedReader br
    =null;
    try
    {
    is
    =new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
    br
    =new BufferedReader(new InputStreamReader(is, "UTF-8"));
    String word
    =null;
    while ((word = br.readLine()) !=null)
    {
    word
    =word.toLowerCase();
    if ((word.indexOf("#") ==-1) && (word.length() <= WORD_MAX_LENGTH))
    {
    dictionary.put(word.intern(),
    1);
    int i = word.length()-1;
    while(i >=2)
    {
    String temp
    = word.substring(0, i).intern();
    if (!dictionary.containsKey(temp))
    dictionary.put(temp,
    2);
    i
    --;
    }

    }

    }

    }

    catch (Exception e)
    {
    log.info(e);
    }

    finally
    {
    try
    {
    if(br!=null)
    br.close();
    if(is!=null)
    is.close();
    }

    catch (IOException e)
    {
    log.info(e);
    }

    }

    }

    }


    publicstatic String[] segWords(Reader input)
    {
    ArrayList
    <String> list=new ArrayList<String>();
    try
    {
    MM2 f
    =new MM2(input);
    ArrayList
    <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
    for(Token t:tlist)
    {
    list.add(t.getWord());
    }

    }

    catch(IOException e)
    {
    log.info(e);
    }

    return (String[])list.toArray(new String[0]);
    }


    publicstaticvoid main(String[] args)
    {
    String[] cc
    =MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase()));
    for(String c:cc)
    {
    System.out.println(c);
    }

    }

    }
  • 相关阅读:
    Lambda
    Guava
    创建数据库时报错 'str' object has no attribute 'decode'
    服务器并发测试(jmeter)
    Mosquitto 创建用户自动输入密码
    menuconfig 语法与用法
    Django操作mongodb
    mqtt mosquitto 安装与使用
    python 使用mongodb数据库
    djangorestframework token 认证
  • 原文地址:https://www.cnblogs.com/ibook360/p/2245868.html
Copyright © 2020-2023  润新知