• Solr 入门之 pinyin4j 源代码改写:动态加入扩展词并整合进 war 项目中


    1. 初始化时加载用户自定义的字典
    package net.sourceforge.pinyin4j;
    
    import net.sourceforge.pinyin4j.multipinyin.Trie;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.IOException;
    
    import com.gome.mx.plus.pinyin.ext.PYWriterUtils;
    
    /**
     * Manages the external pinyin resources required by the PinyinHelper class.
     *
     * <p>The singleton instance builds one {@link Trie} on construction. It is filled from
     * the resources {@code /pinyindb/unicode_to_hanyu_pinyin.txt} and
     * {@code /pinyindb/multi_pinyin.txt}, from {@code loadMultiPinyinExtend()}, and finally -
     * when {@code PYWriterUtils.getPath()} is non-null at that moment - from the user-defined
     * dictionary file at that path.
     *
     * @author Li Min (xmlerlimin@gmail.com)
     */
    public class ChineseToPinyinResource {
        /**
         * Trie holding the Unicode (upper-case hex code point) to Hanyu Pinyin records.
         */
        private Trie unicodeToHanyuPinyinTable = null;

        /**
         * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
         */
        private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
            this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;
        }

        /**
         * @return Returns the unicodeToHanyuPinyinTable.
         */
        public Trie getUnicodeToHanyuPinyinTable() {
            return unicodeToHanyuPinyinTable;
        }

        /**
         * Private constructor as part of the singleton pattern.
         */
        private ChineseToPinyinResource() {
            initializeResource();
        }

        /**
         * Builds the trie and loads every pinyin source into it.
         *
         * <p>I/O failures are printed and otherwise ignored, so the instance is still created
         * with whatever was loaded before the failure.
         */
        private void initializeResource() {
            try {
                final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
                final String resourceMultiName = "/pinyindb/multi_pinyin.txt";
                // Read once, at construction time: a path configured later is only picked up
                // when the caller explicitly loads the file into the trie again.
                final String userResourceName = PYWriterUtils.getPath();

                setUnicodeToHanyuPinyinTable(new Trie());
                getUnicodeToHanyuPinyinTable().load(ResourceHelper.getResourceInputStream(resourceName));

                getUnicodeToHanyuPinyinTable().loadMultiPinyin(ResourceHelper.getResourceInputStream(resourceMultiName));

                getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();

                // Load the user-defined dictionary last.
                if (userResourceName != null) {
                    loadUserDictionary(userResourceName);
                }

            } catch (FileNotFoundException ex) {
                ex.printStackTrace();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }

        /**
         * Loads the user-defined dictionary file into the trie and always releases the file
         * handle afterwards.
         *
         * @param userResourceName path of the user dictionary file
         * @throws IOException if the file cannot be opened or read
         */
        private void loadUserDictionary(String userResourceName) throws IOException {
            FileInputStream is = new FileInputStream(new File(userResourceName));
            try {
                getUnicodeToHanyuPinyinTable().load(is);
            } finally {
                // Closing an already closed FileInputStream is a no-op, so this is safe
                // whether or not load() closes the stream itself.
                is.close();
            }
        }

        /**
         * Looks up the trie node stored under the upper-case hexadecimal code of the character.
         *
         * @param ch character to look up
         * @return whatever the table returns for that key
         */
        Trie getHanyuPinyinTrie(char ch) {

            String codepointHexStr = Integer.toHexString((int) ch).toUpperCase();

            // fetch from the trie
            return getUnicodeToHanyuPinyinTable().get(codepointHexStr);
        }

        /**
         * Get the unformatted Hanyu Pinyin representations of the given Chinese
         * character in array format.
         *
         * @param ch given Chinese character in Unicode
         * @return The Hanyu Pinyin strings of the given Chinese character in array
         * format; return null if there is no corresponding Pinyin string.
         */
        String[] getHanyuPinyinStringArray(char ch) {
            String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
            return parsePinyinString(pinyinRecord);
        }

        /**
         * Splits a record of the form {@code (a1,b2)} into its comma-separated parts.
         *
         * <p>The text between the first "(" and the last ")" is split on ",". The record is not
         * validated here: a non-null record without a closing bracket makes
         * {@code substring} throw.
         *
         * @param pinyinRecord record string, may be null
         * @return the pinyin strings, or null when {@code pinyinRecord} is null
         */
        String[] parsePinyinString(String pinyinRecord) {

            if (null != pinyinRecord) {
                int indexOfLeftBracket = pinyinRecord.indexOf(Field.LEFT_BRACKET);
                int indexOfRightBracket = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);

                String stripedString =
                        pinyinRecord.substring(indexOfLeftBracket + Field.LEFT_BRACKET.length(),
                                indexOfRightBracket);

                return stripedString.split(Field.COMMA);

            } else
                return null; // no record found
        }

        /**
         * @param record given record string of Hanyu Pinyin
         * @return return true if record is not null and record is not "(none0)" and
         * record starts with "(" and ends with ")", else return false
         */
        private boolean isValidRecord(String record) {
            final String noneStr = "(none0)";

            return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)
                    && record.endsWith(Field.RIGHT_BRACKET);
        }

        /**
         * @param ch given Chinese character in Unicode
         * @return corresponding Hanyu Pinyin record; null if no record is found or the
         * record is not valid
         */
        private String getHanyuPinyinRecordFromChar(char ch) {
            // Same hex-key lookup as getHanyuPinyinTrie, reused instead of duplicated.
            Trie trie = getHanyuPinyinTrie(ch);
            String foundRecord = null;
            if (trie != null)
                foundRecord = trie.getPinyin();

            return isValidRecord(foundRecord) ? foundRecord : null;
        }

        /**
         * Singleton factory method.
         *
         * @return the one and only ChineseToPinyinResource.
         */
        public static ChineseToPinyinResource getInstance() {
            return ChineseToPinyinResourceHolder.theInstance;
        }

        /**
         * Singleton implementation helper: holds the single instance in a static field of a
         * nested class.
         */
        private static class ChineseToPinyinResourceHolder {
            static final ChineseToPinyinResource theInstance = new ChineseToPinyinResource();
        }

        /**
         * A class encloses common string constants used in the record strings.
         *
         * @author Li Min (xmlerlimin@gmail.com)
         */
        class Field {
            static final String LEFT_BRACKET = "(";

            static final String RIGHT_BRACKET = ")";

            static final String COMMA = ",";
        }
    }




    加入批量写入功能
    
    package com.gome.mx.plus.pinyin.ext;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStreamWriter;
    import java.util.HashSet;
    import java.util.LinkedHashSet;
    import java.util.Map;
    import java.util.Map.Entry;
    import java.util.Set;
    
    import javax.xml.crypto.dsig.spec.ExcC14NParameterSpec;
    
    import net.sourceforge.pinyin4j.ChineseToPinyinResource;
    import net.sourceforge.pinyin4j.ResourceHelper;
    import net.sourceforge.pinyin4j.multipinyin.MultiPinyinConfig;
    import net.sourceforge.pinyin4j.multipinyin.Trie;
    /**
     * 将汉语和拼音写入指定的文件里--文件位置能够指定
     * 而且能够动态的载入  不须要重新启动服务
     * 还能指定是否又一次写 还是追加的方式
     * 还能够将原来已经存在的拼音合并过来--能够指定
     * @author songqinghu
     *
     */
    public class PYWriterUtils {
    
        //这里改为系统的绝对路径
        private static String path;
    
        private static boolean flag = true;//能够设置文件位置
        /**
         * Sets the location of the user dictionary file. Only the first call has any
         * effect; every later call is ignored.
         *
         * @param path location of the user dictionary file
         * @createTime 2016-04-06
         * @author songqinghu
         */
        public static void setPath(String path){
            if (!flag) {
                return; // the path has already been set once
            }
            flag = false;
            PYWriterUtils.path = path;
        }
    
        /**
         * @return the configured user dictionary path, or null when none has been set
         */
        public static String getPath(){
            return path;
        }
    
        private static Class pathClass = PYWriterUtils.class;
    
    
        /**
         * Writes one character-to-pinyin mapping with the default options: append to the
         * dictionary file and merge with the pinyin already known for the character.
         *
         * @param word   the Chinese character
         * @param pinyin the pinyin
         * @param voice  the tone
         * @return the result of {@code writerControler(word, pinyin, voice, true, true)}
         * @throws Exception propagated from {@code writerControler}
         * @createTime 2016-04-06
         * @author songqinghu
         */
        public static boolean dufaultWriter(String word,String pinyin,Integer voice) throws Exception{
            final boolean appendToFile = true;
            final boolean mergeExisting = true;
            return writerControler(word, pinyin, voice, appendToFile, mergeExisting);
        }
        /**
         * Writes the mapping for a single Chinese character to the user dictionary file and
         * then reloads the dictionary. Only the first character of {@code word} is used.
         *
         * <p>The line is built before the file is opened, so a failure while looking up the
         * existing pinyin can no longer truncate the file or leak the writer.
         *
         * @param word       the Chinese character; when null, empty or its first char is not
         *                   greater than 128, no line is written
         * @param pinyin     the pinyin, must not be null
         * @param voice      the tone number appended to the pinyin, must not be null
         * @param additional true to append to the file, false to overwrite it
         * @param merge      true to merge with the pinyin already stored for the character
         * @return true when the file exists and was processed; false when the file does not
         *         exist
         * @throws IllegalArgumentException if {@code pinyin} or {@code voice} is null
         * @throws Exception if no dictionary path has been configured, or on I/O failure
         * @createTime 2016-04-06
         * @author songqinghu
         */
        public static boolean writerControler(String word,String pinyin,Integer voice,
                boolean additional ,boolean merge) throws Exception{

            String path = PYWriterUtils.path;
            if (path == null) {
                throw new Exception("找不到用户扩展字典");
            }
            // String concatenation would otherwise write the literal text "null" to the file.
            if (pinyin == null || voice == null) {
                throw new IllegalArgumentException("pinyin and voice must not be null");
            }
            File userMultiPinyinFile = new File(path);
            if (!userMultiPinyinFile.exists()) {
                return false;
            }

            // Build the line first: nothing is opened or truncated if this step fails.
            String line = buildSingleEntryLine(word, pinyin + voice, merge);

            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));
            try {
                if (line != null) {
                    writer.write(line);
                    writer.newLine();
                }
                writer.flush();
            } finally {
                writer.close();
            }
            // Writing finished - refresh the in-memory dictionary.
            reloadText();
            return true;
        }

        /**
         * Builds the dictionary line for one character, e.g. {@code 3007 (ling2,xing1)}.
         *
         * @param word        source of the character (first char is used)
         * @param tonedPinyin pinyin with the tone number already appended
         * @param merge       whether to combine with the pinyin already stored in the trie
         * @return the line to write, or null when {@code word} yields no character above 128
         */
        private static String buildSingleEntryLine(String word, String tonedPinyin, boolean merge) {
            if (word == null || word.length() == 0) {
                return null;
            }
            char c = word.charAt(0);
            if (c <= 128) {
                return null; // treated as "not a Chinese character"
            }
            String unicode = Integer.toHexString(c).toUpperCase();
            String merged = tonedPinyin;
            if (merge) {
                Trie node = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().get(unicode);
                String existing = (node == null) ? null : node.getPinyin();
                String trimmed = (existing == null) ? "" : existing.trim();
                // Need at least "(x)": anything shorter carries no pinyin to merge with.
                if (trimmed.length() > 2) {
                    String before = trimmed.substring(1, trimmed.length() - 1); // strip ( )
                    boolean alreadyKnown = false;
                    for (String known : before.split(",")) {
                        if (known.trim().equals(tonedPinyin)) {
                            alreadyKnown = true;
                            break;
                        }
                    }
                    merged = alreadyKnown ? before : before + Field.COMMA + tonedPinyin;
                }
            }
            return unicode + Field.SPACE + addSymbol(merged);
        }
    
        /**
         * Writes a batch of character-to-pinyin mappings to the user dictionary file and then
         * reloads the dictionary.
         *
         * <p>Because the inner map is keyed by pinyin, one character can carry only one tone
         * per pinyin in a single call; submit a second batch for a second tone.
         *
         * <p>Entries with a null key, a null or empty pinyin map, or no usable pinyin/tone
         * pair are skipped instead of aborting the whole batch.
         *
         * @param contents   character to (pinyin to tone) map
         * @param additional true to append to the file, false to overwrite it
         * @param merge      true to merge with the pinyin already stored for each character
         * @return true when the file exists and the batch was written; false otherwise,
         *         including when the path is not configured or an exception occurred
         * @createTime 2016-04-07
         * @author songqinghu
         */
        public static boolean writerBatch(Map<String,Map<String,Integer>> contents,boolean additional ,boolean merge){
            BufferedWriter writer = null;
            try {
                if (path == null) {
                    throw new Exception("请配置用户词典绝对路径");
                }
                File userMultiPinyinFile = new File(path);
                // Check before opening so a null batch cannot truncate the file.
                if (contents == null || !userMultiPinyinFile.exists()) {
                    return false;
                }
                writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(userMultiPinyinFile, additional)));
                for (Entry<String, Map<String, Integer>> entry : contents.entrySet()) {
                    String pinyin = joinPinyin(entry.getValue());
                    if (entry.getKey() == null || pinyin == null) {
                        continue; // malformed entry - skip it, keep the rest of the batch
                    }
                    String line = midWriter(entry.getKey().trim(), pinyin, merge);
                    if (line != null) {
                        writer.write(line);
                        writer.newLine();
                    }
                }
                writer.flush();
                return true;
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Only when the file was actually opened is there anything to close or reload.
                if (writer != null) {
                    try {
                        writer.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    try {
                        PYWriterUtils.reloadText();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return false;
        }

        /**
         * Joins the pinyin/tone pairs of one character into {@code py1tone1,py2tone2}.
         *
         * @param readings pinyin to tone map, may be null
         * @return the joined text, or null when there is no usable pair
         */
        private static String joinPinyin(Map<String, Integer> readings) {
            if (readings == null) {
                return null;
            }
            StringBuilder joined = new StringBuilder();
            for (Entry<String, Integer> reading : readings.entrySet()) {
                if (reading.getKey() == null || reading.getValue() == null) {
                    continue;
                }
                String py = reading.getKey().trim();
                if (py.length() == 0) {
                    continue;
                }
                if (joined.length() > 0) {
                    joined.append(",");
                }
                joined.append(py).append(reading.getValue());
            }
            return joined.length() == 0 ? null : joined.toString();
        }
        /**
         * Builds the dictionary line for a single character, e.g. {@code E4A3 (ang3,yi1,wang3)}.
         *
         * <p>When merging, the pinyin already stored for the character keeps its order and the
         * new ones are appended after it, without duplicates. An insertion-ordered set is used
         * so that existing readings are not reshuffled.
         *
         * @param word   source of the character; only the first char is used
         * @param pinyin comma-separated pinyin, each with its tone number appended
         * @param merge  whether to combine with the pinyin already stored in the trie
         * @return the line to write, or null when {@code word} is null/empty, its first char
         *         is not greater than 128, or there is no pinyin to write
         * @createTime 2016-04-07
         * @author songqinghu
         */
        private static String midWriter(String word ,String pinyin,boolean merge){

            if (word == null || word.length() == 0) {
                return null;
            }
            if (pinyin == null || pinyin.trim().length() == 0) {
                return null; // nothing to write for this character
            }
            char c = word.charAt(0);
            if (c <= 128) {
                return null; // treated as "not a Chinese character"
            }
            String unicode = Integer.toHexString(c).toUpperCase(); // hex key used by the trie
            if (merge) {
                Trie node = ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().get(unicode);
                String existing = (node == null) ? null : node.getPinyin();
                if (existing != null) {
                    Set<String> readings = new LinkedHashSet<String>();
                    String trimmed = existing.trim();
                    // Stored form is "(xxx)" or "(xxx,yyy)"; strip the brackets when present.
                    if (trimmed.length() > 2) {
                        addReadings(readings, trimmed.substring(1, trimmed.length() - 1));
                    }
                    addReadings(readings, pinyin);
                    if (readings.isEmpty()) {
                        return null;
                    }
                    StringBuilder joined = new StringBuilder();
                    for (String reading : readings) {
                        if (joined.length() > 0) {
                            joined.append(Field.COMMA);
                        }
                        joined.append(reading);
                    }
                    pinyin = joined.toString();
                }
                // No stored pinyin: keep the given pinyin unchanged.
            }
            // Wrap into the on-disk format.
            return unicode + Field.SPACE + addSymbol(pinyin);
        }

        /**
         * Adds every non-empty, trimmed token of a comma-separated list to {@code target}.
         */
        private static void addReadings(Set<String> target, String commaSeparated) {
            for (String token : commaSeparated.split(",")) {
                String reading = token.trim();
                if (reading.length() > 0) {
                    target.add(reading);
                }
            }
        }
    
        /**
         * Batch write with the default options: append to the file and merge with the
         * pinyin already stored.
         *
         * @param contents character to (pinyin to tone) map
         * @return the result of {@code writerBatch(contents, true, true)}
         * @createTime 2016-04-07
         * @author songqinghu
         */
        public static boolean defaultWriterBatch(Map<String,Map<String,Integer>> contents){
            final boolean appendToFile = true;
            final boolean mergeExisting = true;
            return writerBatch(contents, appendToFile, mergeExisting);
        }
    
        /**
         * Reloads the user dictionary file into the shared trie. Call this after the file has
         * been changed.
         *
         * @return true when a path is configured and the file was loaded; false when no path
         *         is configured
         * @throws IOException if the file cannot be opened or read
         * @createTime 2016-04-06
         * @author songqinghu
         */
        public static boolean reloadText() throws IOException{

            if (path == null) {
                return false;
            }
            FileInputStream is = new FileInputStream(new File(path));
            try {
                ChineseToPinyinResource.getInstance().getUnicodeToHanyuPinyinTable().load(is);
            } finally {
                // Release the file handle on every path; a second close is a no-op.
                is.close();
            }
            return true;
        }
    
    
    
        /**
         * Wraps the given pinyin text in the bracket pair used by the dictionary format.
         *
         * @param pinyin text to wrap
         * @return {@code "(" + pinyin + ")"}
         */
        private static String addSymbol(String pinyin){
            StringBuilder wrapped = new StringBuilder();
            wrapped.append(Field.LEFT_BRACKET);
            wrapped.append(pinyin);
            wrapped.append(Field.RIGHT_BRACKET);
            return wrapped.toString();
        }
    
        /**
         * String constants that make up one dictionary line: {@code HEX (py1,py2)}.
         */
        class Field {
            /** Separates the hex code from the bracketed pinyin list. */
            static final String SPACE = " ";
            /** Separates the individual pinyin inside the brackets. */
            static final String COMMA = ",";
            /** Opens the pinyin list. */
            static final String LEFT_BRACKET = "(";
            /** Closes the pinyin list. */
            static final String RIGHT_BRACKET = ")";
        }
    }


    将 jar 和原有的 suggest 项目进行整合
    
    出现问题——无法写入 jar 中的自定义文件(jar 中的文件只能读取)
    ===> 解决思路:将用户自定义词典放在运行的 war 项目中
    
    需要手动指定一次文件位置——大致功能已经可以整合进项目中使用了
    
    package cn.com.mx.gome.suggest.controller;
    
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.stereotype.Controller;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.ResponseBody;
    
    import com.gome.mx.plus.pinyin.ext.PYReadUtils;
    import com.gome.mx.plus.pinyin.ext.PYWriterUtils;
    
    import cn.com.mx.gome.search.core.common.ResultData;
    import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
    
    /**
     * 
     * @author songqinghu
     * 对pinyin4j中的用户自己定义词典库进行操作
     */
    @Controller
    @RequestMapping("/suggest/pinyin")
    public class PinYinController {
    
        private  Logger logger = LoggerFactory.getLogger(PinYinController.class);
    
    
    
        /**
         * Returns the pinyin that {@code PYReadUtils.getFullPY} yields for the given text.
         *
         * <p>NOTE(review): the original comment said a POST request is required, but the
         * mapping below declares no HTTP method restriction - confirm.
         *
         * @param word text to look up
         * @return result carrying the pinyin list with success=true, or success=false when
         *         {@code word} is blank, nothing was found, or the lookup failed
         * @createTime 2016-04-07
         * @author songqinghu
         */
        @RequestMapping("/getpy")
        @ResponseBody
        public ResultData<List<String>> getPinYin(String word){
            ResultData<List<String>> result = new ResultData<List<String>>();
            String[] fullPY = null;
            try {
                boolean hasText = word != null && word.trim().length() > 0;
                if (hasText) {
                    fullPY = PYReadUtils.getFullPY(word);
                }
            } catch (BadHanyuPinyinOutputFormatCombination e) {
               logger.error("",e);
            }
            if (fullPY == null || fullPY.length == 0) {
                result.setSuccess(false);
                return result;
            }
            List<String> pinyins = new ArrayList<String>();
            for (int i = 0; i < fullPY.length; i++) {
                pinyins.add(fullPY[i]);
            }
            result.setData(pinyins);
            result.setSuccess(true);
            return result;
        }
        /**
         * Adds one character-to-pinyin mapping to the user dictionary.
         *
         * @param word   the Chinese character
         * @param pinyin the pinyin
         * @param voice  the tone; must be present and greater than 0
         * @return result with success=true and the writer's return value as data, or
         *         success=false when a parameter is missing/blank or the write failed
         * @createTime 2016-04-07
         * @author songqinghu
         */
        @RequestMapping("/addpy")
        @ResponseBody
        public ResultData<Boolean> addPinYin(String word,String pinyin,Integer voice){
            ResultData<Boolean> result = new ResultData<Boolean>();
            // voice is a boxed Integer: test for null before comparing, otherwise a request
            // without that parameter fails with a NullPointerException on unboxing.
            boolean validInput = word != null && word.trim().length() > 0
                    && pinyin != null && pinyin.trim().length() > 0
                    && voice != null && voice > 0;
            if (validInput) {
                try {
                    boolean flag = PYWriterUtils.dufaultWriter(word, pinyin, voice);
                    result.setData(flag);
                    result.setSuccess(true);
                    return result;
                } catch (Exception e) {
                    logger.error("",e);
                }
            }
            result.setSuccess(false);
            return result;
        }
    
        /**
         * Test endpoint: writes a fixed batch (three pinyin for the character "〇") through
         * {@code PYWriterUtils.defaultWriterBatch}. The request parameters are only checked
         * for presence; their values are not written.
         *
         * @param word   must be non-blank
         * @param pinyin must be non-blank
         * @param voice  must be present and greater than 0
         * @return result with success=true and the writer's return value as data, or
         *         success=false when a parameter is missing/blank or the write failed
         */
        @RequestMapping("/test")
        @ResponseBody
        public ResultData<Boolean> addtest(String word,String pinyin,Integer voice){
            Map<String, Map<String, Integer>> contents = new HashMap<String,Map<String,Integer>>();

            HashMap<String, Integer> content = new HashMap<String,Integer>();

            content.put("test", 1);
            content.put("tttt", 2);
            content.put("ling", 1);
            contents.put("〇", content);

            ResultData<Boolean> result = new ResultData<Boolean>();
            // Null-check the boxed Integer before comparing to avoid an unboxing NPE.
            boolean validInput = word != null && word.trim().length() > 0
                    && pinyin != null && pinyin.trim().length() > 0
                    && voice != null && voice > 0;
            if (validInput) {
                try {
                    boolean flag = PYWriterUtils.defaultWriterBatch(contents);
                    result.setData(flag);
                    result.setSuccess(true);
                    return result;
                } catch (Exception e) {
                    logger.error("",e);
                }
            }
            result.setSuccess(false);
            return result;
        }
    
    
    
    }
    
    
    

    war 项目使用 SSM 架构;下面是项目启动时加载词库所在位置的工具类
    
    package cn.com.mx.gome.suggest.component;
    /**
     * 项目启动时载入指定的pinyin4j用户扩展字典
     * @author songqinghu
     *
     */
    
    import javax.annotation.PostConstruct;
    
    import org.springframework.beans.factory.annotation.Value;
    import org.springframework.stereotype.Component;
    
    import com.gome.mx.plus.pinyin.ext.PYWriterUtils;
    
    import cn.com.mx.gome.suggest.controller.PinYinController;
    
    /**
     * Resolves the pinyin4j user dictionary on the classpath at start-up and hands its
     * file-system path to {@code PYWriterUtils}.
     */
    @Component
    public class PinYinDataSourceFile {

        /** Classpath resource name of the user dictionary, injected from PINYIN_FILE_PATH. */
        @Value("${PINYIN_FILE_PATH}")
        private String path;

        /**
         * Runs after the bean is constructed: looks the dictionary up as a classpath
         * resource and registers its location through {@code PYWriterUtils.setPath}.
         *
         * @throws IllegalStateException if the resource cannot be found
         * @createTime 2016-04-07
         * @author songqinghu
         */
        @PostConstruct
        private void setFilePath(){
            // Types are fully qualified so this block needs no additional imports.
            java.net.URL resource = PinYinDataSourceFile.class.getResource(path);
            if (resource == null) {
                // Previously this surfaced as a bare NullPointerException.
                throw new IllegalStateException(
                        "pinyin4j user dictionary not found on classpath: " + path);
            }
            String pathFile;
            try {
                // URL.getPath() is percent-encoded (a space becomes %20); going through
                // the URI gives a path that really names the file.
                pathFile = new java.io.File(resource.toURI()).getPath();
            } catch (java.net.URISyntaxException e) {
                pathFile = resource.getPath(); // fall back to the raw URL path
            } catch (IllegalArgumentException e) {
                pathFile = resource.getPath(); // not a plain file: URI - fall back as well
            }
            PYWriterUtils.setPath(pathFile);
        }


    }
    

    最后附上 改写后的pinyin4j源代码
    链接:http://pan.baidu.com/s/1skUD8dv password:fhy4


  • 相关阅读:
    overflow+文档流
    《大器晚成》读后感 读书笔记
    《指标陷阱》读后感 读书笔记
    《无限的游戏》读后感 读书笔记
    《最蓝的眼睛》读后感 读书笔记
    《正常人》读后感 读书笔记
    《玉米人》读后感 读书笔记
    《科举史》读后感 读书笔记
    《糖的故事》读后感 读书笔记
    《蒙克传》读后感 读书笔记
  • 原文地址:https://www.cnblogs.com/yutingliuyl/p/7249997.html
Copyright © 2020-2023  润新知