要做一个根据词库进行筛选主要词汇的功能,去搜狗下载专业词汇词库时,发现是.scel文件,且通过转换工具(http://tools.bugscaner.com/sceltotxt/)转换为txt时报错如下,只能通过Java程序来转换了。
核心代码如下,涉及到四个类:FileProcessing、SougouScelFileProcessing、SougouScelModel、TxtFileProcessing
文件FileProcessing .java
package cn.ucmed.impl;

import java.io.File;
import java.io.IOException;

/**
 * Base class for converters that turn word-library files into plain-text (.txt) files.
 *
 * <p>Subclasses implement the three abstract {@code parse*} methods; this class only
 * supplies the shared target-directory setting and a helper that prepares the output
 * directory.
 */
public abstract class FileProcessing {

    /** Directory that receives generated .txt files; {@code null} until {@link #setTargetDir(String)} is called. */
    protected String targetDir;

    /**
     * Parses a single file.
     *
     * @param filePath       path of the source file to parse
     * @param targetFilePath path of the file that receives the parsed content
     * @param isAppend       {@code true} to append to existing content, {@code false} to overwrite it
     */
    public abstract void parseFile(String filePath, String targetFilePath, boolean isAppend);

    /**
     * Parses every file in a directory and merges the result into one target file.
     *
     * @param fileDirPath    path of the directory holding the source files
     * @param targetFilePath path of the file that receives the parsed content
     * @param isAppend       {@code true} to append to existing content, {@code false} to overwrite it
     * @throws IOException if reading a source file or writing the target file fails
     */
    public abstract void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException;

    /**
     * Parses one file, or every file of a directory, producing one .txt file per source file.
     * The output location is controlled by {@link #setTargetDir(String)}; when no target
     * directory is set, the .txt file is generated in the folder of the source file.
     *
     * @param filePath path of the source file or directory
     * @param isAppend {@code false} overwrites existing content, {@code true} appends to it
     */
    public abstract void parseFile(String filePath, boolean isAppend);

    /**
     * Creates the parent directory of the target file if it does not exist yet.
     *
     * @param targetFilePath path of the target file; must end with {@code .txt}
     * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
     */
    protected void createParentDir(String targetFilePath) {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为 " + targetFilePath);
        }
        // Let java.io.File work out the parent instead of searching for a literal "/",
        // so platform-specific separators and bare file names are handled correctly.
        File parent = new File(targetFilePath).getParentFile();
        if (parent != null && !parent.exists()) {
            // The result is deliberately not checked: if the directory cannot be created,
            // the caller's attempt to open the target file reports the failure.
            parent.mkdirs();
        }
    }

    /**
     * Parses a single file, overwriting any existing output.
     *
     * @param filePath path of the source file
     */
    public void parseFile(String filePath) {
        parseFile(filePath, false);
    }

    /**
     * @return the directory that receives generated .txt files, or {@code null} if none was set
     */
    public String getTargetDir() {
        return targetDir;
    }

    /**
     * Sets the directory in which generated .txt files are stored.
     *
     * @param targetDir path of the output directory
     */
    public void setTargetDir(String targetDir) {
        this.targetDir = targetDir;
    }
}
SougouScelFileProcessing.java
package cn.ucmed.impl;

import lombok.extern.slf4j.Slf4j;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Converts Sogou word-library files (.scel) into plain-text files.
 *
 * <p>Each output record is written as {@code pinyin word }, i.e. the pinyin key and the
 * word, each followed by a single space. Output text is always encoded as UTF-8.
 */
@Slf4j
public class SougouScelFileProcessing extends FileProcessing {

    /** Name of the charset used to decode every string read from a .scel file. */
    protected static String encoding = "UTF-16LE";

    /**
     * Scratch buffer reused by {@link #readString(DataInputStream, int, int[])}.
     * It is shared mutable state, so one instance must not parse on several threads at once.
     */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Parses one .scel file, or every .scel file directly inside a directory, producing one
     * .txt file per source file. Output goes to {@link #setTargetDir(String)} when set,
     * otherwise next to the source file.
     *
     * @param filePath path of the source file or directory
     * @param isAppend {@code false} overwrites existing content, {@code true} appends to it
     */
    @Override
    public void parseFile(String filePath, boolean isAppend) {
        File file = new File(filePath);
        if (!file.isDirectory()) {
            parseFile(filePath, toTxtPath(file.getAbsolutePath()), isAppend);
            return;
        }
        File[] items = file.listFiles();
        if (items == null) {
            // listFiles() returns null when the directory cannot be read.
            log.warn("无法读取文件夹 {}", filePath);
            return;
        }
        for (File item : items) {
            if (!item.getName().endsWith(".scel")) {
                continue;
            }
            String target = (targetDir == null)
                    ? toTxtPath(item.getAbsolutePath())
                    : targetDir + "/" + toTxtPath(item.getName());
            parseFile(item.getAbsolutePath(), target, isAppend);
        }
    }

    /**
     * Parses a single .scel file into the given .txt file. Source paths that do not end
     * with {@code .scel} and files shorter than 8 bytes are silently ignored; I/O failures
     * are logged and not rethrown.
     *
     * @param filePath       path of the source file
     * @param targetFilePath path of the .txt file to write
     * @param isAppend       {@code false} overwrites existing content, {@code true} appends to it
     * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
     */
    @Override
    public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为 " + targetFilePath);
        }
        if (!filePath.endsWith(".scel")) {
            return;
        }
        File input = new File(filePath);
        if (input.length() < 8) {
            // Too short to hold the two 4-byte header words; not worth parsing.
            return;
        }
        try {
            // read() closes the stream it is given.
            SougouScelModel model = read(new FileInputStream(input));
            if (model == null) {
                return;
            }
            writeToTargetFile(model, targetFilePath, isAppend);
        } catch (IOException e) {
            log.error("解析 {} 失败", filePath, e);
        }
    }

    /**
     * Parses every .scel file directly inside a directory and writes all of them into one
     * .txt file.
     *
     * @param fileDirPath    path of the directory holding the .scel files
     * @param targetFilePath path of the .txt file to write
     * @param isAppend       {@code false} overwrites existing content, {@code true} appends to it
     * @throws IllegalStateException if the target does not end with {@code .txt} or the source
     *                               path is not an existing directory
     * @throws IOException           if the directory cannot be listed or a file cannot be
     *                               opened or written
     */
    @Override
    public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为 " + targetFilePath);
        }
        File dir = new File(fileDirPath);
        if (!dir.exists() || !dir.isDirectory()) {
            // Report the offending source directory, not the target file.
            throw new IllegalStateException("scel文件夹路径错误 " + fileDirPath);
        }
        File[] scels = dir.listFiles();
        if (scels == null) {
            throw new IOException("无法读取文件夹 " + fileDirPath);
        }
        List<SougouScelModel> models = new ArrayList<>();
        for (File scel : scels) {
            if (!scel.getName().endsWith(".scel")) {
                continue;
            }
            SougouScelModel model = read(new FileInputStream(scel));
            if (model != null) {
                models.add(model);
            }
        }
        writeToTargetFile(models, targetFilePath, isAppend);
    }

    /** Writes a single parsed library; see {@link #writeToTargetFile(List, String, boolean)}. */
    private void writeToTargetFile(SougouScelModel model, String targetFilePath, boolean isAppend) throws IOException {
        List<SougouScelModel> models = new ArrayList<>();
        models.add(model);
        writeToTargetFile(models, targetFilePath, isAppend);
    }

    /**
     * Writes the words of the parsed libraries to a .txt file as UTF-8 text.
     *
     * @param models         parsed libraries, written in list order
     * @param targetFilePath path of the .txt file to write
     * @param isAppend       {@code true} appends to the file (a single space is written before
     *                       each library), {@code false} truncates it first
     * @throws IOException if the file cannot be opened or written
     */
    private void writeToTargetFile(List<SougouScelModel> models, String targetFilePath, boolean isAppend) throws IOException {
        createParentDir(targetFilePath);
        int count = 0;
        // try-with-resources guarantees the file is closed even when a write fails.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFilePath, isAppend))) {
            for (SougouScelModel model : models) {
                if (isAppend) {
                    out.write(" ".getBytes(StandardCharsets.UTF_8));
                }
                // Map of pinyin -> words sharing that pinyin.
                for (Map.Entry<String, List<String>> entry : model.getWordMap().entrySet()) {
                    for (String word : entry.getValue()) {
                        // An explicit charset keeps the output independent of the platform default.
                        out.write((entry.getKey() + " " + word + " ").getBytes(StandardCharsets.UTF_8));
                        count++;
                    }
                }
            }
        }
        log.info("生成" + targetFilePath.substring(targetFilePath.lastIndexOf("/") + 1) + "成功!,总计写入: " + count + " 条数据!");
    }

    /**
     * Reads a whole .scel stream into a model and closes the stream.
     *
     * <p>Layout as handled by this code: two 4-byte header words, four zero-terminated
     * strings at fixed offsets, a pinyin table starting after offset 0x1540, then the word
     * table whose start offset depends on the first byte of the second header word.
     *
     * @param in stream positioned at the start of the file; always closed before returning
     * @return the parsed model, or {@code null} if an I/O error (including a truncated file) occurred
     * @throws RuntimeException if the header flag is neither 0x44 nor 0x45
     */
    private SougouScelModel read(InputStream in) {
        SougouScelModel model = new SougouScelModel();
        DataInputStream input = new DataInputStream(in);
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            // Format sanity checks are plain asserts: they only run with -ea.
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);

            // reads[0] tracks the absolute stream offset consumed so far.
            int[] reads = new int[]{8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));

            int read = reads[0];
            skipFully(input, 0x1540 - read);
            read = 0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);

            // Entry lengths are one unsigned byte, so 256 bytes always suffice.
            bytes = new byte[256];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readUnsignedShort(input);
                int size = input.readUnsignedByte();
                skipFully(input, 1);
                read += 4;
                assert (size > 0 && (size % 2) == 0);
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                pyMap.put(mark, py);
                // The table is treated as finished once the syllable "zuo" has been read.
                if ("zuo".equals(py)) {
                    break;
                }
            }

            // Jump to the start of the word table; its offset depends on the header flag.
            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            } else {
                throw new RuntimeException("出现意外,联系作者");
            }

            StringBuilder buffer = new StringBuilder();
            Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
            while (true) {
                // Number of words sharing the pinyin that follows; EOF here is the normal end.
                int size = readUnsignedShort(input);
                if (size < 0) {
                    break;
                }
                // Byte length of the pinyin index array (two bytes per syllable).
                int count = readRequiredShort(input);
                int len = count / 2;
                assert (len * 2 == count);
                buffer.setLength(0);
                for (int i = 0; i < len; i++) {
                    int key = readRequiredShort(input);
                    buffer.append(pyMap.get(key)).append("'");
                }
                if (buffer.length() > 0) {
                    // Drop the trailing apostrophe separator.
                    buffer.setLength(buffer.length() - 1);
                }
                String py = buffer.toString();
                List<String> list = wordMap.get(py);
                if (list == null) {
                    list = new ArrayList<String>();
                    wordMap.put(py, list);
                }
                for (int i = 0; i < size; i++) {
                    int wordBytes = readRequiredShort(input);
                    if (wordBytes > bytes.length) {
                        bytes = new byte[wordBytes];
                    }
                    input.readFully(bytes, 0, wordBytes);
                    String word = new String(bytes, 0, wordBytes, encoding);
                    // The next 12 bytes may hold word frequency or similar data; ignored.
                    skipFully(input, 12);
                    list.add(word);
                }
            }
            model.setWordMap(wordMap);
            return model;
        } catch (IOException e) {
            log.error("解析scel文件失败", e);
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                log.warn("关闭scel文件失败", e);
            }
        }
        return null;
    }

    /**
     * Skips forward to {@code pos} and reads a string terminated by a two-byte zero.
     *
     * @param input stream to read from
     * @param pos   absolute offset at which the string starts
     * @param reads one-element array holding the current absolute offset; updated to the
     *              offset just past the terminator
     * @return the decoded string, without the terminator
     * @throws EOFException if the stream ends before the terminator is found
     * @throws IOException  if reading fails
     */
    protected String readString(DataInputStream input, int pos, int[] reads) throws IOException {
        skipFully(input, pos - reads[0]);
        int read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            if ((c1 | c2) < 0) {
                // Without this check a truncated file would loop forever, growing the buffer.
                throw new EOFException("scel文件在偏移 " + read + " 处意外结束");
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @param in stream to read from
     * @return the value, or {@link Integer#MIN_VALUE} if the stream ended
     * @throws IOException if reading fails
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }

    /** Like {@link #readUnsignedShort(InputStream)} but treats end of stream as a truncated file. */
    private int readRequiredShort(InputStream in) throws IOException {
        int value = readUnsignedShort(in);
        if (value < 0) {
            throw new EOFException("scel文件意外结束");
        }
        return value;
    }

    /**
     * Skips {@code n} bytes, retrying because a single {@code skip} call may skip fewer
     * bytes than requested. Stops quietly at end of stream; non-positive {@code n} is a no-op.
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        long remaining = n;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() < 0) {
                return;
            } else {
                remaining--;
            }
        }
    }

    /** Replaces a trailing {@code .scel} extension with {@code .txt}; other paths are returned unchanged. */
    private static String toTxtPath(String path) {
        if (!path.endsWith(".scel")) {
            return path;
        }
        return path.substring(0, path.length() - ".scel".length()) + ".txt";
    }
}
SougouScelModel.java
package cn.ucmed.impl;

import lombok.Data;
import lombok.ToString;

import java.util.List;
import java.util.Map;

/**
 * Holder for the content parsed from one Sogou .scel word-library file.
 * Accessors, equals/hashCode and toString are generated by Lombok.
 */
@Data
@ToString
public class SougouScelModel {
    /** Words grouped by pinyin: key is the pinyin string, value is the list of words stored under it. */
    private Map<String, List<String>> wordMap;
    /** Header string that SougouScelFileProcessing reads at file offset 0x130. */
    private String name;
    /** Header string that SougouScelFileProcessing reads at file offset 0x338. */
    private String type;
    /** Header string that SougouScelFileProcessing reads at file offset 0x540. */
    private String description;
    /** Header string that SougouScelFileProcessing reads at file offset 0xd40. */
    private String sample;
}
TxtFileProcessing.java
package cn.ucmed.impl;

import lombok.extern.slf4j.Slf4j;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

/**
 * Extracts the words from "pinyin word" text files, removes duplicates and writes the
 * remaining words to a .txt file, each followed by a single space.
 */
@Slf4j
public class TxtFileProcessing extends FileProcessing {

    /** Name of the charset used both for reading source files and for writing the target file. */
    private String encoding = "UTF-8";

    /**
     * De-duplicates the words of one source file into the target file. Source paths that do
     * not end with {@code .txt} are ignored; a missing source file is only logged.
     *
     * @param filePath       path of the source .txt file
     * @param targetFilePath path of the .txt file to write
     * @param isAppend       {@code true} keeps the target's existing content, {@code false} discards it
     * @throws IllegalStateException if {@code targetFilePath} does not end with {@code .txt}
     */
    @Override
    public void parseFile(String filePath, String targetFilePath, boolean isAppend) {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为 " + targetFilePath);
        }
        if (!filePath.endsWith(".txt")) {
            return;
        }
        File inputFile = new File(filePath);
        if (!inputFile.exists()) {
            log.info(filePath + " 文件不存在");
            return;
        }
        // Read the source before touching the target, so nothing is lost when both
        // paths point at the same file.
        List<String> words = readSourceFile(inputFile);

        createParentDir(targetFilePath);
        File outputFile = new File(targetFilePath);
        List<String> content = loadExistingContent(outputFile, isAppend);
        content.addAll(words);

        writeToTargetFile(new HashSet<>(content), outputFile);
    }

    /**
     * Processes one .txt file, or every .txt file directly inside a directory, producing one
     * output file per source file. With a target directory set, directory mode writes each
     * output there under the source name; otherwise the output is written next to the source
     * with "解析" inserted before the extension.
     *
     * @param filePath path of the source file or directory
     * @param isAppend {@code true} keeps the target's existing content, {@code false} discards it
     */
    @Override
    public void parseFile(String filePath, boolean isAppend) {
        File file = new File(filePath);
        if (!file.isDirectory()) {
            parseFile(filePath, toParsedPath(file.getAbsolutePath()), isAppend);
            return;
        }
        File[] items = file.listFiles();
        if (items == null) {
            // listFiles() returns null when the directory cannot be read.
            log.warn("无法读取文件夹 {}", filePath);
            return;
        }
        for (File item : items) {
            if (!item.getName().endsWith(".txt")) {
                continue;
            }
            String target = (targetDir == null)
                    ? toParsedPath(item.getAbsolutePath())
                    : targetDir + "/" + item.getName();
            parseFile(item.getAbsolutePath(), target, isAppend);
        }
    }

    /**
     * Merges the words of every .txt file directly inside a directory into one
     * de-duplicated target file.
     *
     * @param fileDirPath    path of the directory holding the source files
     * @param targetFilePath path of the .txt file to write
     * @param isAppend       {@code true} keeps the target's existing content, {@code false} discards it
     * @throws IllegalStateException if the target does not end with {@code .txt} or the source
     *                               path is not an existing directory
     * @throws IOException           if the directory cannot be listed
     */
    @Override
    public void parseFiles(String fileDirPath, String targetFilePath, boolean isAppend) throws IOException {
        if (!targetFilePath.endsWith(".txt")) {
            throw new IllegalStateException("文件格式错误,后缀必须为.txt,此格式为 " + targetFilePath);
        }
        File fileDir = new File(fileDirPath);
        if (!fileDir.isDirectory() || !fileDir.exists()) {
            // Report the offending source directory, not the target file.
            throw new IllegalStateException("文件夹路径错误 " + fileDirPath);
        }
        File[] files = fileDir.listFiles();
        if (files == null) {
            throw new IOException("无法读取文件夹 " + fileDirPath);
        }

        createParentDir(targetFilePath);
        File outputFile = new File(targetFilePath);
        List<String> content = loadExistingContent(outputFile, isAppend);
        for (File source : files) {
            if (source.getName().endsWith(".txt")) {
                content.addAll(readSourceFile(source));
            }
        }

        writeToTargetFile(new HashSet<>(content), outputFile);
    }

    /**
     * Prepares the target file: in append mode returns its current content, otherwise
     * deletes it and returns an empty list. The returned list is mutable.
     */
    private List<String> loadExistingContent(File outputFile, boolean isAppend) {
        List<String> content = new ArrayList<>();
        if (isAppend) {
            // The target holds words only, so it must not be read with the
            // pinyin/word-pair parser used for source files.
            content.addAll(readTargetFile(outputFile));
        } else if (outputFile.exists()) {
            log.info(outputFile.getAbsolutePath() + " 文件存在,删除...");
            outputFile.delete();
        }
        return content;
    }

    /**
     * Writes every entry of the set to the target file, each followed by a single space.
     * Failures are logged and not rethrown.
     *
     * @param set        de-duplicated word collection
     * @param outputFile target file; overwritten
     */
    private void writeToTargetFile(HashSet<String> set, File outputFile) {
        StringBuilder buff = new StringBuilder();
        for (String word : set) {
            buff.append(word).append(" ");
        }
        // try-with-resources also covers the case where the file cannot be opened at all.
        try (FileOutputStream out = new FileOutputStream(outputFile)) {
            // Write with the same charset the readers use instead of the platform default.
            out.write(buff.toString().getBytes(encoding));
            log.info("生成" + outputFile.getName() + "成功!,总计写入: " + set.size() + " 条数据!");
        } catch (IOException e) {
            log.error("写入 {} 失败", outputFile.getAbsolutePath(), e);
        }
    }

    /**
     * Reads a source file whose lines hold space-separated "pinyin word" pairs and returns
     * the words, i.e. every token at an odd index of each line.
     *
     * @param file source file
     * @return the words found; empty if the file is missing or unreadable
     */
    private List<String> readSourceFile(File file) {
        List<String> content = new ArrayList<>();
        for (String line : readLines(file, "找不到源文件 ")) {
            String[] tokens = line.split(" ");
            // Even indexes hold the pinyin, odd indexes hold the word.
            for (int i = 1; i < tokens.length; i += 2) {
                content.add(tokens[i]);
            }
        }
        return content;
    }

    /**
     * Reads an already generated word file and returns its non-blank lines unchanged.
     *
     * @param file word file
     * @return the non-blank lines; empty if the file is missing or unreadable
     */
    private List<String> readTargetFile(File file) {
        List<String> content = new ArrayList<>();
        for (String line : readLines(file, "找不到目标文件 ")) {
            if (!line.trim().isEmpty()) {
                content.add(line);
            }
        }
        return content;
    }

    /**
     * Reads all lines of a file using {@link #encoding}. A missing file is logged with the
     * given message prefix; a read failure is logged and the lines read so far are returned.
     */
    private List<String> readLines(File file, String missingMessage) {
        List<String> lines = new ArrayList<>();
        if (!file.isFile()) {
            log.info(missingMessage + file.getAbsolutePath());
            return lines;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // The reader already decoded the text; re-encoding it here would corrupt
                // non-ASCII words on platforms whose default charset is not UTF-8.
                lines.add(line);
            }
        } catch (IOException e) {
            log.error("读取 {} 失败", file.getAbsolutePath(), e);
        }
        return lines;
    }

    /** Inserts "解析" before a trailing {@code .txt} extension; other paths are returned unchanged. */
    private static String toParsedPath(String path) {
        if (!path.endsWith(".txt")) {
            return path;
        }
        return path.substring(0, path.length() - ".txt".length()) + "解析.txt";
    }
}
测试用例:
public static void main(String[] args) { //单个scel文件转化 FileProcessing scel = new SougouScelFileProcessing(); scel.parseFile("./resolver/src/main/java/cn/ucmed/constant/药品名称大全.scel", "./resolver/src/main/java/cn/ucmed/constant/药品名称大全.txt", true); //多个scel文件转化为一个txt (格式:拼音字母 词) try { scel.parseFiles("/Users/ST_iOS/Desktop/test/ciku", "/Users/ST_iOS/Desktop/test/ciku/txt/汇总.txt", false); } catch (IOException e) { e.printStackTrace(); } //多个scel文件转化为多个txt文件, 转化后文件的存储位置 scel.setTargetDir("/Users/ST_iOS/Desktop/test/ciku/多对多"); scel.parseFile("/Users/ST_iOS/Desktop/test/ciku", false); }