JAV程序解析搜狗词库scel文件格式
- - Java - 编程语言 - ITeye博客在做一个电商的网站的初期时,我们常常面临词库的问题,因为我们并没有比较好的词库,这时候呢,我们就可以从网上下一些,别人有的词库,这些词库有淘宝的,有搜狗的,搜狗的分类比较细, 我们可以根据下载与我们行业比较相关的词库,但这些词库一般都是scel格式的,直接使用JAVA解析,是没法解析的,如果遇到这种情况可用散仙下面的这个类,来解析,经测试无乱码现象,解析完整度还不错.
package com.qin.parse.scel; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; public class SougouScelReader { public SougouScelMdel read(File file) throws IOException { return read(new FileInputStream(file)); } public SougouScelMdel read(URL url) throws IOException { return read(url.openStream()); } protected ByteArrayOutputStream output=new ByteArrayOutputStream(); protected String readString(DataInputStream input,int pos,int[] reads) throws IOException { int read=reads[0]; input.skip(pos-read); read=pos; output.reset(); while(true) { int c1 = input.read(); int c2 = input.read(); read+=2; if(c1==0 && c2==0) { break; } else { output.write(c1); output.write(c2); } } reads[0]=read; return new String(output.toByteArray(),encoding); } protected static String encoding = "UTF-16LE"; public SougouScelMdel read(InputStream in) throws IOException { SougouScelMdel model = new SougouScelMdel(); DataInputStream input = new DataInputStream(in); int read; try { byte[] bytes = new byte[4]; input.readFully(bytes); assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0); input.readFully(bytes); int flag1 = bytes[0]; assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01); int[] reads=new int[]{8}; model.setName(readString(input,0x130,reads)); model.setType(readString(input,0x338,reads)); model.setDescription(readString(input,0x540,reads)); model.setSample(readString(input,0xd40,reads)); read = reads[0]; input.skip(0x1540 - read); read=0x1540; input.readFully(bytes); read += 4; assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0); bytes = new byte[128]; Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>(); while (true) { int mark = readUnsignedShort(input); int size = input.readUnsignedByte(); input.skip(1); read += 4; assert (size > 0 && (size % 2) == 0); input.readFully(bytes, 0, size); read += size; String py = new String(bytes, 0, size, encoding); //System.out.println(py); pyMap.put(mark, py); if ("zuo".equals(py)) { break; } } if (flag1 == 0x44) { input.skip(0x2628 - read); } else if (flag1 == 0x45) { input.skip(0x26C4 - read); } else { throw new RuntimeException("出现意外,联系作者"); } StringBuffer buffer = new StringBuffer(); Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>(); while (true) { int size = readUnsignedShort(input); if (size < 0) { break; } int count = readUnsignedShort(input); int len = count / 2; assert (len * 2 == count); buffer.setLength(0); for (int i = 0; i < len; i++) { int key = readUnsignedShort(input); buffer.append(pyMap.get(key)).append("'"); } buffer.setLength(buffer.length() - 1); String py = buffer.toString(); List<String> list = wordMap.get(py); if (list == null) { list = new ArrayList<String>(); wordMap.put(py, list); } for (int i = 0; i < size; i++) { count = readUnsignedShort(input); if (count > bytes.length) { bytes = new byte[count]; } input.readFully(bytes, 0, count); String word = new String(bytes, 0, count, encoding); //接下来12个字节可能是词频或者类似信息 input.skip(12); list.add(word); } } //System.out.println(wordMap.size()); model.setWordMap(wordMap); return model; } finally { in.close(); } } protected final int readUnsignedShort(InputStream in) throws IOException { int ch1 = in.read(); int ch2 = in.read(); if ((ch1 | ch2) < 0) { return Integer.MIN_VALUE; } return (ch2 << 8) + (ch1 << 0); } } //自行将此类提出来为public class class SougouScelMdel { private Map<String, List<String>> wordMap; private String name; private String type; private String description; private String sample; public Map<String, List<String>> getWordMap() { return wordMap; } void setWordMap(Map<String, List<String>> wordMap) { this.wordMap = wordMap; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } public String getSample() { return sample; } public void setSample(String sample) { this.sample = sample; } public String getName() { return name; } public void setName(String name) { this.name = name; } }
package com.qin.parse.scel; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.file.Files; import java.nio.file.LinkOption; import java.nio.file.Paths; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; /** * 解析sogo词库工具类 * * * **/ public class ParseSogo { public static void main(String[] args)throws Exception { sogou("D:\\词库\\dianshang.scel","D:\\词库\\goods1.txt",false); } /** * 读取scel的词库文件 * 生成txt格式的文件 * @param inputPath 输入路径 * @param outputPath 输出路径 * @param isAppend 是否拼接追加词库内容 * true 代表追加,false代表重建 * * **/ private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{ File file=new File(inputPath); if(!isAppend){ if(Files.exists(Paths.get(outputPath),LinkOption.values())){ System.out.println("存储此文件已经删除"); Files.deleteIfExists(Paths.get(outputPath)); } } RandomAccessFile raf=new RandomAccessFile(outputPath, "rw"); int count=0; SougouScelMdel model = new SougouScelReader().read(file); Map<String,List<String>> words = model.getWordMap(); //词<拼音,词> Set<Entry<String,List<String>>> set = words.entrySet(); Iterator<Entry<String,List<String>>> iter = set.iterator(); while(iter.hasNext()){ Entry<String,List<String>> entry = iter.next(); List<String> list = entry.getValue(); int size = list.size(); for(int i = 0; i < size; i++){ String word = list.get(i); //System.out.println(word); raf.seek(raf.getFilePointer()); raf.write((word+"\n").getBytes());//写入txt文件 count++; } } raf.close(); System.out.println("生成txt成功!,总计写入: "+count+" 条数据!"); } }