Tokenizer.java

package org.xm.tokenizer;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

/**
 * A wrapper around Chinese word segmentation, delegating to Xmnlp
 * (xuming's improved fork of HanLP).
 * The segmenter is accessed as a singleton, so it is loaded lazily
 * on first use.
 *
 * @author xuming
 */
public class Tokenizer {
    private static final Logger logger = LoggerFactory.getLogger(Tokenizer.class);

    public static List<Word> segment(String sentence) {
        List<Word> results = new ArrayList<>();
        /*// ansj_seg
        List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();
        results.addAll(termList.stream()
                .map(term -> new Word(term.getName(), term.getNature().natureStr))
                .collect(Collectors.toList()));*/

        /*// Xmnlp
        List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
        results.addAll(termList.stream()
                .map(term -> new Word(term.word, term.getNature().name()))
                .collect(Collectors.toList()));*/

        // HanLP
        List<Term> termList = HanLP.segment(sentence);
        results.addAll(termList.stream()
                .map(term -> new Word(term.word, term.nature.name()))
                .collect(Collectors.toList()));
        return results;
    }
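
    // Usage sketch (assumption: Word exposes getName()/getPos() getters matching
    // the (name, pos) constructor used above; adjust to the actual Word API):
    //   for (Word word : Tokenizer.segment("中文分词测试")) {
    //       System.out.println(word.getName() + "/" + word.getPos());
    //   }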

    public static void fileSegment(String inputFilePath, String outputFilePath) {
        fileSegment(HanLP.newSegment(), inputFilePath, outputFilePath);
    }
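
    // Convenience overload: segments with HanLP's default segment. To customize
    // segmentation (custom dictionaries, name recognition, etc.), build a
    // configured Segment and pass it to the overload below instead.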

    public static void fileSegment(Segment segment, String inputFilePath, String outputFilePath) {
        String outPath = inputFilePath.replace(".txt", "") + "-Segment-Result.txt";
        if (outputFilePath != null && outputFilePath.trim().length() > 0) {
            outPath = outputFilePath;
        }
        // try-with-resources ensures the reader and output stream are closed
        // even when an exception is thrown mid-way
        try (BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
             FileOutputStream fos = new FileOutputStream(outPath)) {
            WordFreqStatistics.statistics(segment, inputFilePath);
            long allCount = 0; // total characters segmented
            long lexCount = 0; // total words segmented
            long start = System.currentTimeMillis();
            String line;
            while ((line = reader.readLine()) != null) {
                List<Term> terms = segment.seg(line);
                StringBuilder sb = new StringBuilder();
                for (Term term : terms) {
                    sb.append(term.toString()).append("\t");
                    if (term.word.trim().length() > 0) {
                        allCount += term.length();
                        lexCount += 1;
                    }
                }
                fos.write(sb.toString().trim().getBytes(StandardCharsets.UTF_8));
                fos.write("\n".getBytes(StandardCharsets.UTF_8));
            }
            fos.flush();
            long elapsed = Math.max(System.currentTimeMillis() - start, 1); // avoid division by zero
            System.out.println("segment result saved to: " + outPath);
            System.out.println(allCount + " characters, " + lexCount + " words, " + elapsed + " ms, "
                    + (allCount * 1000 / elapsed) + " characters/second");
        } catch (IOException e) {
            logger.error("IO error: " + e.getLocalizedMessage());
        }
    }
}
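
A minimal end-to-end sketch of how the two entry points might be called. The class name TokenizerDemo and the file paths are hypothetical, and it assumes the HanLP data package and hanlp.properties are set up on the classpath:

package org.xm.tokenizer;

import com.hankcs.hanlp.HanLP;
import java.util.List;

public class TokenizerDemo {
    public static void main(String[] args) {
        // In-memory segmentation of a single sentence
        List<Word> words = Tokenizer.segment("中文分词的简单示例");
        words.forEach(System.out::println);

        // File-to-file segmentation with the default HanLP segment
        Tokenizer.fileSegment("corpus.txt", "corpus-segmented.txt");

        // File-to-file segmentation with a customized segment
        // (enableCustomDictionary is part of HanLP 1.x's Segment API)
        Tokenizer.fileSegment(HanLP.newSegment().enableCustomDictionary(true),
                "corpus.txt", "corpus-custom.txt");
    }
}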