package org.xm.tokenizer;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Wrapper around Chinese word segmentation; it wraps the call to Xmnlp (xuming's
 * improved version of HanLP).
 * The segmenter is accessed as a singleton so that it is loaded lazily, only when needed.
 *
 * @author xuming
 */
public class Tokenizer {
    private static final Logger logger = LoggerFactory.getLogger(Tokenizer.class);

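    /**
     * Segments a sentence into a list of Word objects, each pairing the token
     * text with its part-of-speech tag.
     */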
    public static List<Word> segment(String sentence) {
        List<Word> results = new ArrayList<>();
        /*// ansj_seg
        List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();//ansj
        results.addAll(termList
                .stream()
                .map(term -> new Word(term.getName(), term.getNature().natureStr))
                .collect(Collectors.toList())
        );*/
        /*// Xmnlp
        List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
        results.addAll(termList
                .stream()
                .map(term -> new Word(term.word, term.getNature().name()))
                .collect(Collectors.toList())
        );*/
        // HanLP
        List<Term> termList = HanLP.segment(sentence);
        results.addAll(termList
                .stream()
                .map(term -> new Word(term.word, term.nature.name()))
                .collect(Collectors.toList())
        );
        return results;
    }

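    /**
     * Segments a text file line by line with HanLP's default segmenter and writes
     * the result next to the input file (or to outputFilePath, if given).
     */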
    public static void fileSegment(String inputFilePath, String outputFilePath) {
        fileSegment(HanLP.newSegment(), inputFilePath, outputFilePath);
    }

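    /**
     * Segments a text file line by line with the given Segment, counts the
     * processed characters and words, and writes the tagged tokens to the output file.
     */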
    public static void fileSegment(Segment segment, String inputFilePath, String outputFilePath) {
        String outPath = inputFilePath.replace(".txt", "") + "-Segment-Result.txt";
        if (outputFilePath != null && outputFilePath.trim().length() > 0) outPath = outputFilePath;
        try (BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
             FileOutputStream fos = new FileOutputStream(new File(outPath))) {
            WordFreqStatistics.statistics(segment, inputFilePath);
            long allCount = 0; // total number of characters segmented
            long lexCount = 0; // total number of words segmented
            long start = System.currentTimeMillis();
            String temp;
            while ((temp = reader.readLine()) != null) {
                List<Term> parse = segment.seg(temp);
                StringBuilder sb = new StringBuilder();
                for (Term term : parse) {
                    sb.append(term).append('\t');
                    if (term.word.trim().length() > 0) {
                        allCount += term.length();
                        lexCount += 1;
                    }
                }
                fos.write(sb.toString().trim().getBytes());
                fos.write("\n".getBytes());
            }
            fos.flush();
            long end = System.currentTimeMillis();
            long elapsed = Math.max(end - start, 1); // avoid division by zero for tiny inputs
            System.out.println("segment result saved to: " + outPath);
            System.out.println(allCount + " characters and " + lexCount + " words in total, took "
                    + elapsed + " ms, processed " + (allCount * 1000 / elapsed) + " characters per second");
        } catch (IOException e) {
            logger.error("IO error: " + e.getLocalizedMessage());
        }
    }
}
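
// Usage sketch (not part of the original source): the TokenizerDemo class name and the
// file paths below are illustrative assumptions, and the example assumes the project's
// Word class provides a usable toString(). As a non-public class it can live in the
// same source file as Tokenizer.
class TokenizerDemo {
    public static void main(String[] args) {
        // In-memory segmentation: each Word pairs a token with its part-of-speech tag.
        List<Word> words = Tokenizer.segment("我爱自然语言处理");
        words.forEach(System.out::println);

        // File segmentation with the default HanLP segmenter (hypothetical paths).
        Tokenizer.fileSegment("data/input.txt", "data/input-Segment-Result.txt");
    }
}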