import datetime
import time
import traceback
import re

import jieba
import jieba.posseg as pseg

from db_helper import MysqlHelper
from log import Log

mysql_helper = MysqlHelper()
log_ = Log()

# Emoji / pictograph ranges, compiled once at import time instead of on
# every filter_emoji() call:
# - \U0001F300-\U0001F64F  symbols & pictographs, emoticons
# - \U0001F680-\U0001F6FF  transport & map symbols
# - \u2600-\u2B55          miscellaneous symbols
# - \U00010000-\U0010ffff  supplementary planes (covers remaining emoji)
_EMOJI_PATTERN = re.compile(u'['u'\U0001F300-\U0001F64F'
                            u'\U0001F680-\U0001F6FF'
                            u'\u2600-\u2B55 \U00010000-\U0010ffff]+')


def get_stop_words():
    """Load the stop-word list from hit_stopwords.txt.

    Returns:
        list[str]: one stop word per line of the file (may include an
        empty trailing entry if the file ends with a newline).
    """
    # 'r' instead of 'r+': read-only access is all that's needed, and the
    # context manager guarantees the handle is closed (the original leaked it).
    with open('hit_stopwords.txt', 'r', encoding='utf-8') as stop:
        stop_words = stop.read().split("\n")
    return stop_words


def filter_emoji(text):
    """Strip emoji and pictographic symbols from *text*.

    Args:
        text (str): input text.

    Returns:
        str: *text* with every character in the emoji ranges removed.
    """
    # Replace each matched emoji run with the empty string.
    return _EMOJI_PATTERN.sub('', text)


def word_cut(text):
    """Tokenize *text* and filter the tokens for keyword extraction.

    Pipeline:
        1. strip surrounding whitespace and remove emoji;
        2. segment with jieba (accurate mode);
        3. drop tokens whose part-of-speech is numeral (m), numeral-classifier
           (mq), classifier (q) or pronoun (r);
        4. drop stop words, tokens containing spaces, pure-digit tokens and
           single-character tokens.

    Args:
        text (str): raw input text.

    Returns:
        list[str]: filtered word tokens.
    """
    # set membership is O(1) per token vs O(n) against the raw list;
    # get_stop_words() still returns a list for backward compatibility.
    stop_words = set(get_stop_words())

    text = text.strip()
    text = filter_emoji(text)

    # Accurate-mode segmentation.
    seg_list = list(jieba.cut(text, cut_all=False))
    # print(seg_list)

    # Remove numerals, numeral-classifiers, classifiers and pronouns by
    # part-of-speech tag (only worthwhile when there is more than one token).
    if len(seg_list) > 1:
        words = []
        for seg in seg_list:
            words += [(word, flag) for word, flag in pseg.cut(seg)]
        seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
    # print(seg_list)

    # Remove stop words.
    seg_list = [seg for seg in seg_list if seg not in stop_words]
    # Remove tokens containing spaces.
    seg_list = [seg for seg in seg_list if ' ' not in seg]
    # Remove pure-digit tokens.
    seg_list = [seg for seg in seg_list if not seg.isdigit()]
    # Remove single-character tokens.
    seg_list = [seg for seg in seg_list if len(seg) > 1]
    # print(seg_list)
    return seg_list


if __name__ == '__main__':
    # get_words(8, 100)
    # get_words(1, 20)
    # get_words(2, 10)
    word_cut('中国参加联合国五常阅兵,出场惊人,让所有人都震惊!')