import datetime
import time
import traceback
import re

import jieba
import jieba.posseg as pseg

from db_helper import MysqlHelper
from log import Log

mysql_helper = MysqlHelper()
log_ = Log()
def get_stop_words():
    """Load the stopword list from hit_stopwords.txt (one word per line)."""
    with open('hit_stopwords.txt', 'r', encoding='utf-8') as stop:
        stop_words = stop.read().splitlines()
    return stop_words
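
# A defensive variant (a sketch, not in the original): return an empty list
# when the stopword file is absent, so the caller can still run.
def get_stop_words_safe(path='hit_stopwords.txt'):
    """Hypothetical helper: load stopwords, tolerating a missing file."""
    try:
        with open(path, encoding='utf-8') as f:
            return f.read().splitlines()
    except FileNotFoundError:
        return []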
def filter_emoji(text):
    """Strip emoji and pictographic symbols from text."""
    # U+1F300 - U+1F5FF  symbols and pictographs
    # U+1F600 - U+1F64F  emoticons
    # U+1F680 - U+1F6FF  transport and map symbols
    # U+2600  - U+2B55   miscellaneous symbols
    # U+10000 - U+10FFFF other supplementary-plane characters
    p = re.compile(
        u'[\U0001F300-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u2B55'
        u'\U00010000-\U0010FFFF]+'
    )
    return p.sub('', text)  # replace each matched emoji run with ''
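
# Illustrative check (hypothetical input, not from the original script):
# '😀' is U+1F600 and '🚀' is U+1F680, so both fall inside the ranges above
# and filter_emoji('真棒😀🚀') returns '真棒'.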
def word_cut(text):
    """Segment text into a cleaned list of tokens."""
    # Load the stopword list
    stop_words = get_stop_words()
    # Trim surrounding whitespace
    text = text.strip()
    # Remove emoji
    text = filter_emoji(text)
    # Segment with jieba in accurate (non-full) mode
    seg_list = list(jieba.cut(text, cut_all=False))
    # Use part-of-speech tags to drop numerals (m), numeral-classifier
    # compounds (mq), measure words (q) and pronouns (r)
    if len(seg_list) > 1:
        words = []
        for seg in seg_list:
            words += [(word, flag) for word, flag in pseg.cut(seg)]
        seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
    # Drop stopwords
    seg_list = [seg for seg in seg_list if seg not in stop_words]
    # Drop tokens containing spaces
    seg_list = [seg for seg in seg_list if ' ' not in seg]
    # Drop purely numeric tokens
    seg_list = [seg for seg in seg_list if not seg.isdigit()]
    # Drop single-character tokens
    seg_list = [seg for seg in seg_list if len(seg) > 1]
    return seg_list
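
# A possible next step (a sketch only, not part of the original script): rank
# the cleaned tokens by frequency to surface candidate keywords. `top_words`
# is a hypothetical helper built on word_cut.
from collections import Counter

def top_words(text, n=5):
    """Return the n most frequent tokens produced by word_cut."""
    return Counter(word_cut(text)).most_common(n)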
if __name__ == '__main__':
    # get_words(8, 100)
    # get_words(1, 20)
    # get_words(2, 10)
    print(word_cut('中国参加联合国五常阅兵,出场惊人,让所有人都震惊!'))
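    # Expected shape of the demo output (exact tokens depend on the jieba
    # dictionary version): a list such as ['中国', '参加', '联合国', ...],
    # with pronouns, numerals, stopwords, digits and single characters
    # already removed by the filters above.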