123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- import datetime
- import time
- import traceback
- import re
- import jieba
- import jieba.posseg as pseg
- from db_helper import MysqlHelper
- from log import Log
# Shared helper instances, created once when this module is imported.
# None of the functions visible in this part of the file reference them.
# NOTE(review): MysqlHelper() presumably sets up a MySQL connection and Log()
# a logger -- confirm in db_helper / log before relying on import-time cost.
mysql_helper = MysqlHelper()
log_ = Log()
def get_stop_words():
    """Load the stop-word list from ``hit_stopwords.txt``.

    The path is relative, so the file is looked up in the current working
    directory. Its UTF-8 content is split on newline characters, so each
    line of the file becomes one entry.

    Returns:
        list[str]: One entry per line. If the file ends with a newline the
        last entry is an empty string.

    Raises:
        OSError: If the file is missing or cannot be read
            (e.g. ``FileNotFoundError``).
    """
    # Read-only mode is sufficient ('r+' additionally demanded write
    # permission), and the context manager guarantees the handle is closed.
    with open('hit_stopwords.txt', 'r', encoding='utf-8') as stop_file:
        return stop_file.read().split("\n")
def filter_emoji(text):
    """Remove emoji and other pictographic symbols from ``text``.

    Every run of characters in the ranges below is deleted:

    * U+2600 - U+2B55 (miscellaneous symbols, dingbats, arrows, ...)
    * U+10000 - U+10FFFF (everything outside the Basic Multilingual Plane;
      this already contains the two emoji ranges listed explicitly)

    Ordinary whitespace is left untouched.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with the matching characters removed.
    """
    # The character class must not contain a literal space: a stray one
    # would strip every ASCII space and glue neighbouring words together.
    emoji_pattern = re.compile(
        u'['
        u'\U0001F300-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u2B55'
        u'\U00010000-\U0010ffff'
        u']+'
    )
    return emoji_pattern.sub('', text)
def word_cut(text, stop_words=None):
    """Segment ``text`` into keyword tokens with jieba.

    Steps:
      1. Strip surrounding whitespace and remove emoji via ``filter_emoji``.
      2. Segment with ``jieba.cut`` (``cut_all=False``).
      3. If more than one segment was produced, run every segment through
         ``jieba.posseg`` and drop words tagged ``m``, ``mq``, ``q`` or ``r``
         (numerals, numeral-quantifiers, quantifiers and pronouns in jieba's
         tag set).
      4. Drop stop words, tokens containing a space, all-digit tokens and
         tokens shorter than two characters.

    Args:
        text (str): Text to segment.
        stop_words: Optional iterable of stop words. When omitted, the list
            is loaded with ``get_stop_words()`` on every call.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    if stop_words is None:
        stop_words = get_stop_words()
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_set = set(stop_words)

    cleaned = filter_emoji(text.strip())
    tokens = list(jieba.cut(cleaned, cut_all=False))

    # Part-of-speech filtering is skipped when the text is a single segment.
    if len(tokens) > 1:
        excluded_flags = ('m', 'mq', 'q', 'r')
        tagged = []
        for token in tokens:
            tagged.extend((word, flag) for word, flag in pseg.cut(token))
        tokens = [word for word, flag in tagged if flag not in excluded_flags]

    # NOTE(review): the filters below are assumed to apply to every input,
    # not only to the multi-segment branch above -- confirm.
    return [
        token for token in tokens
        if token not in stop_set
        and ' ' not in token
        and not token.isdigit()
        and len(token) > 1
    ]
if __name__ == '__main__':
    # Manual smoke run: segment one sample headline. The returned token
    # list is not used.
    sample_headline = '中国参加联合国五常阅兵,出场惊人,让所有人都震惊!'
    word_cut(sample_headline)
|