words_func.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import datetime
  2. import time
  3. import traceback
  4. import re
  5. import jieba
  6. import jieba.posseg as pseg
  7. from db_helper import MysqlHelper
  8. from log import Log
# Module-level helper instances, built at import time from the project-local
# db_helper and log modules. Neither is referenced by the functions visible
# in this part of the file.
# NOTE(review): presumably a MySQL access wrapper and a logger; confirm
# against their definitions.
mysql_helper = MysqlHelper()
log_ = Log()
  11. def get_stop_words():
  12. """获取停用词表"""
  13. stop = open('hit_stopwords.txt', 'r+', encoding='utf-8')
  14. stop_words = stop.read().split("\n")
  15. return stop_words
  16. def filter_emoji(text):
  17. """清除文本中的表情符号"""
  18. # <U+1F300> - <U+1F5FF> # 符号和象形字
  19. # <U+1F600> - <U+1F64F> # 表情符号
  20. # <U+1F680> - <U+1F6FF> # 交通符号和地图符号
  21. # <U+2600 > - <U+2B55> # 其它符号
  22. # \U00010000 -\U0010ffff # 英文emoji表情
  23. p = re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u2B55 \U00010000-\U0010ffff]+')
  24. result = re.sub(p, '', text) # 正则匹配,将表情符合替换为空''
  25. return result
  26. def word_cut(text):
  27. """分词"""
  28. # 获取停用词
  29. stop_words = get_stop_words()
  30. # 清除空格
  31. text = text.strip()
  32. # 清除表情符号
  33. text = filter_emoji(text)
  34. # 精确模式分词
  35. seg_list = jieba.cut(text, cut_all=False)
  36. seg_list = [seg for seg in seg_list]
  37. # print(seg_list)
  38. # 根据词性去除数词、数量词、量词、代词
  39. if len(seg_list) > 1:
  40. words = []
  41. for seg in seg_list:
  42. words += [(word, flag) for word, flag in pseg.cut(seg)]
  43. seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
  44. # print(seg_list)
  45. # 去除停用词
  46. seg_list = [seg for seg in seg_list if seg not in stop_words]
  47. # 去除空格
  48. seg_list = [seg for seg in seg_list if ' ' not in seg]
  49. # 去除纯数字字符串
  50. seg_list = [seg for seg in seg_list if seg.isdigit() is False]
  51. # 去除单个字符
  52. seg_list = [seg for seg in seg_list if len(seg) > 1]
  53. # print(seg_list)
  54. return seg_list
  55. if __name__ == '__main__':
  56. # get_words(8, 100)
  57. # get_words(1, 20)
  58. # get_words(2, 10)
  59. word_cut('中国参加联合国五常阅兵,出场惊人,让所有人都震惊!')