words_func.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. import datetime
  2. import time
  3. import traceback
  4. import re
  5. import jieba
  6. import jieba.posseg as pseg
  7. from db_helper import MysqlHelper
  8. from log import Log
  9. mysql_helper = MysqlHelper()
  10. log_ = Log()
  11. def get_words(page_num, page_size):
  12. """
  13. 分页获取所有热点词
  14. :param page_num: 页码
  15. :param page_size: 每页请求条目数
  16. :return: words
  17. """
  18. try:
  19. sql = f"select id, word from word.hot_word order by id limit {(page_num-1)*page_size}, {page_size};"
  20. data = mysql_helper.get_data(sql=sql)
  21. if data is None:
  22. return None
  23. words = []
  24. for id_, word in data:
  25. words.append({'id': id_, 'word': word})
  26. return words
  27. except Exception as e:
  28. log_.error(traceback.format_exc())
  29. return None
  30. def update_wechat_score_data(data):
  31. """
  32. 根据爬取到的微信指数数据更新数据库
  33. :param data:
  34. :return:
  35. """
  36. if data is None or len(data) == 0:
  37. log_.info(f"无需要更新的数据!")
  38. return
  39. # 爬取数据解析
  40. log_.info(f"data count = {len(data)}")
  41. wechat_score_data_list = []
  42. for item in data:
  43. if item is None:
  44. continue
  45. word_id = item.get('id')
  46. word = item.get('word')
  47. wechat_scores = item.get('wechatScores')
  48. if wechat_scores is None or len(wechat_scores) == 0:
  49. continue
  50. for score_data in wechat_scores:
  51. score = score_data.get('score')
  52. score_date = score_data.get('scoreDate')
  53. wechat_score_data_list.append({'word_id': word_id, 'word': word, 'score': score, 'score_date': score_date})
  54. log_.info(f"wechat_score_data_list count = {len(wechat_score_data_list)}")
  55. # update or insert 数据区分
  56. update_data = []
  57. insert_data = []
  58. for wechat_score_data in wechat_score_data_list:
  59. select_sql = f"SELECT id FROM word.word_wechat_score " \
  60. f"WHERE word_id = {wechat_score_data['word_id']} " \
  61. f"AND score_date = '{wechat_score_data['score_date']}';"
  62. res = mysql_helper.get_data(sql=select_sql)
  63. if res is None:
  64. continue
  65. if len(res) == 0:
  66. insert_data.append(wechat_score_data)
  67. else:
  68. id_ = res[0][0]
  69. wechat_score_data['id'] = id_
  70. update_data.append(wechat_score_data)
  71. log_.info(f"update_data = {len(update_data)}")
  72. log_.info(f"insert_data = {len(insert_data)}")
  73. # 批量插入
  74. if len(insert_data) > 0:
  75. for i in range(len(insert_data) // 100 + 1):
  76. log_.info(f"insert i = {i}")
  77. insert_temp_data = insert_data[i * 100:(i + 1) * 100]
  78. if len(insert_temp_data) > 0:
  79. insert_sql_values = ', '.join([f"({item['word_id']}, {item['score']}, '{item['score_date']}')"
  80. for item in insert_temp_data])
  81. insert_start_time = time.time()
  82. insert_sql = f"insert into word.word_wechat_score (word_id, wechat_score, score_date) " \
  83. f"values {insert_sql_values};"
  84. mysql_helper.add_data(sql=insert_sql)
  85. log_.info(f"insert executeTime: {(time.time() - insert_start_time) * 1000}")
  86. log_.info(f"insert wechat score data finished! insert count = {len(insert_data)}")
  87. # 批量更新
  88. if len(update_data) > 0:
  89. for i in range(len(update_data) // 100 + 1):
  90. log_.info(f"update i = {i}")
  91. update_temp_data = update_data[i * 100:(i + 1) * 100]
  92. if len(update_temp_data) > 0:
  93. update_id = [item['id'] for item in update_temp_data]
  94. update_sql_values = ' '.join([f"when {item['id']} then {item['score']}" for item in update_temp_data])
  95. update_start_time = time.time()
  96. if len(update_id) > 1:
  97. update_sql = f"update word.word_wechat_score set wechat_score = " \
  98. f"case id {update_sql_values} end where id in {tuple(update_id)};"
  99. else:
  100. update_sql = f"update word.word_wechat_score set wechat_score = " \
  101. f"case id {update_sql_values} end where id in ({update_id[0]});"
  102. mysql_helper.add_data(sql=update_sql)
  103. log_.info(f"update executeTime: {(time.time() - update_start_time) * 1000}")
  104. log_.info(f"update wechat score data finished! update count = {len(update_data)}")
  105. def get_stop_words():
  106. """获取停用词表"""
  107. stop = open('hit_stopwords.txt', 'r+', encoding='utf-8')
  108. stop_words = stop.read().split("\n")
  109. return stop_words
  110. def filter_emoji(text):
  111. """清除文本中的表情符号"""
  112. # <U+1F300> - <U+1F5FF> # 符号和象形字
  113. # <U+1F600> - <U+1F64F> # 表情符号
  114. # <U+1F680> - <U+1F6FF> # 交通符号和地图符号
  115. # <U+2600 > - <U+2B55> # 其它符号
  116. # \U00010000 -\U0010ffff # 英文emoji表情
  117. p = re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u2B55 \U00010000-\U0010ffff]+')
  118. result = re.sub(p, '', text) # 正则匹配,将表情符合替换为空''
  119. return result
  120. def word_cut(text):
  121. """分词"""
  122. # 获取停用词
  123. stop_words = get_stop_words()
  124. # 清除空格
  125. text = text.strip()
  126. # 清除表情符号
  127. text = filter_emoji(text)
  128. # 精确模式分词
  129. seg_list = jieba.cut(text, cut_all=False)
  130. seg_list = [seg for seg in seg_list]
  131. # print(seg_list)
  132. # 根据词性去除数词、数量词、量词、代词
  133. if len(seg_list) > 1:
  134. words = []
  135. for seg in seg_list:
  136. words += [(word, flag) for word, flag in pseg.cut(seg)]
  137. seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
  138. # print(seg_list)
  139. # 去除停用词
  140. seg_list = [seg for seg in seg_list if seg not in stop_words]
  141. # 去除空格
  142. seg_list = [seg for seg in seg_list if ' ' not in seg]
  143. # 去除纯数字字符串
  144. seg_list = [seg for seg in seg_list if seg.isdigit() is False]
  145. # 去除单个字符
  146. seg_list = [seg for seg in seg_list if len(seg) > 1]
  147. # print(seg_list)
  148. return seg_list
  149. def get_today_words(page_num, page_size):
  150. """
  151. 分页获取今日更新的所有热点词
  152. :param page_num: 页码
  153. :param page_size: 每页请求条目数
  154. :return: words
  155. """
  156. try:
  157. dt = datetime.datetime.today().strftime('%Y-%m-%d')
  158. sql = f"select id, word from word.hot_word where update_time > cast('{dt}' as datetime) " \
  159. f"order by id limit {(page_num-1)*page_size}, {page_size};"
  160. data = mysql_helper.get_data(sql=sql)
  161. if data is None:
  162. return None
  163. words = []
  164. for id_, word in data:
  165. words.append({'id': id_, 'word': word})
  166. return words
  167. except Exception as e:
  168. log_.error(traceback.format_exc())
  169. return None
  170. if __name__ == '__main__':
  171. get_words(8, 100)
  172. # get_words(1, 20)
  173. # get_words(2, 10)