2 år sedan · 81f8db94d1
--- a/app.py
+++ b/app.py
@@ -7,7 +7,7 @@ monkey.patch_all()
 
				 from flask import Flask, request
			
 
				 from log import Log
			
 
				 from config import set_config
			
 
				-from words_func import get_words, update_wechat_score_data
			
 
				+from words_func import get_words, update_wechat_score_data, get_today_words
			
 
				 from update_common_words import add_words2mysql
			
 
				 
			
 
				 app = Flask(__name__)
			
@@ -89,5 +89,29 @@ def update_words():
 
				         return json.dumps(result)
			
 
				 
			
 
				 
			
 
				+# Return every hot word updated today in the hot-word table (paged).
				+@app.route('/hot/word/getTodayWords', methods=['GET', 'POST'])
				+def get_all_words():
				+    """Handle /hot/word/getTodayWords.
				+
				+    The request body is an optional JSON object with ``pageNum``
				+    (default 1) and ``pageSize`` (default 100).
				+
				+    :return: JSON string; ``{'code': 200, 'message': 'success',
				+        'data': {'words': [...]}}`` on success and
				+        ``{'code': -1, 'message': 'fail'}`` on any error
				+    """
				+    try:
				+        start_time = time.time()
				+        # A GET request normally has no body; use the defaults instead of
				+        # failing inside json.loads on an empty payload.
				+        raw_body = request.get_data()
				+        request_data = json.loads(raw_body) if raw_body else {}
				+        # The paging values end up inside a SQL LIMIT clause, so force
				+        # them to positive integers before passing them on.
				+        page_num = int(request_data.get('pageNum', 1))
				+        page_size = int(request_data.get('pageSize', 100))
				+        if page_num < 1 or page_size < 1:
				+            raise ValueError(f'invalid paging: pageNum={page_num}, pageSize={page_size}')
				+        words = get_today_words(page_num=page_num, page_size=page_size)
				+        if words is None:
				+            # get_today_words returns None when the query failed; that
				+            # must not be reported to the caller as a success.
				+            raise RuntimeError('get_today_words returned no result')
				+        result = {'code': 200, 'message': 'success', 'data': {'words': words}}
				+        log_message = {
				+            'requestUri': '/hot/word/getTodayWords',
				+            'logTimestamp': int(time.time() * 1000),
				+            'result': result,
				+            'executeTime': (time.time() - start_time) * 1000
				+        }
				+        log_.info(log_message)
				+        return json.dumps(result)
				+    except Exception:
				+        # Top-level request boundary: log the traceback, answer with 'fail'.
				+        log_.error(traceback.format_exc())
				+        result = {'code': -1, 'message': 'fail'}
				+        return json.dumps(result)
			
 
				+
			
 
				+
			
 
				 if __name__ == '__main__':
			
 
				     app.run()
			
--- a/config.py
+++ b/config.py
@@ -27,13 +27,29 @@ class BaseConfig(object):
 
				             'sheet_id': 'b74YMQ'
			
 
				         },
			
 
				     }
			
 
				+
			
 
				     # mysql数据表
			
 
				     MYSQL_TABLES = {
			
 
				         '热点词库': 'hot_word',
			
 
				-        '热词指数': 'word_wechat_score'
			
 
				+        '热词指数': 'word_wechat_score',
			
 
				+        '分词结果': 'cut_words_result'
			
 
				+    }
			
 
				+
			
 
				+    # ODPS服务配置
			
 
				+    ODPS_CONFIG = {
			
 
				+        'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
			
 
				+        'ACCESSID': 'LTAIWYUujJAm7CbH',
			
 
				+        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
			
 
				+    }
			
 
				+
			
 
				+    # 站内外标题数据表
			
 
				+    TITLE_DATA = {
			
 
				+        'project': '',
			
 
				+        'table': ''
			
 
				     }
			
 
				 
			
 
				 
			
 
				+
			
 
				 class DevelopmentConfig(BaseConfig):
			
 
				     """开发环境配置"""
			
 
				     # 报警内容 环境区分
			
--- a/cut_words.py
+++ b/cut_words.py
@@ -0,0 +1,176 @@
 
				+import datetime
			
 
				+import traceback
			
 
				+import pandas as pd
			
 
				+from odps import ODPS
			
 
				+from threading import Timer
			
 
				+from config import set_config
			
 
				+from log import Log
			
 
				+from utils.utils import get_data_from_odps
			
 
				+from words_func import word_cut
			
 
				+from db_helper import MysqlHelper
			
 
				+config_ = set_config()
			
 
				+log_ = Log()
			
 
				+mysql_helper = MysqlHelper()
			
 
				+features = ['title', 'source']
			
 
				+
			
 
				+
			
 
				+def check_table_partition_exits(date, project, table, connect_timeout=3000, read_timeout=500000,
				+                                pool_maxsize=1000, pool_connections=1000):
				+    """
				+    Tell whether the table has the partition ``dt=<date>``.
				+
				+    :param date: partition value, type-string (documented as '%Y%m%d')
				+    :param project: ODPS project name, type-string
				+    :param table: table name, type-string
				+    :param connect_timeout: connection timeout passed to the ODPS client
				+    :param read_timeout: read timeout passed to the ODPS client
				+    :param pool_maxsize: passed through to the ODPS client
				+    :param pool_connections: passed through to the ODPS client
				+    :return: whatever ``exist_partition`` reports for that partition spec
				+    """
				+    odps_settings = config_.ODPS_CONFIG
				+    client = ODPS(
				+        access_id=odps_settings['ACCESSID'],
				+        secret_access_key=odps_settings['ACCESSKEY'],
				+        project=project,
				+        endpoint=odps_settings['ENDPOINT'],
				+        connect_timeout=connect_timeout,
				+        read_timeout=read_timeout,
				+        pool_maxsize=pool_maxsize,
				+        pool_connections=pool_connections
				+    )
				+    partition_spec = f'dt={date}'
				+    target_table = client.get_table(name=table)
				+    return target_table.exist_partition(partition_spec=partition_spec)
			
 
				+
			
 
				+
			
 
				+def data_check(project, table, now_date):
				+    """
				+    Check whether the day's data is ready in ODPS.
				+
				+    :param project: ODPS project name, type-string
				+    :param table: table name, type-string
				+    :param now_date: datetime the ``dt`` partition value is derived from
				+    :return: number of rows in the partition; 0 when the partition does
				+        not exist or the check itself failed
				+    """
				+    odps = ODPS(
				+        access_id=config_.ODPS_CONFIG['ACCESSID'],
				+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
				+        project=project,
				+        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
				+        connect_timeout=3000,
				+        read_timeout=500000,
				+        pool_maxsize=1000,
				+        pool_connections=1000
				+    )
				+
				+    try:
				+        # NOTE(review): this used '%Y%m%d%H', while get_title_data reads the
				+        # '%Y%m%d' partition; the check now looks at the partition that is
				+        # actually read afterwards. Confirm the table is partitioned by day.
				+        dt = now_date.strftime('%Y%m%d')
				+        if not check_table_partition_exits(date=dt, project=project, table=table):
				+            return 0
				+        # Count on the server instead of selecting every row, and quote the
				+        # partition value so it is compared as a string.
				+        sql = f"select count(1) from {project}.{table} where dt = '{dt}'"
				+        with odps.execute_sql(sql=sql).open_reader() as reader:
				+            for record in reader:
				+                return int(record[0])
				+        return 0
				+    except Exception:
				+        # Still best-effort (the caller retries on 0), but leave a trace so
				+        # a permanently failing check is visible in the log.
				+        log_.error(f"data check failed, traceback: {traceback.format_exc()}")
				+        return 0
			
 
				+
			
 
				+
			
 
				+def get_title_data(project, table, now_date):
				+    """Fetch the on-site / off-site video title rows of the day as a DataFrame.
				+
				+    Only the columns named in the module-level ``features`` list are kept.
				+    """
				+    partition_day = datetime.datetime.strftime(now_date, '%Y%m%d')
				+    rows = get_data_from_odps(date=partition_day, project=project, table=table)
				+    picked = [{name: row[name] for name in features} for row in rows]
				+    return pd.DataFrame(picked)
			
 
				+
			
 
				+
			
 
				+def _sql_quote(value):
				+    """Return value as a single-quoted SQL string literal, escaping backslashes and quotes."""
				+    escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
				+    return f"'{escaped}'"
				+
				+
				+def update_cut_words_result(text, source, words_list):
				+    """
				+    Store the segmentation result of one text (insert, or update if present).
				+
				+    :param text: original text, type-string
				+    :param source: origin of the text, type-int
				+    :param words_list: segmentation result, type-list
				+    :return: None
				+    """
				+    # The words are stored as one comma-separated string.
				+    words = ','.join(words_list)
				+    # Titles are free text and may contain quotes, so every string value is
				+    # escaped before it is placed into the statement.
				+    text_sql = _sql_quote(text)
				+    words_sql = _sql_quote(words)
				+    source_sql = int(source)
				+    # Is the original text already stored?
				+    select_sql = f"SELECT id FROM word.cut_words_result WHERE text = {text_sql};"
				+    res = mysql_helper.get_data(sql=select_sql)
				+    if not res:
				+        # Not stored yet: insert. The value list needs parentheses.
				+        insert_sql = (
				+            f"insert into word.cut_words_result (text, words, source) "
				+            f"values ({text_sql}, {words_sql}, {source_sql});"
				+        )
				+        mysql_helper.add_data(sql=insert_sql)
				+    else:
				+        # Already stored: update. The timestamp must be a quoted literal.
				+        now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
				+        update_sql = (
				+            f"update word.cut_words_result set words = {words_sql}, source = {source_sql}, "
				+            f"update_time = '{now_str}' where id = {res[0][0]};"
				+        )
				+        mysql_helper.add_data(sql=update_sql)
				+
				+
				+def update_hot_word(words_list, source):
				+    """
				+    Store words in the hot-word table (insert, or update if present).
				+
				+    :param words_list: list of words, type-list
				+    :param source: origin of the words, type-int
				+    :return: None
				+    """
				+    for word in words_list:
				+        if not word:
				+            continue
				+        word_sql = _sql_quote(word)
				+        # Is the word already stored?
				+        select_sql = f"SELECT id, source FROM word.hot_word WHERE word = {word_sql};"
				+        res = mysql_helper.get_data(sql=select_sql)
				+        if not res:
				+            # Not stored yet: insert. The value list needs parentheses.
				+            insert_sql = f"insert into word.hot_word (word, source) values ({word_sql}, {int(source)});"
				+            mysql_helper.add_data(sql=insert_sql)
				+            continue
				+        # Already stored: update. A word seen with a different source gets 3
				+        # (presumably "both sources" - confirm). The value is kept in a
				+        # per-word local so it cannot leak into the following words.
				+        word_id, stored_source = res[0][0], res[0][1]
				+        new_source = 3 if source != stored_source else source
				+        now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
				+        update_sql = (
				+            f"update word.hot_word set source = {int(new_source)}, "
				+            f"update_time = '{now_str}' where id = {word_id};"
				+        )
				+        mysql_helper.add_data(sql=update_sql)
			
 
				+
			
 
				+
			
 
				+def data_update(project, table, now_date):
				+    """
				+    Segment every title of the day and store the results.
				+
				+    :param project: ODPS project name, type-string
				+    :param table: table name, type-string
				+    :param now_date: datetime selecting the partition to process
				+    :return: None
				+    """
				+    # Load the on-site / off-site video titles.
				+    df = get_title_data(project=project, table=table, now_date=now_date)
				+    if df.empty:
				+        # No rows means no 'source' / 'title' columns either.
				+        return
				+    # Normalise the source column in place. The frame only has the
				+    # 'title' and 'source' columns, and it must stay a DataFrame because
				+    # it is filtered by column below.
				+    df['source'] = df['source'].astype(int)
				+    for source in (1, 2):
				+        title_list = df.loc[df['source'] == source, 'title'].to_list()
				+        for title in title_list:
				+            # Skip unusable titles; one bad row must not end the whole run.
				+            if not isinstance(title, str) or not title:
				+                continue
				+            # 1. segment the title
				+            words_list = word_cut(text=title)
				+            # 2. store the segmentation result
				+            update_cut_words_result(text=title, source=source, words_list=words_list)
				+            # 3. store the individual words
				+            update_hot_word(words_list=words_list, source=source)
			
 
				+
			
 
				+
			
 
				+def timer_check():
				+    """Run the daily update once the data is ready, otherwise retry in 60 seconds."""
				+    try:
				+        title_source = config_.TITLE_DATA
				+        project = title_source['project']
				+        table = title_source['table']
				+        now_date = datetime.datetime.today()
				+        log_.info(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
				+        # Is today's data available yet?
				+        data_count = data_check(project=project, table=table, now_date=now_date)
				+        if not data_count > 0:
				+            # Not ready: look again in one minute.
				+            Timer(60, timer_check).start()
				+            return
				+        log_.info(f'data_count = {data_count}')
				+        # Ready: run the update.
				+        data_update(project=project, table=table, now_date=now_date)
				+        log_.info("data update end!")
				+    except Exception as e:
				+        log_.error(f"数据更新失败, exception: {e}, traceback: {traceback.format_exc()}")
				+
				+
				+if __name__ == '__main__':
				+    timer_check()
			
--- a/hit_stopwords.txt
+++ b/hit_stopwords.txt
@@ -0,0 +1,849 @@
 
				+———
			
 
				+》），
			
 
				+）÷（１－
			
 
				+”，
			
 
				+）、
			
 
				+＝（
			
 
				+:
			
 
				+→
			
 
				+℃ 
			
 
				+&
			
 
				+*
			
 
				+一一
			
 
				+~~~~
			
 
				+’
			
 
				+. 
			
 
				+『
			
 
				+.一
			
 
				+./
			
 
				+-- 
			
 
				+』
			
 
				+＝″
			
 
				+【
			
 
				+［＊］
			
 
				+｝＞
			
 
				+［⑤］］
			
 
				+［①Ｄ］
			
 
				+ｃ］
			
 
				+ｎｇ昉
			
 
				+＊
			
 
				+//
			
 
				+［
			
 
				+］
			
 
				+［②ｅ］
			
 
				+［②ｇ］
			
 
				+＝｛
			
 
				+}
			
 
				+，也 
			
 
				+‘
			
 
				+Ａ
			
 
				+［①⑥］
			
 
				+［②Ｂ］ 
			
 
				+［①ａ］
			
 
				+［④ａ］
			
 
				+［①③］
			
 
				+［③ｈ］
			
 
				+③］
			
 
				+１． 
			
 
				+－－ 
			
 
				+［②ｂ］
			
 
				+’‘ 
			
 
				+××× 
			
 
				+［①⑧］
			
 
				+０：２ 
			
 
				+＝［
			
 
				+［⑤ｂ］
			
 
				+［②ｃ］ 
			
 
				+［④ｂ］
			
 
				+［②③］
			
 
				+［③ａ］
			
 
				+［④ｃ］
			
 
				+［①⑤］
			
 
				+［①⑦］
			
 
				+［①ｇ］
			
 
				+∈［ 
			
 
				+［①⑨］
			
 
				+［①④］
			
 
				+［①ｃ］
			
 
				+［②ｆ］
			
 
				+［②⑧］
			
 
				+［②①］
			
 
				+［①Ｃ］
			
 
				+［③ｃ］
			
 
				+［③ｇ］
			
 
				+［②⑤］
			
 
				+［②②］
			
 
				+一.
			
 
				+［①ｈ］
			
 
				+.数
			
 
				+［］
			
 
				+［①Ｂ］
			
 
				+数/
			
 
				+［①ｉ］
			
 
				+［③ｅ］
			
 
				+［①①］
			
 
				+［④ｄ］
			
 
				+［④ｅ］
			
 
				+［③ｂ］
			
 
				+［⑤ａ］
			
 
				+［①Ａ］
			
 
				+［②⑧］
			
 
				+［②⑦］
			
 
				+［①ｄ］
			
 
				+［②ｊ］
			
 
				+〕〔
			
 
				+］［
			
 
				+://
			
 
				+′∈
			
 
				+［②④
			
 
				+［⑤ｅ］
			
 
				+１２％
			
 
				+ｂ］
			
 
				+...
			
 
				+...................
			
 
				+…………………………………………………③
			
 
				+ＺＸＦＩＴＬ
			
 
				+［③Ｆ］
			
 
				+」
			
 
				+［①ｏ］
			
 
				+］∧′＝［ 
			
 
				+∪φ∈
			
 
				+′｜
			
 
				+｛－
			
 
				+②ｃ
			
 
				+｝
			
 
				+［③①］
			
 
				+Ｒ．Ｌ．
			
 
				+［①Ｅ］
			
 
				+Ψ
			
 
				+－［＊］－
			
 
				+↑
			
 
				+.日 
			
 
				+［②ｄ］
			
 
				+［②
			
 
				+［②⑦］
			
 
				+［②②］
			
 
				+［③ｅ］
			
 
				+［①ｉ］
			
 
				+［①Ｂ］
			
 
				+［①ｈ］
			
 
				+［①ｄ］
			
 
				+［①ｇ］
			
 
				+［①②］
			
 
				+［②ａ］
			
 
				+ｆ］
			
 
				+［⑩］
			
 
				+ａ］
			
 
				+［①ｅ］
			
 
				+［②ｈ］
			
 
				+［②⑥］
			
 
				+［③ｄ］
			
 
				+［②⑩］
			
 
				+ｅ］
			
 
				+〉
			
 
				+】
			
 
				+元／吨
			
 
				+［②⑩］
			
 
				+２．３％
			
 
				+５：０  
			
 
				+［①］
			
 
				+::
			
 
				+［②］
			
 
				+［③］
			
 
				+［④］
			
 
				+［⑤］
			
 
				+［⑥］
			
 
				+［⑦］
			
 
				+［⑧］
			
 
				+［⑨］ 
			
 
				+……
			
 
				+——
			
 
				+?
			
 
				+、
			
 
				+。
			
 
				+“
			
 
				+”
			
 
				+《
			
 
				+》
			
 
				+！
			
 
				+，
			
 
				+：
			
 
				+；
			
 
				+？
			
 
				+．
			
 
				+,
			
 
				+．
			
 
				+'
			
 
				+? 
			
 
				+·
			
 
				+———
			
 
				+──
			
 
				+? 
			
 
				+—
			
 
				+<
			
 
				+>
			
 
				+（
			
 
				+）
			
 
				+〔
			
 
				+〕
			
 
				+[
			
 
				+]
			
 
				+(
			
 
				+)
			
 
				+-
			
 
				++
			
 
				+～
			
 
				+×
			
 
				+／
			
 
				+/
			
 
				+①
			
 
				+②
			
 
				+③
			
 
				+④
			
 
				+⑤
			
 
				+⑥
			
 
				+⑦
			
 
				+⑧
			
 
				+⑨
			
 
				+⑩
			
 
				+Ⅲ
			
 
				+В
			
 
				+"
			
 
				+;
			
 
				+#
			
 
				+@
			
 
				+γ
			
 
				+μ
			
 
				+φ
			
 
				+φ．
			
 
				+× 
			
 
				+Δ
			
 
				+■
			
 
				+▲
			
 
				+sub
			
 
				+exp 
			
 
				+sup
			
 
				+sub
			
 
				+Lex 
			
 
				+＃
			
 
				+％
			
 
				+＆
			
 
				+＇
			
 
				+＋
			
 
				+＋ξ
			
 
				+＋＋
			
 
				+－
			
 
				+－β
			
 
				+＜
			
 
				+＜±
			
 
				+＜Δ
			
 
				+＜λ
			
 
				+＜φ
			
 
				+＜＜
			
 
				+=
			
 
				+＝
			
 
				+＝☆
			
 
				+＝－
			
 
				+＞
			
 
				+＞λ
			
 
				+＿
			
 
				+～±
			
 
				+～＋
			
 
				+［⑤ｆ］
			
 
				+［⑤ｄ］
			
 
				+［②ｉ］
			
 
				+≈ 
			
 
				+［②Ｇ］
			
 
				+［①ｆ］
			
 
				+ＬＩ
			
 
				+㈧ 
			
 
				+［－
			
 
				+......
			
 
				+〉
			
 
				+［③⑩］
			
 
				+第二
			
 
				+一番
			
 
				+一直
			
 
				+一个
			
 
				+一些
			
 
				+许多
			
 
				+种
			
 
				+有的是
			
 
				+也就是说
			
 
				+末##末
			
 
				+啊
			
 
				+阿
			
 
				+哎
			
 
				+哎呀
			
 
				+哎哟
			
 
				+唉
			
 
				+俺
			
 
				+俺们
			
 
				+按
			
 
				+按照
			
 
				+吧
			
 
				+吧哒
			
 
				+把
			
 
				+罢了
			
 
				+被
			
 
				+本
			
 
				+本着
			
 
				+比
			
 
				+比方
			
 
				+比如
			
 
				+鄙人
			
 
				+彼
			
 
				+彼此
			
 
				+边
			
 
				+别
			
 
				+别的
			
 
				+别说
			
 
				+并
			
 
				+并且
			
 
				+不比
			
 
				+不成
			
 
				+不单
			
 
				+不但
			
 
				+不独
			
 
				+不管
			
 
				+不光
			
 
				+不过
			
 
				+不仅
			
 
				+不拘
			
 
				+不论
			
 
				+不怕
			
 
				+不然
			
 
				+不如
			
 
				+不特
			
 
				+不惟
			
 
				+不问
			
 
				+不只
			
 
				+朝
			
 
				+朝着
			
 
				+趁
			
 
				+趁着
			
 
				+乘
			
 
				+冲
			
 
				+除
			
 
				+除此之外
			
 
				+除非
			
 
				+除了
			
 
				+此
			
 
				+此间
			
 
				+此外
			
 
				+从
			
 
				+从而
			
 
				+打
			
 
				+待
			
 
				+但
			
 
				+但是
			
 
				+当
			
 
				+当着
			
 
				+到
			
 
				+得
			
 
				+的
			
 
				+的话
			
 
				+等
			
 
				+等等
			
 
				+地
			
 
				+第
			
 
				+叮咚
			
 
				+对
			
 
				+对于
			
 
				+多
			
 
				+多少
			
 
				+而
			
 
				+而况
			
 
				+而且
			
 
				+而是
			
 
				+而外
			
 
				+而言
			
 
				+而已
			
 
				+尔后
			
 
				+反过来
			
 
				+反过来说
			
 
				+反之
			
 
				+非但
			
 
				+非徒
			
 
				+否则
			
 
				+嘎
			
 
				+嘎登
			
 
				+该
			
 
				+赶
			
 
				+个
			
 
				+各
			
 
				+各个
			
 
				+各位
			
 
				+各种
			
 
				+各自
			
 
				+给
			
 
				+根据
			
 
				+跟
			
 
				+故
			
 
				+故此
			
 
				+固然
			
 
				+关于
			
 
				+管
			
 
				+归
			
 
				+果然
			
 
				+果真
			
 
				+过
			
 
				+哈
			
 
				+哈哈
			
 
				+呵
			
 
				+和
			
 
				+何
			
 
				+何处
			
 
				+何况
			
 
				+何时
			
 
				+嘿
			
 
				+哼
			
 
				+哼唷
			
 
				+呼哧
			
 
				+乎
			
 
				+哗
			
 
				+还是
			
 
				+还有
			
 
				+换句话说
			
 
				+换言之
			
 
				+或
			
 
				+或是
			
 
				+或者
			
 
				+极了
			
 
				+及
			
 
				+及其
			
 
				+及至
			
 
				+即
			
 
				+即便
			
 
				+即或
			
 
				+即令
			
 
				+即若
			
 
				+即使
			
 
				+几
			
 
				+几时
			
 
				+己
			
 
				+既
			
 
				+既然
			
 
				+既是
			
 
				+继而
			
 
				+加之
			
 
				+假如
			
 
				+假若
			
 
				+假使
			
 
				+鉴于
			
 
				+将
			
 
				+较
			
 
				+较之
			
 
				+叫
			
 
				+接着
			
 
				+结果
			
 
				+借
			
 
				+紧接着
			
 
				+进而
			
 
				+尽
			
 
				+尽管
			
 
				+经
			
 
				+经过
			
 
				+就
			
 
				+就是
			
 
				+就是说
			
 
				+据
			
 
				+具体地说
			
 
				+具体说来
			
 
				+开始
			
 
				+开外
			
 
				+靠
			
 
				+咳
			
 
				+可
			
 
				+可见
			
 
				+可是
			
 
				+可以
			
 
				+况且
			
 
				+啦
			
 
				+来
			
 
				+来着
			
 
				+离
			
 
				+例如
			
 
				+哩
			
 
				+连
			
 
				+连同
			
 
				+两者
			
 
				+了
			
 
				+临
			
 
				+另
			
 
				+另外
			
 
				+另一方面
			
 
				+论
			
 
				+嘛
			
 
				+吗
			
 
				+慢说
			
 
				+漫说
			
 
				+冒
			
 
				+么
			
 
				+每
			
 
				+每当
			
 
				+们
			
 
				+莫若
			
 
				+某
			
 
				+某个
			
 
				+某些
			
 
				+拿
			
 
				+哪
			
 
				+哪边
			
 
				+哪儿
			
 
				+哪个
			
 
				+哪里
			
 
				+哪年
			
 
				+哪怕
			
 
				+哪天
			
 
				+哪些
			
 
				+哪样
			
 
				+那
			
 
				+那边
			
 
				+那儿
			
 
				+那个
			
 
				+那会儿
			
 
				+那里
			
 
				+那么
			
 
				+那么些
			
 
				+那么样
			
 
				+那时
			
 
				+那些
			
 
				+那样
			
 
				+乃
			
 
				+乃至
			
 
				+呢
			
 
				+能
			
 
				+你
			
 
				+你们
			
 
				+您
			
 
				+宁
			
 
				+宁可
			
 
				+宁肯
			
 
				+宁愿
			
 
				+哦
			
 
				+呕
			
 
				+啪达
			
 
				+旁人
			
 
				+呸
			
 
				+凭
			
 
				+凭借
			
 
				+其
			
 
				+其次
			
 
				+其二
			
 
				+其他
			
 
				+其它
			
 
				+其一
			
 
				+其余
			
 
				+其中
			
 
				+起
			
 
				+起见
			
 
				+起见
			
 
				+岂但
			
 
				+恰恰相反
			
 
				+前后
			
 
				+前者
			
 
				+且
			
 
				+然而
			
 
				+然后
			
 
				+然则
			
 
				+让
			
 
				+人家
			
 
				+任
			
 
				+任何
			
 
				+任凭
			
 
				+如
			
 
				+如此
			
 
				+如果
			
 
				+如何
			
 
				+如其
			
 
				+如若
			
 
				+如上所述
			
 
				+若
			
 
				+若非
			
 
				+若是
			
 
				+啥
			
 
				+上下
			
 
				+尚且
			
 
				+设若
			
 
				+设使
			
 
				+甚而
			
 
				+甚么
			
 
				+甚至
			
 
				+省得
			
 
				+时候
			
 
				+什么
			
 
				+什么样
			
 
				+使得
			
 
				+是
			
 
				+是的
			
 
				+首先
			
 
				+谁
			
 
				+谁知
			
 
				+顺
			
 
				+顺着
			
 
				+似的
			
 
				+虽
			
 
				+虽然
			
 
				+虽说
			
 
				+虽则
			
 
				+随
			
 
				+随着
			
 
				+所
			
 
				+所以
			
 
				+他
			
 
				+他们
			
 
				+他人
			
 
				+它
			
 
				+它们
			
 
				+她
			
 
				+她们
			
 
				+倘
			
 
				+倘或
			
 
				+倘然
			
 
				+倘若
			
 
				+倘使
			
 
				+腾
			
 
				+替
			
 
				+通过
			
 
				+同
			
 
				+同时
			
 
				+哇
			
 
				+万一
			
 
				+往
			
 
				+望
			
 
				+为
			
 
				+为何
			
 
				+为了
			
 
				+为什么
			
 
				+为着
			
 
				+喂
			
 
				+嗡嗡
			
 
				+我
			
 
				+我们
			
 
				+呜
			
 
				+呜呼
			
 
				+乌乎
			
 
				+无论
			
 
				+无宁
			
 
				+毋宁
			
 
				+嘻
			
 
				+吓
			
 
				+相对而言
			
 
				+像
			
 
				+向
			
 
				+向着
			
 
				+嘘
			
 
				+呀
			
 
				+焉
			
 
				+沿
			
 
				+沿着
			
 
				+要
			
 
				+要不
			
 
				+要不然
			
 
				+要不是
			
 
				+要么
			
 
				+要是
			
 
				+也
			
 
				+也罢
			
 
				+也好
			
 
				+一
			
 
				+一般
			
 
				+一旦
			
 
				+一方面
			
 
				+一来
			
 
				+一切
			
 
				+一样
			
 
				+一则
			
 
				+依
			
 
				+依照
			
 
				+矣
			
 
				+以
			
 
				+以便
			
 
				+以及
			
 
				+以免
			
 
				+以至
			
 
				+以至于
			
 
				+以致
			
 
				+抑或
			
 
				+因
			
 
				+因此
			
 
				+因而
			
 
				+因为
			
 
				+哟
			
 
				+用
			
 
				+由
			
 
				+由此可见
			
 
				+由于
			
 
				+有
			
 
				+有的
			
 
				+有关
			
 
				+有些
			
 
				+又
			
 
				+于
			
 
				+于是
			
 
				+于是乎
			
 
				+与
			
 
				+与此同时
			
 
				+与否
			
 
				+与其
			
 
				+越是
			
 
				+云云
			
 
				+哉
			
 
				+再说
			
 
				+再者
			
 
				+在
			
 
				+在下
			
 
				+咱
			
 
				+咱们
			
 
				+则
			
 
				+怎
			
 
				+怎么
			
 
				+怎么办
			
 
				+怎么样
			
 
				+怎样
			
 
				+咋
			
 
				+照
			
 
				+照着
			
 
				+者
			
 
				+这
			
 
				+这边
			
 
				+这儿
			
 
				+这个
			
 
				+这会儿
			
 
				+这就是说
			
 
				+这里
			
 
				+这么
			
 
				+这么点儿
			
 
				+这么些
			
 
				+这么样
			
 
				+这时
			
 
				+这些
			
 
				+这样
			
 
				+正如
			
 
				+吱
			
 
				+之
			
 
				+之类
			
 
				+之所以
			
 
				+之一
			
 
				+只是
			
 
				+只限
			
 
				+只要
			
 
				+只有
			
 
				+至
			
 
				+至于
			
 
				+诸位
			
 
				+着
			
 
				+着呢
			
 
				+自
			
 
				+自从
			
 
				+自个儿
			
 
				+自各儿
			
 
				+自己
			
 
				+自家
			
 
				+自身
			
 
				+综上所述
			
 
				+总的来看
			
 
				+总的来说
			
 
				+总的说来
			
 
				+总而言之
			
 
				+总之
			
 
				+纵
			
 
				+纵令
			
 
				+纵然
			
 
				+纵使
			
 
				+遵照
			
 
				+作为
			
 
				+兮
			
 
				+呃
			
 
				+呗
			
 
				+咚
			
 
				+咦
			
 
				+喏
			
 
				+啐
			
 
				+喔唷
			
 
				+嗬
			
 
				+嗯
			
 
				+嗳
			
 
				+看完大快人心！
			
 
				+刚刚曝光
			
 
				+头一次见
			
 
				+第一次见
			
 
				+看看
			
 
				+速看
			
 
				+必看
			
 
				+快看看
			
 
				+赶紧看
			
 
				+打开看看
			
 
				+一起看看
			
 
				+一起听听
			
 
				+赶紧看看
			
 
				+一定要看
			
 
				+值得一看
			
 
				+国人必看
			
 
				+值得看看
			
 
				+一定要看
			
 
				+不看可惜
			
 
				+全民必看
			
 
				+大家速看
			
 
				+我看完了
			
 
				+发给大家看看
			
 
				+发给大家都看看
			
 
				+刚刚
			
 
				+刚刚传来
			
 
				+刚刚发生
			
 
				+刚刚公开
			
 
				+就在刚刚
			
 
				+十万火急
			
 
				+紧急通知
			
 
				+爆炸消息传来
			
 
				+转发就是支持
			
 
				+全球震怒
			
 
				+突然暴露
			
 
				+重磅提醒
			
 
				+今早传来
			
 
				+全民疯传
			
 
				+现在知道还不晚
			
 
				+早知道早受益
			
 
				+请转发
			
 
				+事关所有人
			
 
				+终于公开
			
 
				+重磅来袭
			
 
				+不可思议
			
 
				+噩耗传来
			
 
				+前所未闻
			
 
				+传来消息
			
 
				+今日周一
			
 
				+今日周二
			
 
				+今日周三
			
 
				+今日周四
			
 
				+今日周五
			
 
				+今日周六
			
 
				+今日周末
			
 
				+务必警惕
			
 
				+今天周
			
 
				+看完转发
			
 
				+二个
			
 
				+终于
			
 
				+今天
			
 
				+有人
			
 
				+知道
			
 
				+不是
			
 
				+大家
			
 
				+一分
			
 
				+传来
			
 
				+打开
			
 
				+一起
			
 
				+赶紧
			
 
				+一定
			
 
				+值得
			
 
				+看完
			
 
				+大家
			
 
				+发生
			
 
				+公开
			
 
				+紧急
			
 
				+通知
			
 
				+听听
			
 
				+紧急
			
 
				+通知
			
 
				+今日
			
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,8 +1,11 @@
 
				 import requests
			
 
				 import json
			
 
				 import traceback
			
 
				+from odps import ODPS
			
 
				 from log import Log
			
 
				+from config import set_config
			
 
				 log_ = Log()
			
 
				+config_ = set_config()
			
 
				 
			
 
				 
			
 
				 def request_post(request_url, headers, request_data):
			
@@ -43,3 +46,30 @@ def request_get(request_url, headers, params=None):
 
				     except Exception as e:
			
 
				         log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
			
 
				         return None
			
 
				+
			
 
				+
			
 
				+def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
				+                       pool_maxsize=1000, pool_connections=1000):
				+    """
				+    Read one ``dt`` partition of a table from ODPS.
				+
				+    :param date: partition value, type-string '%Y%m%d'
				+    :param project: ODPS project name, type-string
				+    :param table: table name, type-string
				+    :param connect_timeout: connection timeout passed to the ODPS client
				+    :param read_timeout: read timeout passed to the ODPS client
				+    :param pool_maxsize: passed through to the ODPS client
				+    :param pool_connections: passed through to the ODPS client
				+    :return: the records returned by ``read_table`` for partition ``dt=<date>``
				+    """
				+    odps_settings = config_.ODPS_CONFIG
				+    client = ODPS(
				+        access_id=odps_settings['ACCESSID'],
				+        secret_access_key=odps_settings['ACCESSKEY'],
				+        project=project,
				+        endpoint=odps_settings['ENDPOINT'],
				+        connect_timeout=connect_timeout,
				+        read_timeout=read_timeout,
				+        pool_maxsize=pool_maxsize,
				+        pool_connections=pool_connections
				+    )
				+    partition_spec = 'dt=%s' % date
				+    return client.read_table(name=table, partition=partition_spec)
			
--- a/words_func.py
+++ b/words_func.py
@@ -1,5 +1,9 @@
 
				+import datetime
			
 
				 import time
			
 
				 import traceback
			
 
				+import re
			
 
				+import jieba
			
 
				+import jieba.posseg as pseg
			
 
				 
			
 
				 from db_helper import MysqlHelper
			
 
				 from log import Log
			
@@ -108,6 +112,79 @@ def update_wechat_score_data(data):
 
				     log_.info(f"update wechat score data finished! update count = {len(update_data)}")
			
 
				 
			
 
				 
			
 
				+def get_stop_words(path='hit_stopwords.txt'):
				+    """
				+    Load the stop-word list.
				+
				+    :param path: UTF-8 text file with one stop word per line; the default
				+        'hit_stopwords.txt' is resolved against the current working directory
				+    :return: list of stop words in file order, with surrounding whitespace
				+        removed and blank lines skipped
				+    """
				+    # Read-only access is enough, and the context manager closes the file.
				+    with open(path, 'r', encoding='utf-8') as stop_file:
				+        # Several entries in the file carry trailing spaces that would stop
				+        # them from ever matching a token, so every line is stripped.
				+        stripped_lines = (line.strip() for line in stop_file)
				+        return [word for word in stripped_lines if word]
			
 
				+
			
 
				+
			
 
				+# Compiled once at import time. The character class deliberately contains
				+# no literal space, so ordinary spaces in the text are left alone.
				+_EMOJI_PATTERN = re.compile(
				+    '['
				+    '\u2600-\u2B55'           # U+2600 - U+2B55: other symbols
				+    '\U00010000-\U0010ffff'   # all supplementary planes, includes U+1F300 - U+1F6FF
				+    ']+'
				+)
				+
				+
				+def filter_emoji(text):
				+    """
				+    Remove emoji and pictographic symbols from a text.
				+
				+    :param text: input text, type-string
				+    :return: text without any character in U+2600 - U+2B55 or above U+FFFF
				+    """
				+    return _EMOJI_PATTERN.sub('', text)
			
 
				+
			
 
				+
			
 
				+# Stop words loaded on first use; see _stop_word_set().
				+_STOP_WORD_SET = None
				+
				+
				+def _stop_word_set():
				+    """Return the stop words as a frozenset, reading the file only on first use."""
				+    global _STOP_WORD_SET
				+    if _STOP_WORD_SET is None:
				+        _STOP_WORD_SET = frozenset(get_stop_words())
				+    return _STOP_WORD_SET
				+
				+
				+def word_cut(text):
				+    """
				+    Segment a text into candidate words.
				+
				+    :param text: input text, type-string
				+    :return: list of words left after removing numerals, quantifiers,
				+        pronouns, stop words, tokens containing a space, pure digit
				+        strings and single characters
				+    """
				+    # The stop-word file is read once per process instead of once per call,
				+    # and lookups go against a set instead of a list.
				+    stop_words = _stop_word_set()
				+    # Trim surrounding whitespace, then drop emoji.
				+    text = filter_emoji(text.strip())
				+    # Precise-mode segmentation.
				+    seg_list = list(jieba.cut(text, cut_all=False))
				+    # Drop tokens by part of speech: numeral (m), numeral-quantifier (mq),
				+    # quantifier (q) and pronoun (r).
				+    if len(seg_list) > 1:
				+        excluded_flags = {'m', 'mq', 'q', 'r'}
				+        tagged_words = []
				+        for seg in seg_list:
				+            tagged_words.extend((word, flag) for word, flag in pseg.cut(seg))
				+        seg_list = [word for word, flag in tagged_words if flag not in excluded_flags]
				+    # Remaining filters: single characters, tokens with a space, pure
				+    # digit strings and stop words.
				+    return [
				+        seg for seg in seg_list
				+        if len(seg) > 1 and ' ' not in seg and not seg.isdigit() and seg not in stop_words
				+    ]
			
 
				+
			
 
				+
			
 
				+def get_today_words(page_num, page_size):
				+    """
				+    Fetch one page of the hot words updated today.
				+
				+    :param page_num: page number, starting at 1
				+    :param page_size: number of entries per page, at least 1
				+    :return: list of ``{'id': ..., 'word': ...}`` dicts ordered by id, or
				+        None when the arguments are invalid or the query failed
				+    """
				+    try:
				+        # Both values are written into the LIMIT clause, so only plain
				+        # positive integers are accepted.
				+        page_num = int(page_num)
				+        page_size = int(page_size)
				+        if page_num < 1 or page_size < 1:
				+            log_.error(f"invalid paging arguments: page_num={page_num}, page_size={page_size}")
				+            return None
				+        offset = (page_num - 1) * page_size
				+        dt = datetime.datetime.today().strftime('%Y-%m-%d')
				+        # '>=' so that rows stamped exactly at midnight count as today.
				+        sql = f"select id, word from word.hot_word where update_time >= cast('{dt}' as datetime) " \
				+              f"order by id limit {offset}, {page_size};"
				+        data = mysql_helper.get_data(sql=sql)
				+        if data is None:
				+            return None
				+        return [{'id': id_, 'word': word} for id_, word in data]
				+    except Exception:
				+        log_.error(traceback.format_exc())
				+        return None
			
 
				+
			
 
				+
			
 
				 if __name__ == '__main__':
			
 
				     get_words(8, 100)
			
 
				     # get_words(1, 20)