2 năm trước cách đây · ab57339390
--- a/cut_words.py
+++ b/cut_words.py
@@ -1,181 +0,0 @@
 
				-import datetime
			
 
				-import traceback
			
 
				-import pandas as pd
			
 
				-from odps import ODPS
			
 
				-from threading import Timer
			
 
				-from config import set_config
			
 
				-from log import Log
			
 
				-from utils.utils import get_data_from_odps
			
 
				-from words_func import word_cut
			
 
				-from db_helper import MysqlHelper
			
 
				-config_, env = set_config()
			
 
				-log_ = Log()
			
 
				-mysql_helper = MysqlHelper()
			
 
				-features = ['title', 'source']
			
 
				-
			
 
				-
			
 
				-def check_table_partition_exits(date, project, table, connect_timeout=3000, read_timeout=500000,
			
 
				-                                pool_maxsize=1000, pool_connections=1000):
			
 
				-    """
			
 
				-    判断表中是否存在这个分区
			
 
				-    :param date: 日期 type-string '%Y%m%d'
			
 
				-    :param project: type-string
			
 
				-    :param table: 表名 type-string
			
 
				-    :param connect_timeout: 连接超时设置
			
 
				-    :param read_timeout: 读取超时设置
			
 
				-    :param pool_maxsize:
			
 
				-    :param pool_connections:
			
 
				-    :return: records
			
 
				-    """
			
 
				-    odps = ODPS(
			
 
				-        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				-        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				-        project=project,
			
 
				-        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
			
 
				-        connect_timeout=connect_timeout,
			
 
				-        read_timeout=read_timeout,
			
 
				-        pool_maxsize=pool_maxsize,
			
 
				-        pool_connections=pool_connections
			
 
				-    )
			
 
				-    t = odps.get_table(name=table)
			
 
				-    return t.exist_partition(partition_spec=f'dt={date}')
			
 
				-
			
 
				-
			
 
				-def data_check(project, table, now_date):
			
 
				-    """检查数据是否准备好"""
			
 
				-    odps = ODPS(
			
 
				-        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				-        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				-        project=project,
			
 
				-        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
			
 
				-        connect_timeout=3000,
			
 
				-        read_timeout=500000,
			
 
				-        pool_maxsize=1000,
			
 
				-        pool_connections=1000
			
 
				-    )
			
 
				-
			
 
				-    try:
			
 
				-        dt = datetime.datetime.strftime(now_date, '%Y%m%d')
			
 
				-        check_res = check_table_partition_exits(date=dt, project=project, table=table)
			
 
				-        if check_res:
			
 
				-            sql = f'select * from {project}.{table} where dt = {dt}'
			
 
				-            with odps.execute_sql(sql=sql).open_reader() as reader:
			
 
				-                data_count = reader.count
			
 
				-        else:
			
 
				-            data_count = 0
			
 
				-    except Exception as e:
			
 
				-        data_count = 0
			
 
				-    return data_count
			
 
				-
			
 
				-
			
 
				-def get_title_data(project, table, now_date):
			
 
				-    """获取站内外视频标题数据"""
			
 
				-    dt = datetime.datetime.strftime(now_date, '%Y%m%d')
			
 
				-    records = get_data_from_odps(date=dt, project=project, table=table)
			
 
				-    feature_data = []
			
 
				-    for record in records:
			
 
				-        item = {}
			
 
				-        for feature_name in features:
			
 
				-            item[feature_name] = record[feature_name]
			
 
				-        feature_data.append(item)
			
 
				-    feature_df = pd.DataFrame(feature_data)
			
 
				-    return feature_df
			
 
				-
			
 
				-
			
 
				-def update_cut_words_result(text, source, words_list):
			
 
				-    """
			
 
				-    分词结果入库
			
 
				-    :param text: 原始文本 type-string
			
 
				-    :param source: 文本来源 type-int
			
 
				-    :param words_list: 分词结果 type-list
			
 
				-    :return:
			
 
				-    """
			
 
				-    # 分词结果拼接成字符串
			
 
				-    words = ','.join(words_list)
			
 
				-    # 判断原始文本是否已存在
			
 
				-    select_sql = f"SELECT id FROM word.cut_words_result WHERE text = '{text}';"
			
 
				-    res = mysql_helper.get_data(sql=select_sql)
			
 
				-    if res is None or len(res) == 0:
			
 
				-        # 不存在，插入
			
 
				-        insert_sql = f"insert into word.cut_words_result (text, words, source) values ('{text}', '{words}', {source});"
			
 
				-        log_.info(f"insert_sql = {insert_sql}")
			
 
				-        mysql_helper.add_data(sql=insert_sql)
			
 
				-    else:
			
 
				-        # 存在，更新
			
 
				-        update_sql = f"""update word.cut_words_result set words = '{words}', source = {source}, 
			
 
				-        update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
			
 
				-        log_.info(f"update_sql = {update_sql}")
			
 
				-        mysql_helper.add_data(sql=update_sql)
			
 
				-
			
 
				-
			
 
				-def update_hot_word(words_list, source):
			
 
				-    """
			
 
				-    词入库
			
 
				-    :param words_list: 词列表 type-list
			
 
				-    :param source: 词来源 type-int
			
 
				-    :return:
			
 
				-    """
			
 
				-    for word in words_list:
			
 
				-        if len(word) == 0:
			
 
				-            continue
			
 
				-        # 判断词是否已存在
			
 
				-        select_sql = f"SELECT id, source FROM word.hot_word WHERE word = '{word}';"
			
 
				-        res = mysql_helper.get_data(sql=select_sql)
			
 
				-        if res is None or len(res) == 0:
			
 
				-            # 不存在，插入
			
 
				-            insert_sql = f"insert into word.hot_word (word, source) values ('{word}', {source});"
			
 
				-            mysql_helper.add_data(sql=insert_sql)
			
 
				-        else:
			
 
				-            # 存在，更新
			
 
				-            if source != res[0][1]:
			
 
				-                source = 3
			
 
				-            update_sql = \
			
 
				-                f"""update word.hot_word set source = {source}, 
			
 
				-                update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
			
 
				-            mysql_helper.add_data(sql=update_sql)
			
 
				-
			
 
				-
			
 
				-def data_update(project, table, now_date):
			
 
				-    """数据更新"""
			
 
				-    # 获取站内外视频标题数据
			
 
				-    df = get_title_data(project=project, table=table, now_date=now_date)
			
 
				-    df['source'] = df['source'].astype(int)
			
 
				-    for source in [1, 2]:
			
 
				-        df_temp = df[df['source'] == source]
			
 
				-        title_list = df_temp['title'].to_list()
			
 
				-        log_.info(f"source = {source}, count = {len(title_list)}")
			
 
				-        for title in title_list:
			
 
				-            log_.info(f"title: {title}")
			
 
				-            if len(title) == 0:
			
 
				-                return
			
 
				-            # 1. 分词
			
 
				-            words_list = word_cut(text=title)
			
 
				-            log_.info(f"words_list: {words_list}")
			
 
				-            # 2. 分词结果入库
			
 
				-            update_cut_words_result(text=title, source=source, words_list=words_list)
			
 
				-            # 3. 词入库
			
 
				-            update_hot_word(words_list=words_list, source=source)
			
 
				-
			
 
				-
			
 
				-def timer_check():
			
 
				-    try:
			
 
				-        project = config_.TITLE_DATA['project']
			
 
				-        table = config_.TITLE_DATA['table']
			
 
				-        now_date = datetime.datetime.today()
			
 
				-        log_.info(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
			
 
				-        # 查看当天更新的数据是否已准备好
			
 
				-        data_count = data_check(project=project, table=table, now_date=now_date)
			
 
				-        if data_count > 0:
			
 
				-            log_.info(f'data_count = {data_count}')
			
 
				-            # 数据准备好，进行更新
			
 
				-            data_update(project=project, table=table, now_date=now_date)
			
 
				-            log_.info(f"data update end!")
			
 
				-        else:
			
 
				-            # 数据没准备好，1分钟后重新检查
			
 
				-            Timer(60, timer_check).start()
			
 
				-    except Exception as e:
			
 
				-        log_.error(f"数据更新失败, exception: {e}, traceback: {traceback.format_exc()}")
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    timer_check()
			
--- a/cut_words_task.sh
+++ b/cut_words_task.sh
@@ -1,7 +0,0 @@
 
				-source /etc/profile
			
 
				-echo $HOT_WORDS_ENV
			
 
				-if [[ $HOT_WORDS_ENV == 'test' ]]; then
			
 
				-    cd /data2/hot-words && /root/anaconda3/bin/python /data2/hot-words/cut_words.py
			
 
				-elif [[ $HOT_WORDS_ENV == 'pro' ]]; then
			
 
				-    cd /data/hot-words && /root/anaconda3/bin/python /data2/hot-words/cut_words.py
			
 
				-fi
			
--- a/hit_stopwords.txt
+++ b/hit_stopwords.txt
@@ -1,849 +0,0 @@
 
				-———
			
 
				-》），
			
 
				-）÷（１－
			
 
				-”，
			
 
				-）、
			
 
				-＝（
			
 
				-:
			
 
				-→
			
 
				-℃ 
			
 
				-&
			
 
				-*
			
 
				-一一
			
 
				-~~~~
			
 
				-’
			
 
				-. 
			
 
				-『
			
 
				-.一
			
 
				-./
			
 
				--- 
			
 
				-』
			
 
				-＝″
			
 
				-【
			
 
				-［＊］
			
 
				-｝＞
			
 
				-［⑤］］
			
 
				-［①Ｄ］
			
 
				-ｃ］
			
 
				-ｎｇ昉
			
 
				-＊
			
 
				-//
			
 
				-［
			
 
				-］
			
 
				-［②ｅ］
			
 
				-［②ｇ］
			
 
				-＝｛
			
 
				-}
			
 
				-，也 
			
 
				-‘
			
 
				-Ａ
			
 
				-［①⑥］
			
 
				-［②Ｂ］ 
			
 
				-［①ａ］
			
 
				-［④ａ］
			
 
				-［①③］
			
 
				-［③ｈ］
			
 
				-③］
			
 
				-１． 
			
 
				-－－ 
			
 
				-［②ｂ］
			
 
				-’‘ 
			
 
				-××× 
			
 
				-［①⑧］
			
 
				-０：２ 
			
 
				-＝［
			
 
				-［⑤ｂ］
			
 
				-［②ｃ］ 
			
 
				-［④ｂ］
			
 
				-［②③］
			
 
				-［③ａ］
			
 
				-［④ｃ］
			
 
				-［①⑤］
			
 
				-［①⑦］
			
 
				-［①ｇ］
			
 
				-∈［ 
			
 
				-［①⑨］
			
 
				-［①④］
			
 
				-［①ｃ］
			
 
				-［②ｆ］
			
 
				-［②⑧］
			
 
				-［②①］
			
 
				-［①Ｃ］
			
 
				-［③ｃ］
			
 
				-［③ｇ］
			
 
				-［②⑤］
			
 
				-［②②］
			
 
				-一.
			
 
				-［①ｈ］
			
 
				-.数
			
 
				-［］
			
 
				-［①Ｂ］
			
 
				-数/
			
 
				-［①ｉ］
			
 
				-［③ｅ］
			
 
				-［①①］
			
 
				-［④ｄ］
			
 
				-［④ｅ］
			
 
				-［③ｂ］
			
 
				-［⑤ａ］
			
 
				-［①Ａ］
			
 
				-［②⑧］
			
 
				-［②⑦］
			
 
				-［①ｄ］
			
 
				-［②ｊ］
			
 
				-〕〔
			
 
				-］［
			
 
				-://
			
 
				-′∈
			
 
				-［②④
			
 
				-［⑤ｅ］
			
 
				-１２％
			
 
				-ｂ］
			
 
				-...
			
 
				-...................
			
 
				-…………………………………………………③
			
 
				-ＺＸＦＩＴＬ
			
 
				-［③Ｆ］
			
 
				-」
			
 
				-［①ｏ］
			
 
				-］∧′＝［ 
			
 
				-∪φ∈
			
 
				-′｜
			
 
				-｛－
			
 
				-②ｃ
			
 
				-｝
			
 
				-［③①］
			
 
				-Ｒ．Ｌ．
			
 
				-［①Ｅ］
			
 
				-Ψ
			
 
				-－［＊］－
			
 
				-↑
			
 
				-.日 
			
 
				-［②ｄ］
			
 
				-［②
			
 
				-［②⑦］
			
 
				-［②②］
			
 
				-［③ｅ］
			
 
				-［①ｉ］
			
 
				-［①Ｂ］
			
 
				-［①ｈ］
			
 
				-［①ｄ］
			
 
				-［①ｇ］
			
 
				-［①②］
			
 
				-［②ａ］
			
 
				-ｆ］
			
 
				-［⑩］
			
 
				-ａ］
			
 
				-［①ｅ］
			
 
				-［②ｈ］
			
 
				-［②⑥］
			
 
				-［③ｄ］
			
 
				-［②⑩］
			
 
				-ｅ］
			
 
				-〉
			
 
				-】
			
 
				-元／吨
			
 
				-［②⑩］
			
 
				-２．３％
			
 
				-５：０  
			
 
				-［①］
			
 
				-::
			
 
				-［②］
			
 
				-［③］
			
 
				-［④］
			
 
				-［⑤］
			
 
				-［⑥］
			
 
				-［⑦］
			
 
				-［⑧］
			
 
				-［⑨］ 
			
 
				-……
			
 
				-——
			
 
				-?
			
 
				-、
			
 
				-。
			
 
				-“
			
 
				-”
			
 
				-《
			
 
				-》
			
 
				-！
			
 
				-，
			
 
				-：
			
 
				-；
			
 
				-？
			
 
				-．
			
 
				-,
			
 
				-．
			
 
				-'
			
 
				-? 
			
 
				-·
			
 
				-———
			
 
				-──
			
 
				-? 
			
 
				-—
			
 
				-<
			
 
				->
			
 
				-（
			
 
				-）
			
 
				-〔
			
 
				-〕
			
 
				-[
			
 
				-]
			
 
				-(
			
 
				-)
			
 
				--
			
 
				-+
			
 
				-～
			
 
				-×
			
 
				-／
			
 
				-/
			
 
				-①
			
 
				-②
			
 
				-③
			
 
				-④
			
 
				-⑤
			
 
				-⑥
			
 
				-⑦
			
 
				-⑧
			
 
				-⑨
			
 
				-⑩
			
 
				-Ⅲ
			
 
				-В
			
 
				-"
			
 
				-;
			
 
				-#
			
 
				-@
			
 
				-γ
			
 
				-μ
			
 
				-φ
			
 
				-φ．
			
 
				-× 
			
 
				-Δ
			
 
				-■
			
 
				-▲
			
 
				-sub
			
 
				-exp 
			
 
				-sup
			
 
				-sub
			
 
				-Lex 
			
 
				-＃
			
 
				-％
			
 
				-＆
			
 
				-＇
			
 
				-＋
			
 
				-＋ξ
			
 
				-＋＋
			
 
				-－
			
 
				-－β
			
 
				-＜
			
 
				-＜±
			
 
				-＜Δ
			
 
				-＜λ
			
 
				-＜φ
			
 
				-＜＜
			
 
				-=
			
 
				-＝
			
 
				-＝☆
			
 
				-＝－
			
 
				-＞
			
 
				-＞λ
			
 
				-＿
			
 
				-～±
			
 
				-～＋
			
 
				-［⑤ｆ］
			
 
				-［⑤ｄ］
			
 
				-［②ｉ］
			
 
				-≈ 
			
 
				-［②Ｇ］
			
 
				-［①ｆ］
			
 
				-ＬＩ
			
 
				-㈧ 
			
 
				-［－
			
 
				-......
			
 
				-〉
			
 
				-［③⑩］
			
 
				-第二
			
 
				-一番
			
 
				-一直
			
 
				-一个
			
 
				-一些
			
 
				-许多
			
 
				-种
			
 
				-有的是
			
 
				-也就是说
			
 
				-末##末
			
 
				-啊
			
 
				-阿
			
 
				-哎
			
 
				-哎呀
			
 
				-哎哟
			
 
				-唉
			
 
				-俺
			
 
				-俺们
			
 
				-按
			
 
				-按照
			
 
				-吧
			
 
				-吧哒
			
 
				-把
			
 
				-罢了
			
 
				-被
			
 
				-本
			
 
				-本着
			
 
				-比
			
 
				-比方
			
 
				-比如
			
 
				-鄙人
			
 
				-彼
			
 
				-彼此
			
 
				-边
			
 
				-别
			
 
				-别的
			
 
				-别说
			
 
				-并
			
 
				-并且
			
 
				-不比
			
 
				-不成
			
 
				-不单
			
 
				-不但
			
 
				-不独
			
 
				-不管
			
 
				-不光
			
 
				-不过
			
 
				-不仅
			
 
				-不拘
			
 
				-不论
			
 
				-不怕
			
 
				-不然
			
 
				-不如
			
 
				-不特
			
 
				-不惟
			
 
				-不问
			
 
				-不只
			
 
				-朝
			
 
				-朝着
			
 
				-趁
			
 
				-趁着
			
 
				-乘
			
 
				-冲
			
 
				-除
			
 
				-除此之外
			
 
				-除非
			
 
				-除了
			
 
				-此
			
 
				-此间
			
 
				-此外
			
 
				-从
			
 
				-从而
			
 
				-打
			
 
				-待
			
 
				-但
			
 
				-但是
			
 
				-当
			
 
				-当着
			
 
				-到
			
 
				-得
			
 
				-的
			
 
				-的话
			
 
				-等
			
 
				-等等
			
 
				-地
			
 
				-第
			
 
				-叮咚
			
 
				-对
			
 
				-对于
			
 
				-多
			
 
				-多少
			
 
				-而
			
 
				-而况
			
 
				-而且
			
 
				-而是
			
 
				-而外
			
 
				-而言
			
 
				-而已
			
 
				-尔后
			
 
				-反过来
			
 
				-反过来说
			
 
				-反之
			
 
				-非但
			
 
				-非徒
			
 
				-否则
			
 
				-嘎
			
 
				-嘎登
			
 
				-该
			
 
				-赶
			
 
				-个
			
 
				-各
			
 
				-各个
			
 
				-各位
			
 
				-各种
			
 
				-各自
			
 
				-给
			
 
				-根据
			
 
				-跟
			
 
				-故
			
 
				-故此
			
 
				-固然
			
 
				-关于
			
 
				-管
			
 
				-归
			
 
				-果然
			
 
				-果真
			
 
				-过
			
 
				-哈
			
 
				-哈哈
			
 
				-呵
			
 
				-和
			
 
				-何
			
 
				-何处
			
 
				-何况
			
 
				-何时
			
 
				-嘿
			
 
				-哼
			
 
				-哼唷
			
 
				-呼哧
			
 
				-乎
			
 
				-哗
			
 
				-还是
			
 
				-还有
			
 
				-换句话说
			
 
				-换言之
			
 
				-或
			
 
				-或是
			
 
				-或者
			
 
				-极了
			
 
				-及
			
 
				-及其
			
 
				-及至
			
 
				-即
			
 
				-即便
			
 
				-即或
			
 
				-即令
			
 
				-即若
			
 
				-即使
			
 
				-几
			
 
				-几时
			
 
				-己
			
 
				-既
			
 
				-既然
			
 
				-既是
			
 
				-继而
			
 
				-加之
			
 
				-假如
			
 
				-假若
			
 
				-假使
			
 
				-鉴于
			
 
				-将
			
 
				-较
			
 
				-较之
			
 
				-叫
			
 
				-接着
			
 
				-结果
			
 
				-借
			
 
				-紧接着
			
 
				-进而
			
 
				-尽
			
 
				-尽管
			
 
				-经
			
 
				-经过
			
 
				-就
			
 
				-就是
			
 
				-就是说
			
 
				-据
			
 
				-具体地说
			
 
				-具体说来
			
 
				-开始
			
 
				-开外
			
 
				-靠
			
 
				-咳
			
 
				-可
			
 
				-可见
			
 
				-可是
			
 
				-可以
			
 
				-况且
			
 
				-啦
			
 
				-来
			
 
				-来着
			
 
				-离
			
 
				-例如
			
 
				-哩
			
 
				-连
			
 
				-连同
			
 
				-两者
			
 
				-了
			
 
				-临
			
 
				-另
			
 
				-另外
			
 
				-另一方面
			
 
				-论
			
 
				-嘛
			
 
				-吗
			
 
				-慢说
			
 
				-漫说
			
 
				-冒
			
 
				-么
			
 
				-每
			
 
				-每当
			
 
				-们
			
 
				-莫若
			
 
				-某
			
 
				-某个
			
 
				-某些
			
 
				-拿
			
 
				-哪
			
 
				-哪边
			
 
				-哪儿
			
 
				-哪个
			
 
				-哪里
			
 
				-哪年
			
 
				-哪怕
			
 
				-哪天
			
 
				-哪些
			
 
				-哪样
			
 
				-那
			
 
				-那边
			
 
				-那儿
			
 
				-那个
			
 
				-那会儿
			
 
				-那里
			
 
				-那么
			
 
				-那么些
			
 
				-那么样
			
 
				-那时
			
 
				-那些
			
 
				-那样
			
 
				-乃
			
 
				-乃至
			
 
				-呢
			
 
				-能
			
 
				-你
			
 
				-你们
			
 
				-您
			
 
				-宁
			
 
				-宁可
			
 
				-宁肯
			
 
				-宁愿
			
 
				-哦
			
 
				-呕
			
 
				-啪达
			
 
				-旁人
			
 
				-呸
			
 
				-凭
			
 
				-凭借
			
 
				-其
			
 
				-其次
			
 
				-其二
			
 
				-其他
			
 
				-其它
			
 
				-其一
			
 
				-其余
			
 
				-其中
			
 
				-起
			
 
				-起见
			
 
				-起见
			
 
				-岂但
			
 
				-恰恰相反
			
 
				-前后
			
 
				-前者
			
 
				-且
			
 
				-然而
			
 
				-然后
			
 
				-然则
			
 
				-让
			
 
				-人家
			
 
				-任
			
 
				-任何
			
 
				-任凭
			
 
				-如
			
 
				-如此
			
 
				-如果
			
 
				-如何
			
 
				-如其
			
 
				-如若
			
 
				-如上所述
			
 
				-若
			
 
				-若非
			
 
				-若是
			
 
				-啥
			
 
				-上下
			
 
				-尚且
			
 
				-设若
			
 
				-设使
			
 
				-甚而
			
 
				-甚么
			
 
				-甚至
			
 
				-省得
			
 
				-时候
			
 
				-什么
			
 
				-什么样
			
 
				-使得
			
 
				-是
			
 
				-是的
			
 
				-首先
			
 
				-谁
			
 
				-谁知
			
 
				-顺
			
 
				-顺着
			
 
				-似的
			
 
				-虽
			
 
				-虽然
			
 
				-虽说
			
 
				-虽则
			
 
				-随
			
 
				-随着
			
 
				-所
			
 
				-所以
			
 
				-他
			
 
				-他们
			
 
				-他人
			
 
				-它
			
 
				-它们
			
 
				-她
			
 
				-她们
			
 
				-倘
			
 
				-倘或
			
 
				-倘然
			
 
				-倘若
			
 
				-倘使
			
 
				-腾
			
 
				-替
			
 
				-通过
			
 
				-同
			
 
				-同时
			
 
				-哇
			
 
				-万一
			
 
				-往
			
 
				-望
			
 
				-为
			
 
				-为何
			
 
				-为了
			
 
				-为什么
			
 
				-为着
			
 
				-喂
			
 
				-嗡嗡
			
 
				-我
			
 
				-我们
			
 
				-呜
			
 
				-呜呼
			
 
				-乌乎
			
 
				-无论
			
 
				-无宁
			
 
				-毋宁
			
 
				-嘻
			
 
				-吓
			
 
				-相对而言
			
 
				-像
			
 
				-向
			
 
				-向着
			
 
				-嘘
			
 
				-呀
			
 
				-焉
			
 
				-沿
			
 
				-沿着
			
 
				-要
			
 
				-要不
			
 
				-要不然
			
 
				-要不是
			
 
				-要么
			
 
				-要是
			
 
				-也
			
 
				-也罢
			
 
				-也好
			
 
				-一
			
 
				-一般
			
 
				-一旦
			
 
				-一方面
			
 
				-一来
			
 
				-一切
			
 
				-一样
			
 
				-一则
			
 
				-依
			
 
				-依照
			
 
				-矣
			
 
				-以
			
 
				-以便
			
 
				-以及
			
 
				-以免
			
 
				-以至
			
 
				-以至于
			
 
				-以致
			
 
				-抑或
			
 
				-因
			
 
				-因此
			
 
				-因而
			
 
				-因为
			
 
				-哟
			
 
				-用
			
 
				-由
			
 
				-由此可见
			
 
				-由于
			
 
				-有
			
 
				-有的
			
 
				-有关
			
 
				-有些
			
 
				-又
			
 
				-于
			
 
				-于是
			
 
				-于是乎
			
 
				-与
			
 
				-与此同时
			
 
				-与否
			
 
				-与其
			
 
				-越是
			
 
				-云云
			
 
				-哉
			
 
				-再说
			
 
				-再者
			
 
				-在
			
 
				-在下
			
 
				-咱
			
 
				-咱们
			
 
				-则
			
 
				-怎
			
 
				-怎么
			
 
				-怎么办
			
 
				-怎么样
			
 
				-怎样
			
 
				-咋
			
 
				-照
			
 
				-照着
			
 
				-者
			
 
				-这
			
 
				-这边
			
 
				-这儿
			
 
				-这个
			
 
				-这会儿
			
 
				-这就是说
			
 
				-这里
			
 
				-这么
			
 
				-这么点儿
			
 
				-这么些
			
 
				-这么样
			
 
				-这时
			
 
				-这些
			
 
				-这样
			
 
				-正如
			
 
				-吱
			
 
				-之
			
 
				-之类
			
 
				-之所以
			
 
				-之一
			
 
				-只是
			
 
				-只限
			
 
				-只要
			
 
				-只有
			
 
				-至
			
 
				-至于
			
 
				-诸位
			
 
				-着
			
 
				-着呢
			
 
				-自
			
 
				-自从
			
 
				-自个儿
			
 
				-自各儿
			
 
				-自己
			
 
				-自家
			
 
				-自身
			
 
				-综上所述
			
 
				-总的来看
			
 
				-总的来说
			
 
				-总的说来
			
 
				-总而言之
			
 
				-总之
			
 
				-纵
			
 
				-纵令
			
 
				-纵然
			
 
				-纵使
			
 
				-遵照
			
 
				-作为
			
 
				-兮
			
 
				-呃
			
 
				-呗
			
 
				-咚
			
 
				-咦
			
 
				-喏
			
 
				-啐
			
 
				-喔唷
			
 
				-嗬
			
 
				-嗯
			
 
				-嗳
			
 
				-看完大快人心！
			
 
				-刚刚曝光
			
 
				-头一次见
			
 
				-第一次见
			
 
				-看看
			
 
				-速看
			
 
				-必看
			
 
				-快看看
			
 
				-赶紧看
			
 
				-打开看看
			
 
				-一起看看
			
 
				-一起听听
			
 
				-赶紧看看
			
 
				-一定要看
			
 
				-值得一看
			
 
				-国人必看
			
 
				-值得看看
			
 
				-一定要看
			
 
				-不看可惜
			
 
				-全民必看
			
 
				-大家速看
			
 
				-我看完了
			
 
				-发给大家看看
			
 
				-发给大家都看看
			
 
				-刚刚
			
 
				-刚刚传来
			
 
				-刚刚发生
			
 
				-刚刚公开
			
 
				-就在刚刚
			
 
				-十万火急
			
 
				-紧急通知
			
 
				-爆炸消息传来
			
 
				-转发就是支持
			
 
				-全球震怒
			
 
				-突然暴露
			
 
				-重磅提醒
			
 
				-今早传来
			
 
				-全民疯传
			
 
				-现在知道还不晚
			
 
				-早知道早受益
			
 
				-请转发
			
 
				-事关所有人
			
 
				-终于公开
			
 
				-重磅来袭
			
 
				-不可思议
			
 
				-噩耗传来
			
 
				-前所未闻
			
 
				-传来消息
			
 
				-今日周一
			
 
				-今日周二
			
 
				-今日周三
			
 
				-今日周四
			
 
				-今日周五
			
 
				-今日周六
			
 
				-今日周末
			
 
				-务必警惕
			
 
				-今天周
			
 
				-看完转发
			
 
				-二个
			
 
				-终于
			
 
				-今天
			
 
				-有人
			
 
				-知道
			
 
				-不是
			
 
				-大家
			
 
				-一分
			
 
				-传来
			
 
				-打开
			
 
				-一起
			
 
				-赶紧
			
 
				-一定
			
 
				-值得
			
 
				-看完
			
 
				-大家
			
 
				-发生
			
 
				-公开
			
 
				-紧急
			
 
				-通知
			
 
				-听听
			
 
				-紧急
			
 
				-通知
			
 
				-今日
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,5 @@
 
				-pandas==1.1.3
			
 
				-jieba==0.42.1
			
 
				 Flask==1.1.2
			
 
				-pyodps==0.10.7
			
 
				 requests==2.24.0
			
 
				 gevent==20.9.0
			
 
				 PyMySQL==1.0.2
			
 
				 aliyun_python_sdk==2.2.0
			
 
				-odps==3.5.1
			
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,7 +1,6 @@
 
				 import requests
			
 
				 import json
			
 
				 import traceback
			
 
				-from odps import ODPS
			
 
				 from log import Log
			
 
				 from config import set_config
			
 
				 log_ = Log()
			
@@ -46,30 +45,3 @@ def request_get(request_url, headers, params=None):
 
				     except Exception as e:
			
 
				         log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
			
 
				         return None
			
 
				-
			
 
				-
			
 
				-def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
			
 
				-                       pool_maxsize=1000, pool_connections=1000):
			
 
				-    """
			
 
				-    从odps获取数据
			
 
				-    :param date: 日期 type-string '%Y%m%d'
			
 
				-    :param project: type-string
			
 
				-    :param table: 表名 type-string
			
 
				-    :param connect_timeout: 连接超时设置
			
 
				-    :param read_timeout: 读取超时设置
			
 
				-    :param pool_maxsize:
			
 
				-    :param pool_connections:
			
 
				-    :return: records
			
 
				-    """
			
 
				-    odps = ODPS(
			
 
				-        access_id=config_.ODPS_CONFIG['ACCESSID'],
			
 
				-        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
			
 
				-        project=project,
			
 
				-        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
			
 
				-        connect_timeout=connect_timeout,
			
 
				-        read_timeout=read_timeout,
			
 
				-        pool_maxsize=pool_maxsize,
			
 
				-        pool_connections=pool_connections
			
 
				-    )
			
 
				-    records = odps.read_table(name=table, partition='dt=%s' % date)
			
 
				-    return records
			
--- a/words_func.py
+++ b/words_func.py
@@ -1,10 +1,6 @@
 
				 import datetime
			
 
				 import time
			
 
				 import traceback
			
 
				-import re
			
 
				-import jieba
			
 
				-import jieba.posseg as pseg
			
 
				-
			
 
				 from db_helper import MysqlHelper
			
 
				 from log import Log
			
 
				 
			
@@ -112,56 +108,6 @@ def update_wechat_score_data(data):
 
				     log_.info(f"update wechat score data finished! update count = {len(update_data)}")
			
 
				 
			
 
				 
			
 
				-def get_stop_words():
			
 
				-    """获取停用词表"""
			
 
				-    stop = open('hit_stopwords.txt', 'r+', encoding='utf-8')
			
 
				-    stop_words = stop.read().split("\n")
			
 
				-    return stop_words
			
 
				-
			
 
				-
			
 
				-def filter_emoji(text):
			
 
				-    """清除文本中的表情符号"""
			
 
				-    # <U+1F300> - <U+1F5FF>  # 符号和象形字
			
 
				-    # <U+1F600> - <U+1F64F>  # 表情符号
			
 
				-    # <U+1F680> - <U+1F6FF>  # 交通符号和地图符号
			
 
				-    # <U+2600 > - <U+2B55>  # 其它符号
			
 
				-    # \U00010000 -\U0010ffff  # 英文emoji表情
			
 
				-    p = re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u2B55 \U00010000-\U0010ffff]+')
			
 
				-    result = re.sub(p, '', text)  # 正则匹配，将表情符合替换为空''
			
 
				-    return result
			
 
				-
			
 
				-
			
 
				-def word_cut(text):
			
 
				-    """分词"""
			
 
				-    # 获取停用词
			
 
				-    stop_words = get_stop_words()
			
 
				-    # 清除空格
			
 
				-    text = text.strip()
			
 
				-    # 清除表情符号
			
 
				-    text = filter_emoji(text)
			
 
				-    # 精确模式分词
			
 
				-    seg_list = jieba.cut(text, cut_all=False)
			
 
				-    seg_list = [seg for seg in seg_list]
			
 
				-    # print(seg_list)
			
 
				-    # 根据词性去除数词、数量词、量词、代词
			
 
				-    if len(seg_list) > 1:
			
 
				-        words = []
			
 
				-        for seg in seg_list:
			
 
				-            words += [(word, flag) for word, flag in pseg.cut(seg)]
			
 
				-        seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
			
 
				-        # print(seg_list)
			
 
				-    # 去除停用词
			
 
				-    seg_list = [seg for seg in seg_list if seg not in stop_words]
			
 
				-    # 去除空格
			
 
				-    seg_list = [seg for seg in seg_list if ' ' not in seg]
			
 
				-    # 去除纯数字字符串
			
 
				-    seg_list = [seg for seg in seg_list if seg.isdigit() is False]
			
 
				-    # 去除单个字符
			
 
				-    seg_list = [seg for seg in seg_list if len(seg) > 1]
			
 
				-    # print(seg_list)
			
 
				-    return seg_list
			
 
				-
			
 
				-
			
 
				 def get_today_words(page_num, page_size):
			
 
				     """
			
 
				     分页获取今日更新的所有热点词