Ver código fonte

initial & add file

liqian 2 anos atrás
pai
commit
3c9be9f3fb
15 arquivos alterados com 1658 adições e 0 exclusões
  1. 2 0
      .gitignore
  2. 173 0
      config.py
  3. 181 0
      cut_words.py
  4. 7 0
      cut_words_task.sh
  5. 69 0
      db_helper.py
  6. 849 0
      hit_stopwords.txt
  7. 40 0
      log.py
  8. 87 0
      log_conf.py
  9. 7 0
      requirements.txt
  10. 50 0
      update_common_words.py
  11. 7 0
      update_common_words_task.sh
  12. 0 0
      utils/__init__.py
  13. 42 0
      utils/feishu.py
  14. 75 0
      utils/utils.py
  15. 69 0
      words_func.py

+ 2 - 0
.gitignore

@@ -57,4 +57,6 @@ docs/_build/
 
 # PyBuilder
 target/
+.idea/
+
 

+ 173 - 0
config.py

@@ -0,0 +1,173 @@
+import os
+
+
class BaseConfig(object):
    """Base configuration shared by every deployment environment.

    NOTE(review): the Feishu app secret, ODPS access keys (and the MySQL
    passwords in the subclasses) are hard-coded in source; they should be
    moved to environment variables or a secrets manager.
    """
    # Feishu (Lark) self-built app credentials, used to obtain a
    # tenant_access_token (see utils/feishu.py).
    FEISHU_TOKEN = {
        'app_id': 'cli_a3667697a57b500e',
        'app_secret': '5eMszgeNt21U56XnPjCykgmTfZUEEMnp'
    }
    # Feishu spreadsheet identifiers, keyed by human-readable sheet name.
    SHEET_INFO = {
        '汉语常用词汇表': {
            'spreadsheet_token': 'shtcnU8JgPeMq5wAuKejptwtLof',
            'sheet_id': 'wnB24K'
        },
        '微信指数自动化搜索-站内标题分词结果': {
            'spreadsheet_token': 'shtcnHxCj6dZBYMuK1Q3tIJVlqg',
            'sheet_id': 'nCudsM'
        },
        '微信指数自动化搜索-人工标注站内高频关键词': {
            'spreadsheet_token': 'shtcnHxCj6dZBYMuK1Q3tIJVlqg',
            'sheet_id': 'n9Jo4j'
        },
        '微信指数自动化搜索-每日关键词': {
            'spreadsheet_token': 'shtcnHxCj6dZBYMuK1Q3tIJVlqg',
            'sheet_id': 'b74YMQ'
        },
    }

    # MySQL table names, keyed by purpose (hot-word store, WeChat index
    # scores, segmentation results).
    MYSQL_TABLES = {
        '热点词库': 'hot_word',
        '热词指数': 'word_wechat_score',
        '分词结果': 'cut_words_result'
    }

    # ODPS (Aliyun MaxCompute) service configuration.
    ODPS_CONFIG = {
        'ENDPOINT': 'http://service.cn.maxcompute.aliyun.com/api',
        'ACCESSID': 'LTAIWYUujJAm7CbH',
        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
    }

    # ODPS table holding the on-/off-site title data consumed by
    # cut_words.py.
    TITLE_DATA = {
        'project': 'loghubods',
        'table': 'crawler_hot_title_1'
    }
+
+
class DevelopmentConfig(BaseConfig):
    """Development environment configuration.

    NOTE(review): currently identical to TestConfig — same MySQL instance
    and the same 'hot-words-test' log project.
    """
    # Environment label used in alert/report text.
    ENV_TEXT = "开发环境"
    # Deployment directory of the project on this host.
    PROJECT_PATH = '/data2/hot-words'

    # Test-environment MySQL connection settings.
    MYSQL_INFO = {
        'host': 'rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com',
        'port': 3306,
        'user': 'wx2016_longvideo',
        'password': 'wx2016_longvideoP@assword1234',
        'db': 'word',
        'charset': 'utf8mb4'
    }

    # Aliyun Log Service (SLS) settings for this environment.
    ALIYUN_LOG = {
        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
        'ACCESSID': 'LTAIWYUujJAm7CbH',
        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
        'PROJECT': 'hot-words-test',
    }
+
+
class TestConfig(BaseConfig):
    """Test environment configuration.

    NOTE(review): currently identical to DevelopmentConfig.
    """
    # Environment label used in alert/report text.
    ENV_TEXT = "测试环境"
    # Deployment directory of the project on this host.
    PROJECT_PATH = '/data2/hot-words'

    # Test-environment MySQL connection settings.
    MYSQL_INFO = {
        'host': 'rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com',
        'port': 3306,
        'user': 'wx2016_longvideo',
        'password': 'wx2016_longvideoP@assword1234',
        'db': 'word',
        'charset': 'utf8mb4'
    }

    # Aliyun Log Service (SLS) settings for this environment.
    ALIYUN_LOG = {
        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
        'ACCESSID': 'LTAIWYUujJAm7CbH',
        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
        'PROJECT': 'hot-words-test',
    }
+
+
class PreProductionConfig(BaseConfig):
    """Pre-production (staging) configuration.

    NOTE(review): points at the PRODUCTION MySQL instance and the
    production 'hot-words' log project — confirm this is intentional.
    """
    # Environment label used in alert/report text.
    ENV_TEXT = "预发布环境"
    # Deployment directory of the project on this host.
    PROJECT_PATH = '/data/hot-words'

    # Production MySQL connection settings.
    MYSQL_INFO = {
        'host': 'rm-bp1661607875x9596.mysql.rds.aliyuncs.com',
        'port': 3306,
        'user': 'word',
        'password': 'Piaoquan123@',
        'db': 'word',
        'charset': 'utf8mb4'
    }

    # Aliyun Log Service (SLS) settings for this environment.
    ALIYUN_LOG = {
        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
        'ACCESSID': 'LTAIWYUujJAm7CbH',
        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
        'PROJECT': 'hot-words',
    }
+
+
class ProductionConfig(BaseConfig):
    """Production environment configuration."""
    # Environment label used in alert/report text.
    ENV_TEXT = "生产环境"
    # Deployment directory of the project on this host.
    PROJECT_PATH = '/data/hot-words'

    # Production MySQL connection settings.
    MYSQL_INFO = {
        'host': 'rm-bp1661607875x9596.mysql.rds.aliyuncs.com',
        'port': 3306,
        'user': 'word',
        'password': 'Piaoquan123@',
        'db': 'word',
        'charset': 'utf8mb4'
    }

    # Aliyun Log Service (SLS) settings for this environment.
    ALIYUN_LOG = {
        'ENDPOINT': 'cn-hangzhou.log.aliyuncs.com',
        'ACCESSID': 'LTAIWYUujJAm7CbH',
        'ACCESSKEY': 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
        'PROJECT': 'hot-words',
    }
+
+
def set_config():
    """Pick the configuration for the current deployment.

    Reads the HOT_WORDS_ENV environment variable ('dev', 'test', 'pre'
    or 'pro').

    :return: (config instance, env name) tuple, or None when the
        variable is missing or unrecognised. Callers that unpack the
        result (``config_, env = set_config()``) will then fail fast.
    """
    env = os.environ.get('HOT_WORDS_ENV')
    if env is None:
        # Cannot log from here: log -> log_conf -> config would be a
        # circular import (presumably why the log_ call was commented
        # out originally — NOTE(review): confirm).
        return None
    if env == 'dev':
        return DevelopmentConfig(), env
    if env == 'test':
        return TestConfig(), env
    if env == 'pre':
        return PreProductionConfig(), env
    if env == 'pro':
        return ProductionConfig(), env
    # Unrecognised value: same contract as a missing variable.
    return None

+ 181 - 0
cut_words.py

@@ -0,0 +1,181 @@
+import datetime
+import traceback
+import pandas as pd
+from odps import ODPS
+from threading import Timer
+from config import set_config
+from log import Log
+from utils.utils import get_data_from_odps
+from words_func import word_cut
+from db_helper import MysqlHelper
# Module-level singletons shared by every function in this script:
# environment config, logger and MySQL helper.
config_, env = set_config()
log_ = Log()
mysql_helper = MysqlHelper()
# Columns read from each ODPS title record.
features = ['title', 'source']
+
+
def check_table_partition_exits(date, project, table, connect_timeout=3000, read_timeout=500000,
                                pool_maxsize=1000, pool_connections=1000):
    """Return True when partition dt=<date> exists in the given ODPS table.

    NOTE: the name keeps its historical typo ('exits' for 'exists') so
    existing callers keep working.

    :param date: partition date, '%Y%m%d' string
    :param project: ODPS project name
    :param table: table name
    :param connect_timeout: connection timeout
    :param read_timeout: read timeout
    :param pool_maxsize: connection pool max size
    :param pool_connections: number of pooled connections
    :return: bool
    """
    client = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        pool_maxsize=pool_maxsize,
        pool_connections=pool_connections,
    )
    target = client.get_table(name=table)
    return target.exist_partition(partition_spec='dt={}'.format(date))
+
+
def data_check(project, table, now_date):
    """Return today's partition row count, or 0 when data is not ready.

    :param project: ODPS project name
    :param table: table name
    :param now_date: datetime whose date selects the dt partition
    :return: row count (int); 0 when the partition is missing or any
        error occurred while checking
    """
    odps = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=3000,
        read_timeout=500000,
        pool_maxsize=1000,
        pool_connections=1000
    )

    try:
        dt = datetime.datetime.strftime(now_date, '%Y%m%d')
        check_res = check_table_partition_exits(date=dt, project=project, table=table)
        if check_res:
            sql = f'select * from {project}.{table} where dt = {dt}'
            with odps.execute_sql(sql=sql).open_reader() as reader:
                data_count = reader.count
        else:
            data_count = 0
    except Exception as e:
        # Keep the best-effort "treat as not ready" contract, but log
        # the failure — the original swallowed the exception silently,
        # making real ODPS errors indistinguishable from missing data.
        log_.error(f"data_check failed: {e}, traceback: {traceback.format_exc()}")
        data_count = 0
    return data_count
+
+
def get_title_data(project, table, now_date):
    """Fetch the day's on-/off-site video title rows as a DataFrame.

    :param project: ODPS project name
    :param table: table name
    :param now_date: datetime whose date selects the dt partition
    :return: pandas.DataFrame with the columns listed in ``features``
    """
    dt = now_date.strftime('%Y%m%d')
    records = get_data_from_odps(date=dt, project=project, table=table)
    rows = [{name: record[name] for name in features} for record in records]
    return pd.DataFrame(rows)
+
+
def update_cut_words_result(text, source, words_list):
    """Upsert one title's segmentation result into word.cut_words_result.

    :param text: original title text
    :param source: origin of the text, type-int
    :param words_list: segmentation result, type-list of str
    :return: None
    """
    # Titles come from crawled external data, so escape values before
    # interpolating into SQL; a title containing a quote previously broke
    # the statement (and was injectable). MysqlHelper exposes no
    # parameterized-query API, so escaping is the available mitigation.
    from pymysql.converters import escape_string
    text_esc = escape_string(text)
    words = escape_string(','.join(words_list))
    # Does this exact title already have a row?
    select_sql = f"SELECT id FROM word.cut_words_result WHERE text = '{text_esc}';"
    res = mysql_helper.get_data(sql=select_sql)
    if res is None or len(res) == 0:
        # Not present: insert.
        insert_sql = f"insert into word.cut_words_result (text, words, source) values ('{text_esc}', '{words}', {source});"
        log_.info(f"insert_sql = {insert_sql}")
        mysql_helper.add_data(sql=insert_sql)
    else:
        # Present: refresh words/source and the update timestamp.
        update_sql = f"""update word.cut_words_result set words = '{words}', source = {source}, 
        update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
        log_.info(f"update_sql = {update_sql}")
        mysql_helper.add_data(sql=update_sql)
+
+
def update_hot_word(words_list, source):
    """Upsert each segmented word into word.hot_word.

    :param words_list: words to store, type-list of str
    :param source: origin marker for these words, type-int
    :return: None
    """
    # Escape values before interpolating into SQL (words come from
    # crawled titles; MysqlHelper has no parameterized-query API).
    from pymysql.converters import escape_string
    for word in words_list:
        if len(word) == 0:
            continue
        word_esc = escape_string(word)
        # Does the word already exist?
        select_sql = f"SELECT id, source FROM word.hot_word WHERE word = '{word_esc}';"
        res = mysql_helper.get_data(sql=select_sql)
        if res is None or len(res) == 0:
            # New word: insert.
            insert_sql = f"insert into word.hot_word (word, source) values ('{word_esc}', {source});"
            mysql_helper.add_data(sql=insert_sql)
        else:
            # Existing word seen from a different source gets the
            # combined marker 3. Fix: the original assigned back to
            # `source`, so after the first mismatch every later word in
            # the batch was wrongly stored with source = 3.
            new_source = 3 if source != res[0][1] else source
            update_sql = \
                f"""update word.hot_word set source = {new_source}, 
                update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
            mysql_helper.add_data(sql=update_sql)
+
+
def data_update(project, table, now_date):
    """Run the daily segmentation pipeline over the day's titles.

    For each source (1 = on-site, 2 = off-site — NOTE(review): confirm
    the mapping), segments every title, stores the segmentation result,
    and upserts the individual words.

    :param project: ODPS project name
    :param table: table name
    :param now_date: datetime whose date selects the dt partition
    :return: None
    """
    df = get_title_data(project=project, table=table, now_date=now_date)
    df['source'] = df['source'].astype(int)
    for source in [1, 2]:
        df_temp = df[df['source'] == source]
        title_list = df_temp['title'].to_list()
        log_.info(f"source = {source}, count = {len(title_list)}")
        for title in title_list:
            log_.info(f"title: {title}")
            if len(title) == 0:
                # Skip empty titles. Fix: the original used `return`
                # here, aborting the whole run on the first empty title.
                continue
            # 1. segment the title
            words_list = word_cut(text=title)
            log_.info(f"words_list: {words_list}")
            # 2. store the segmentation result
            update_cut_words_result(text=title, source=source, words_list=words_list)
            # 3. upsert the individual words
            update_hot_word(words_list=words_list, source=source)
+
+
def timer_check():
    """Poll until today's partition is ready, then run the update.

    Re-schedules itself every 60 seconds via threading.Timer while the
    data is not yet available; any failure is logged and ends the loop.
    """
    try:
        project = config_.TITLE_DATA['project']
        table = config_.TITLE_DATA['table']
        now_date = datetime.datetime.today()
        log_.info(f"now_date: {now_date.strftime('%Y%m%d')}")
        # Is today's partition populated yet?
        data_count = data_check(project=project, table=table, now_date=now_date)
        if data_count > 0:
            log_.info(f'data_count = {data_count}')
            # Data is ready: run the pipeline.
            data_update(project=project, table=table, now_date=now_date)
            log_.info("data update end!")
        else:
            # Not ready: check again in one minute.
            Timer(60, timer_check).start()
    except Exception as e:
        log_.error(f"数据更新失败, exception: {e}, traceback: {traceback.format_exc()}")
+
+
if __name__ == '__main__':
    # Entry point: start the ready-check / update polling loop.
    timer_check()

+ 7 - 0
cut_words_task.sh

@@ -0,0 +1,7 @@
source /etc/profile
echo $HOT_WORDS_ENV
# Run the daily cut-words job from the deployment matching HOT_WORDS_ENV.
if [[ $HOT_WORDS_ENV == 'test' ]]; then
    cd /data2/hot-words && /root/anaconda3/bin/python /data2/hot-words/cut_words.py
elif [[ $HOT_WORDS_ENV == 'pro' ]]; then
    # Fix: production previously executed the test copy at /data2/hot-words.
    cd /data/hot-words && /root/anaconda3/bin/python /data/hot-words/cut_words.py
fi

+ 69 - 0
db_helper.py

@@ -0,0 +1,69 @@
+import pymysql
+from config import set_config
+from log import Log
+
+config_, env = set_config()
+log_ = Log()
+
+
class MysqlHelper(object):
    """Thin pymysql helper: opens one connection per statement."""

    def __init__(self):
        """Cache the environment-specific MySQL connection settings."""
        self.mysql_info = config_.MYSQL_INFO

    def get_data(self, sql):
        """Run a query and return all rows.

        :param sql: SQL statement
        :return: tuple of rows, or None when execution failed
        """
        # Connect and create a cursor.
        conn = pymysql.connect(**self.mysql_info)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            data = cursor.fetchall()
        except Exception as e:
            # Preserve the None-on-failure contract, but log the error —
            # the original swallowed it silently.
            log_.error(e)
            data = None
        finally:
            # Always release resources. Fix: the original returned from
            # inside the except block, leaking both cursor and
            # connection on every failed query.
            cursor.close()
            conn.close()
        return data

    def add_data(self, sql):
        """Run an INSERT/UPDATE/DELETE and commit; roll back on error.

        :param sql: SQL statement
        :return: None
        """
        conn = pymysql.connect(**self.mysql_info)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            conn.commit()
        except Exception as e:
            # Roll back the failed statement and record why.
            log_.error(e)
            conn.rollback()
        finally:
            cursor.close()
            conn.close()
+
+
if __name__ == '__main__':
    # Smoke test: run a SELECT against the configured database.
    mysql_helper = MysqlHelper()
    sql = "select * from hot_word;"
    data = mysql_helper.get_data(sql=sql)
    print(data)
+

+ 849 - 0
hit_stopwords.txt

@@ -0,0 +1,849 @@
+———
+》),
+)÷(1-
+”,
+)、
+=(
+:
+→
+℃ 
+&
+*
+一一
+~~~~
+’
+. 
+『
+.一
+./
+-- 
+』
+=″
+【
+[*]
+}>
+[⑤]]
+[①D]
+c]
+ng昉
+*
+//
+[
+]
+[②e]
+[②g]
+={
+}
+,也 
+‘
+A
+[①⑥]
+[②B] 
+[①a]
+[④a]
+[①③]
+[③h]
+③]
+1. 
+-- 
+[②b]
+’‘ 
+××× 
+[①⑧]
+0:2 
+=[
+[⑤b]
+[②c] 
+[④b]
+[②③]
+[③a]
+[④c]
+[①⑤]
+[①⑦]
+[①g]
+∈[ 
+[①⑨]
+[①④]
+[①c]
+[②f]
+[②⑧]
+[②①]
+[①C]
+[③c]
+[③g]
+[②⑤]
+[②②]
+一.
+[①h]
+.数
+[]
+[①B]
+数/
+[①i]
+[③e]
+[①①]
+[④d]
+[④e]
+[③b]
+[⑤a]
+[①A]
+[②⑧]
+[②⑦]
+[①d]
+[②j]
+〕〔
+][
+://
+′∈
+[②④
+[⑤e]
+12%
+b]
+...
+...................
+…………………………………………………③
+ZXFITL
+[③F]
+」
+[①o]
+]∧′=[ 
+∪φ∈
+′|
+{-
+②c
+}
+[③①]
+R.L.
+[①E]
+-[*]-
+↑
+.日 
+[②d]
+[②
+[②⑦]
+[②②]
+[③e]
+[①i]
+[①B]
+[①h]
+[①d]
+[①g]
+[①②]
+[②a]
+f]
+[⑩]
+a]
+[①e]
+[②h]
+[②⑥]
+[③d]
+[②⑩]
+e]
+〉
+】
+元/吨
+[②⑩]
+2.3%
+5:0  
+[①]
+::
+[②]
+[③]
+[④]
+[⑤]
+[⑥]
+[⑦]
+[⑧]
+[⑨] 
+……
+——
+?
+、
+。
+“
+”
+《
+》
+!
+,
+:
+;
+?
+.
+,
+.
+'
+? 
+———
+──
+? 
+—
+<
+>
+(
+)
+〔
+〕
+[
+]
+(
+)
+-
++
+~
+/
+/
+①
+②
+③
+④
+⑤
+⑥
+⑦
+⑧
+⑨
+⑩
+Ⅲ
+"
+;
+#
+@
+φ.
+■
+▲
+sub
+exp 
+sup
+sub
+Lex 
+#
+%
+&
+'
++
++ξ
+++
+-
+-β
+<
+<±
+<Δ
+<λ
+<φ
+<<
+=
+=
+=☆
+=-
+>
+>λ
+_
+~±
+~+
+[⑤f]
+[⑤d]
+[②i]
+≈ 
+[②G]
+[①f]
+LI
+㈧ 
+[-
+......
+〉
+[③⑩]
+第二
+一番
+一直
+一个
+一些
+许多
+种
+有的是
+也就是说
+末##末
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
+并且
+不比
+不成
+不单
+不但
+不独
+不管
+不光
+不过
+不仅
+不拘
+不论
+不怕
+不然
+不如
+不特
+不惟
+不问
+不只
+朝
+朝着
+趁
+趁着
+乘
+冲
+除
+除此之外
+除非
+除了
+此
+此间
+此外
+从
+从而
+打
+待
+但
+但是
+当
+当着
+到
+得
+的
+的话
+等
+等等
+地
+第
+叮咚
+对
+对于
+多
+多少
+而
+而况
+而且
+而是
+而外
+而言
+而已
+尔后
+反过来
+反过来说
+反之
+非但
+非徒
+否则
+嘎
+嘎登
+该
+赶
+个
+各
+各个
+各位
+各种
+各自
+给
+根据
+跟
+故
+故此
+固然
+关于
+管
+归
+果然
+果真
+过
+哈
+哈哈
+呵
+和
+何
+何处
+何况
+何时
+嘿
+哼
+哼唷
+呼哧
+乎
+哗
+还是
+还有
+换句话说
+换言之
+或
+或是
+或者
+极了
+及
+及其
+及至
+即
+即便
+即或
+即令
+即若
+即使
+几
+几时
+己
+既
+既然
+既是
+继而
+加之
+假如
+假若
+假使
+鉴于
+将
+较
+较之
+叫
+接着
+结果
+借
+紧接着
+进而
+尽
+尽管
+经
+经过
+就
+就是
+就是说
+据
+具体地说
+具体说来
+开始
+开外
+靠
+咳
+可
+可见
+可是
+可以
+况且
+啦
+来
+来着
+离
+例如
+哩
+连
+连同
+两者
+了
+临
+另
+另外
+另一方面
+论
+嘛
+吗
+慢说
+漫说
+冒
+么
+每
+每当
+们
+莫若
+某
+某个
+某些
+拿
+哪
+哪边
+哪儿
+哪个
+哪里
+哪年
+哪怕
+哪天
+哪些
+哪样
+那
+那边
+那儿
+那个
+那会儿
+那里
+那么
+那么些
+那么样
+那时
+那些
+那样
+乃
+乃至
+呢
+能
+你
+你们
+您
+宁
+宁可
+宁肯
+宁愿
+哦
+呕
+啪达
+旁人
+呸
+凭
+凭借
+其
+其次
+其二
+其他
+其它
+其一
+其余
+其中
+起
+起见
+起见
+岂但
+恰恰相反
+前后
+前者
+且
+然而
+然后
+然则
+让
+人家
+任
+任何
+任凭
+如
+如此
+如果
+如何
+如其
+如若
+如上所述
+若
+若非
+若是
+啥
+上下
+尚且
+设若
+设使
+甚而
+甚么
+甚至
+省得
+时候
+什么
+什么样
+使得
+是
+是的
+首先
+谁
+谁知
+顺
+顺着
+似的
+虽
+虽然
+虽说
+虽则
+随
+随着
+所
+所以
+他
+他们
+他人
+它
+它们
+她
+她们
+倘
+倘或
+倘然
+倘若
+倘使
+腾
+替
+通过
+同
+同时
+哇
+万一
+往
+望
+为
+为何
+为了
+为什么
+为着
+喂
+嗡嗡
+我
+我们
+呜
+呜呼
+乌乎
+无论
+无宁
+毋宁
+嘻
+吓
+相对而言
+像
+向
+向着
+嘘
+呀
+焉
+沿
+沿着
+要
+要不
+要不然
+要不是
+要么
+要是
+也
+也罢
+也好
+一
+一般
+一旦
+一方面
+一来
+一切
+一样
+一则
+依
+依照
+矣
+以
+以便
+以及
+以免
+以至
+以至于
+以致
+抑或
+因
+因此
+因而
+因为
+哟
+用
+由
+由此可见
+由于
+有
+有的
+有关
+有些
+又
+于
+于是
+于是乎
+与
+与此同时
+与否
+与其
+越是
+云云
+哉
+再说
+再者
+在
+在下
+咱
+咱们
+则
+怎
+怎么
+怎么办
+怎么样
+怎样
+咋
+照
+照着
+者
+这
+这边
+这儿
+这个
+这会儿
+这就是说
+这里
+这么
+这么点儿
+这么些
+这么样
+这时
+这些
+这样
+正如
+吱
+之
+之类
+之所以
+之一
+只是
+只限
+只要
+只有
+至
+至于
+诸位
+着
+着呢
+自
+自从
+自个儿
+自各儿
+自己
+自家
+自身
+综上所述
+总的来看
+总的来说
+总的说来
+总而言之
+总之
+纵
+纵令
+纵然
+纵使
+遵照
+作为
+兮
+呃
+呗
+咚
+咦
+喏
+啐
+喔唷
+嗬
+嗯
+嗳
+看完大快人心!
+刚刚曝光
+头一次见
+第一次见
+看看
+速看
+必看
+快看看
+赶紧看
+打开看看
+一起看看
+一起听听
+赶紧看看
+一定要看
+值得一看
+国人必看
+值得看看
+一定要看
+不看可惜
+全民必看
+大家速看
+我看完了
+发给大家看看
+发给大家都看看
+刚刚
+刚刚传来
+刚刚发生
+刚刚公开
+就在刚刚
+十万火急
+紧急通知
+爆炸消息传来
+转发就是支持
+全球震怒
+突然暴露
+重磅提醒
+今早传来
+全民疯传
+现在知道还不晚
+早知道早受益
+请转发
+事关所有人
+终于公开
+重磅来袭
+不可思议
+噩耗传来
+前所未闻
+传来消息
+今日周一
+今日周二
+今日周三
+今日周四
+今日周五
+今日周六
+今日周末
+务必警惕
+今天周
+看完转发
+二个
+终于
+今天
+有人
+知道
+不是
+大家
+一分
+传来
+打开
+一起
+赶紧
+一定
+值得
+看完
+大家
+发生
+公开
+紧急
+通知
+听听
+紧急
+通知
+今日

+ 40 - 0
log.py

@@ -0,0 +1,40 @@
+import logging
+import logging.config
+
+from log_conf import conf
+
+
class Log(object):
    """Logging facade that routes each level to its configured logger.

    INFO goes to the 'sls' logger, DEBUG/WARNING to 'root', ERROR to
    'error' (loggers defined in log_conf.conf).
    """

    def __init__(self):
        # Install the shared dictConfig on construction.
        logging.config.dictConfig(conf)

    def __console(self, level, message):
        # Map a level name to the logger that should handle it, then
        # invoke the method of the same name on that logger.
        routing = {'info': 'sls', 'debug': 'root', 'warning': 'root', 'error': 'error'}
        logger_name = routing.get(level)
        if logger_name is None:
            # Unknown level: drop the message, same as the original.
            return
        getattr(logging.getLogger(logger_name), level)(message)

    def debug(self, message):
        self.__console('debug', message)

    def info(self, message):
        self.__console('info', message)

    def warning(self, message):
        self.__console('warning', message)

    def error(self, message):
        self.__console('error', message)

+ 87 - 0
log_conf.py

@@ -0,0 +1,87 @@
+# log conf
+import logging
+import aliyun
+import os
+import time
+from config import set_config
+config_, env = set_config()
+
+# 本地日志存储路径
# Local log directory, created next to this module.
log_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "logs")
if not os.path.exists(log_path):
    os.makedirs(log_path)
# Daily log file named after the current date (YYYYMMDD.log).
log_name = os.path.join(log_path, '{}.log'.format(time.strftime('%Y%m%d')))

# logging.config.dictConfig schema consumed by log.Log.
conf = {
    'version': 1,
    'formatters': {
        # Message-only format intended for the Aliyun SLS handlers.
        'rawFormatter': {
            'class': 'logging.Formatter',
            'format': '%(message)s'
        },
        # Timestamped format for console/file output.
        'simpleFormatter': {
            'class': 'logging.Formatter',
            'format': '%(asctime)s %(levelname)s: %(message)s'
        }
    },
    'handlers': {
        'consoleHandler': {
            '()': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'simpleFormatter',
        },
        # Aliyun Log Service handler for INFO-level records.
        'slsHandler': {
            '()': 'aliyun.log.QueuedLogHandler',
            'level': 'INFO',
            'formatter': 'rawFormatter',
            # custom args:
            'end_point': config_.ALIYUN_LOG.get('ENDPOINT', ''),
            'access_key_id': config_.ALIYUN_LOG.get('ACCESSID', ''),
            'access_key': config_.ALIYUN_LOG.get('ACCESSKEY', ''),
            'project': config_.ALIYUN_LOG.get('PROJECT', ''),
            'log_store': "info",
            'extract_kv': True,
            'extract_json': True
        },
        # Aliyun Log Service handler for ERROR-level records.
        'errorHandler': {
            '()': 'aliyun.log.QueuedLogHandler',
            'level': 'ERROR',
            'formatter': 'rawFormatter',
            # custom args:
            'end_point': config_.ALIYUN_LOG.get('ENDPOINT', ''),
            'access_key_id': config_.ALIYUN_LOG.get('ACCESSID', ''),
            'access_key': config_.ALIYUN_LOG.get('ACCESSKEY', ''),
            'project': config_.ALIYUN_LOG.get('PROJECT', ''),
            'log_store': "error",
            'extract_kv': True,
            'extract_json': True
        },
        # Local daily log file.
        'fileHandler': {
            '()': 'logging.FileHandler',
            'level': 'INFO',
            'formatter': 'simpleFormatter',
            'filename': log_name,
            'mode': 'a',
            'encoding': 'utf-8'
        }
    },
    'loggers': {
        'root': {
            'handlers': ['consoleHandler', ],
            'level': 'DEBUG'
        },
        # NOTE(review): the SLS handlers are currently disabled in favour
        # of console + local-file output (see the commented lines below).
        'sls': {
            # 'handlers': ['consoleHandler', 'slsHandler'],
            'handlers': ['consoleHandler', 'fileHandler'],
            'level': 'INFO',
            'propagate': False
        },
        'error': {
            # 'handlers': ['consoleHandler', 'errorHandler'],
            'handlers': ['consoleHandler', 'fileHandler'],
            'level': 'ERROR',
            'propagate': False
        }
    }
}

+ 7 - 0
requirements.txt

@@ -0,0 +1,7 @@
+pyodps==0.10.7
+pandas==1.1.3
+requests==2.24.0
+PyMySQL==1.0.2
+jieba==0.42.1
+aliyun_python_sdk==2.2.0
+odps==3.5.1

+ 50 - 0
update_common_words.py

@@ -0,0 +1,50 @@
+import sys
+
+from utils.feishu import FeiShuHelper
+from db_helper import MysqlHelper
+from config import set_config
+from log import Log
+
+config_, env = set_config()
+log_ = Log()
+mysql_helper = MysqlHelper()
+
+
def add_words2mysql(sheet_name, source):
    """Sync words from a Feishu spreadsheet into word.hot_word.

    Reads all cell values from the configured sheet, de-duplicates them,
    and inserts the ones not already present, in batches of 100.

    :param sheet_name: key into config_.SHEET_INFO
    :param source: source marker stored with each new word
    :return: None
    """
    sheet_info = config_.SHEET_INFO.get(sheet_name)
    # Pull all non-empty cell values from the sheet.
    feishu_helper = FeiShuHelper()
    data = feishu_helper.get_data(spreadsheet_token=sheet_info.get('spreadsheet_token'),
                                  sheet_id=sheet_info.get('sheet_id'))
    words = [word for item in data for word in item if word is not None and word != '']
    words = list(set(words))
    log_.info(f"words count = {len(words)}")

    def _quote(w):
        # Escape backslashes and single quotes for MySQL string literals.
        return "'{}'".format(w.replace("\\", "\\\\").replace("'", "\\'"))

    batch_size = 100
    for i in range(len(words) // batch_size + 1):
        log_.info(f"i = {i}")
        words_list = words[i * batch_size:(i + 1) * batch_size]
        if len(words_list) == 0:
            continue
        # Build the IN (...) list by hand. Fix: the original used
        # f"... in {tuple(words_list)}", which produces invalid SQL for
        # a single element (trailing comma) and never escaped quotes.
        in_values = ', '.join(_quote(w) for w in words_list)
        select_sql = f"select word from word.hot_word where word in ({in_values})"
        existing = mysql_helper.get_data(sql=select_sql)
        # Fix: get_data returns None on failure; the original then
        # crashed iterating it. Treat failure as "nothing found" (may
        # insert duplicates in that rare case, which the upsert logic
        # elsewhere tolerates — NOTE(review): confirm acceptable).
        exist_words = [row[0] for row in existing] if existing else []
        insert_words = list(set(words_list).difference(set(exist_words)))
        log_.info(f"words_list count = {len(words_list)}, "
                  f"exist_words count = {len(exist_words)}, "
                  f"insert_words count = {len(insert_words)}")
        if len(insert_words) > 0:
            # Multi-row insert of the genuinely new words.
            sql_values = ', '.join(f"({_quote(w)}, {source})" for w in insert_words)
            insert_sql = f"insert into word.hot_word (word, source) values {sql_values};"
            log_.info(f"insert_sql = {insert_sql}")
            mysql_helper.add_data(sql=insert_sql)
+
+
if __name__ == '__main__':
    # Usage: python update_common_words.py <sheet_name> <source>
    # NOTE(review): `source` arrives as a string from argv and is later
    # interpolated unquoted into SQL — confirm an int cast is intended.
    # add_words2mysql(sheet_name='微信指数搜索常用词样本-人工标注站内高频关键词', source=1)
    sheet_name, source = sys.argv[1], sys.argv[2]
    add_words2mysql(sheet_name, source)

+ 7 - 0
update_common_words_task.sh

@@ -0,0 +1,7 @@
source /etc/profile
echo $HOT_WORDS_ENV
# Push the daily keyword sheet into MySQL, from the deployment matching
# HOT_WORDS_ENV.
if [[ $HOT_WORDS_ENV == 'test' ]]; then
    cd /data2/hot-words && /root/anaconda3/bin/python /data2/hot-words/update_common_words.py '微信指数自动化搜索-每日关键词' 1
elif [[ $HOT_WORDS_ENV == 'pro' ]]; then
    # Fix: production previously executed the test copy at /data2/hot-words.
    cd /data/hot-words && /root/anaconda3/bin/python /data/hot-words/update_common_words.py '微信指数自动化搜索-每日关键词' 1
fi

+ 0 - 0
utils/__init__.py


+ 42 - 0
utils/feishu.py

@@ -0,0 +1,42 @@
+from utils.utils import request_post, request_get
+from config import set_config
+
+config_, env = set_config()
+
+
class FeiShuHelper(object):
    """Minimal Feishu (Lark) Open API client for reading spreadsheets."""

    @staticmethod
    def get_tenant_access_token():
        """Fetch the tenant_access_token for the self-built app.

        :return: token string, or None when the request failed
        """
        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
        headers = {"Content-Type": "application/json; charset=utf-8"}
        request_data = config_.FEISHU_TOKEN
        data = request_post(request_url=url, headers=headers, request_data=request_data)
        if data is not None:
            return data.get('tenant_access_token')
        # Explicit None (the original fell off the end implicitly).
        return None

    def get_data(self, spreadsheet_token, sheet_id):
        """Read all values from one sheet of a spreadsheet.

        :param spreadsheet_token: spreadsheet identifier
        :param sheet_id: sheet (tab) identifier, passed as the range
        :return: list of row-value lists; [] when the request failed or
            the response payload has an unexpected shape
        """
        tenant_access_token = self.get_tenant_access_token()
        url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{spreadsheet_token}/values_batch_get"
        headers = {
            "Content-Type": "application/json; charset=utf-8",
            "Authorization": f"Bearer {tenant_access_token}"
        }
        params = {
            'ranges': sheet_id,
        }
        data = request_get(request_url=url, headers=headers, params=params)
        values = []
        if data is not None:
            try:
                values = data['data']['valueRanges'][0].get('values')
            except (KeyError, IndexError, TypeError):
                # Narrowed from a bare except: only unexpected payload
                # shapes are anticipated here.
                values = []
        return values
+
+
if __name__ == '__main__':
    # Smoke test: read the common-words sheet with real credentials.
    sheet_info = config_.SHEET_INFO['汉语常用词汇表']
    FeiShuHelper().get_data(spreadsheet_token=sheet_info.get('spreadsheet_token'), sheet_id=sheet_info.get('sheet_id'))

+ 75 - 0
utils/utils.py

@@ -0,0 +1,75 @@
+import requests
+import json
+import traceback
+from odps import ODPS
+from log import Log
+from config import set_config
+log_ = Log()
+config_, env = set_config()
+
+
def request_post(request_url, headers, request_data):
    """POST to an HTTP endpoint and return the decoded JSON body.

    :param request_url: target URL
    :param headers: request headers
    :param request_data: JSON-serialisable payload
    :return: parsed response dict on HTTP 200, else None
    """
    try:
        response = requests.post(url=request_url, json=request_data, headers=headers)
        if response.status_code != 200:
            # Non-200 is treated as failure, same as the original.
            return None
        return json.loads(response.text)
    except Exception as e:
        log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
        return None
+
+
def request_get(request_url, headers, params=None):
    """GET an HTTP endpoint and return the decoded JSON body.

    :param request_url: target URL
    :param headers: request headers
    :param params: optional query-string parameters
    :return: parsed response dict on HTTP 200, else None
    """
    try:
        response = requests.get(url=request_url, headers=headers, params=params)
        if response.status_code != 200:
            # Non-200 is treated as failure, same as the original.
            return None
        return json.loads(response.text)
    except Exception as e:
        log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
        return None
+
+
def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
                       pool_maxsize=1000, pool_connections=1000):
    """Read one day's partition (dt=<date>) from an ODPS table.

    :param date: partition date, '%Y%m%d' string
    :param project: ODPS project name
    :param table: table name
    :param connect_timeout: connection timeout
    :param read_timeout: read timeout
    :param pool_maxsize: connection pool max size
    :param pool_connections: number of pooled connections
    :return: iterable of partition records
    """
    client = ODPS(
        access_id=config_.ODPS_CONFIG['ACCESSID'],
        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
        project=project,
        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
        connect_timeout=connect_timeout,
        read_timeout=read_timeout,
        pool_maxsize=pool_maxsize,
        pool_connections=pool_connections,
    )
    return client.read_table(name=table, partition='dt={}'.format(date))

+ 69 - 0
words_func.py

@@ -0,0 +1,69 @@
+import datetime
+import time
+import traceback
+import re
+import jieba
+import jieba.posseg as pseg
+
+from db_helper import MysqlHelper
+from log import Log
+
+mysql_helper = MysqlHelper()
+log_ = Log()
+
+
def get_stop_words():
    """Load the stop-word list from hit_stopwords.txt (one word per line).

    :return: list of stop words; includes a trailing '' entry when the
        file ends with a newline (callers only test membership, so the
        empty string is harmless)
    """
    # Open read-only (the original used 'r+', needlessly requiring write
    # access) and close the handle deterministically (the original
    # leaked it).
    with open('hit_stopwords.txt', 'r', encoding='utf-8') as stop_file:
        return stop_file.read().split("\n")
+
+
def filter_emoji(text):
    """Strip emoji / pictograph code points from *text*.

    Removed ranges:
      U+1F300-U+1F64F  symbols, pictographs and emoticons
      U+1F680-U+1F6FF  transport and map symbols
      U+2600-U+2B55    miscellaneous symbols
      U+10000-U+10FFFF supplementary-plane characters
    (The character class also contains a literal space, so ASCII spaces
    are removed as well — behaviour kept from the original.)
    """
    pattern = re.compile(u'['u'\U0001F300-\U0001F64F' u'\U0001F680-\U0001F6FF' u'\u2600-\u2B55 \U00010000-\U0010ffff]+')
    return pattern.sub('', text)
+
+
def word_cut(text):
    """Segment a title into keyword tokens.

    Pipeline: strip whitespace and emoji, run jieba exact-mode
    segmentation, then drop numerals/quantifiers/pronouns (POS tags m,
    mq, q, r), stop words, tokens containing spaces, pure-digit tokens,
    and single-character tokens.

    NOTE(review): the stop-word file is re-read from disk on every call;
    consider caching it at module level if this becomes hot.

    :param text: raw title text
    :return: list of kept tokens
    """
    # Load the stop-word list (hit_stopwords.txt).
    stop_words = get_stop_words()
    # Trim surrounding whitespace.
    text = text.strip()
    # Remove emoji / pictograph characters.
    text = filter_emoji(text)
    # Exact-mode jieba segmentation.
    seg_list = jieba.cut(text, cut_all=False)
    seg_list = [seg for seg in seg_list]
    # print(seg_list)
    # Drop numerals (m), numeral-classifier compounds (mq), measure
    # words (q) and pronouns (r) by part-of-speech tag.
    if len(seg_list) > 1:
        words = []
        for seg in seg_list:
            words += [(word, flag) for word, flag in pseg.cut(seg)]
        seg_list = [word for word, flag in words if flag not in ['m', 'mq', 'q', 'r']]
        # print(seg_list)
    # Remove stop words.
    seg_list = [seg for seg in seg_list if seg not in stop_words]
    # Remove tokens that contain spaces.
    seg_list = [seg for seg in seg_list if ' ' not in seg]
    # Remove pure-digit tokens.
    seg_list = [seg for seg in seg_list if seg.isdigit() is False]
    # Remove single-character tokens.
    seg_list = [seg for seg in seg_list if len(seg) > 1]
    # print(seg_list)
    return seg_list
+
+
if __name__ == '__main__':
    # Ad-hoc smoke test for the segmentation pipeline.
    # get_words(8, 100)
    # get_words(1, 20)
    # get_words(2, 10)
    word_cut('中国参加联合国五常阅兵,出场惊人,让所有人都震惊!')