Explorar o código

add words to mysql

liqian hai 2 anos
pai
achega
e2afc63138
Modificáronse 7 ficheiros con 156 adicións e 6 borrados
  1. 21 1
      config.py
  2. 3 0
      db_helper.py
  3. 3 3
      log_conf.py
  4. 43 0
      update_common_words.py
  5. 0 0
      utils/__init__.py
  6. 41 2
      utils/feishu.py
  7. 45 0
      utils/utils.py

+ 21 - 1
config.py

@@ -8,7 +8,27 @@ class BaseConfig(object):
         'app_id': 'cli_a3667697a57b500e',
         'app_secret': '5eMszgeNt21U56XnPjCykgmTfZUEEMnp'
     }
-    pass
+    # Spreadsheet info.
+    # Maps a sheet's display name to the spreadsheet_token / sheet_id pair
+    # that identifies it; entries are looked up by name via SHEET_INFO.get().
+    SHEET_INFO = {
+        '汉语常用词汇表': {
+            'spreadsheet_token': 'shtcnU8JgPeMq5wAuKejptwtLof',
+            'sheet_id': 'wnB24K'
+        },
+        '微信指数搜索常用词样本-站内标题分词': {
+            'spreadsheet_token': 'shtcnHxCj6dZBYMuK1Q3tIJVlqg',
+            'sheet_id': 'nCudsM'
+        },
+        '微信指数搜索常用词样本-人工标注站内高频关键词': {
+            'spreadsheet_token': 'shtcnHxCj6dZBYMuK1Q3tIJVlqg',
+            'sheet_id': 'n9Jo4j'
+        }
+
+    }
+    # MySQL tables: maps a descriptive label to the table name.
+    MYSQL_TABLES = {
+        '热点词库': 'hot_word',
+        '热词指数': 'word_wechat_score'
+    }
 
 
 class DevelopmentConfig(BaseConfig):

+ 3 - 0
db_helper.py

@@ -1,7 +1,9 @@
 import pymysql
 from config import set_config
+from log import Log
 
 config_, env = set_config()
+log_ = Log()
 
 
 class MysqlHelper(object):
@@ -51,6 +53,7 @@ class MysqlHelper(object):
             conn.commit()
         except Exception as e:
             # 发生错误时回滚
+            log_.error(e)
             conn.rollback()
         # 关闭游标对象
         cursor.close()

+ 3 - 3
log_conf.py

@@ -4,7 +4,7 @@ import aliyun
 import os
 import time
 from config import set_config
-config_ = set_config()
+config_, env = set_config()
 
 # 本地日志存储路径
 log_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "logs")
@@ -73,13 +73,13 @@ conf = {
         },
         'sls': {
             # 'handlers': ['consoleHandler', 'slsHandler'],
-            'handlers': ['consoleHandler'],
+            'handlers': ['consoleHandler', 'fileHandler'],
             'level': 'INFO',
             'propagate': False
         },
         'error': {
             # 'handlers': ['consoleHandler', 'errorHandler'],
-            'handlers': ['consoleHandler'],
+            'handlers': ['consoleHandler', 'fileHandler'],
             'level': 'ERROR',
             'propagate': False
         }

+ 43 - 0
update_common_words.py

@@ -0,0 +1,43 @@
+from feishu import FeiShuHelper
+from db_helper import MysqlHelper
+from config import set_config
+from log import Log
+
+config_, env = set_config()
+log_ = Log()
+mysql_helper = MysqlHelper()
+
+
+def _quote_sql_literal(value):
+    """Return ``value`` rendered as a single-quoted SQL string literal.
+
+    Single quotes are doubled and backslashes are escaped so a word cannot
+    terminate the literal early. Backslash doubling assumes the MySQL server
+    runs without NO_BACKSLASH_ESCAPES in sql_mode -- TODO confirm.
+    """
+    text = str(value).replace('\\', '\\\\').replace("'", "''")
+    return f"'{text}'"
+
+
+def add_words2mysql(sheet_name, source, batch_size=100):
+    """Insert the words of a spreadsheet into ``word.hot_word``, skipping existing ones.
+
+    :param sheet_name: key into ``config_.SHEET_INFO`` naming the sheet to read
+    :param source: value stored in the ``source`` column for every inserted word;
+                   it is interpolated into the SQL verbatim, so it must be numeric
+    :param batch_size: number of words handled per select/insert round trip
+    :return: None
+    """
+    sheet_info = config_.SHEET_INFO.get(sheet_name)
+    if sheet_info is None:
+        # An unknown name would otherwise fail with AttributeError on None.get().
+        log_.error(f"unknown sheet_name: {sheet_name}")
+        return
+    # Fetch the rows of the sheet
+    feishu_helper = FeiShuHelper()
+    rows = feishu_helper.get_data(spreadsheet_token=sheet_info.get('spreadsheet_token'),
+                                  sheet_id=sheet_info.get('sheet_id'))
+    # Flatten the rows, drop empty cells and de-duplicate
+    words = list({word for row in rows or [] for word in row if word is not None and word != ''})
+    log_.info(f"words count = {len(words)}")
+    # Insert into mysql batch by batch
+    for i, start in enumerate(range(0, len(words), batch_size)):
+        log_.info(f"i = {i}")
+        words_list = words[start:start + batch_size]
+        # De-duplicate against the rows already stored in the database.
+        # The IN list is built explicitly: str(tuple) would render a
+        # one-element batch as ('x',), which is not valid SQL.
+        in_clause = ', '.join(_quote_sql_literal(word) for word in words_list)
+        select_sql = f"select word from word.hot_word where word in ({in_clause})"
+        data = mysql_helper.get_data(sql=select_sql)
+        exist_words = [item[0] for item in data or ()]
+        known = set(exist_words)
+        insert_words = [word for word in words_list if word not in known]
+        log_.info(f"words_list count = {len(words_list)}, "
+                  f"exist_words count = {len(exist_words)}, "
+                  f"insert_words count = {len(insert_words)}")
+        if not insert_words:
+            continue
+        # Assemble the multi-row insert statement
+        sql_values = ', '.join(f"({_quote_sql_literal(word)}, {source})" for word in insert_words)
+        insert_sql = f"insert into word.hot_word (word, source) values {sql_values};"
+        log_.info(f"insert_sql = {insert_sql}")
+        mysql_helper.add_data(sql=insert_sql)
+
+
+# Script entry point: load the words of the sheet named below and insert
+# them into mysql with source=1.
+if __name__ == '__main__':
+    add_words2mysql(sheet_name='微信指数搜索常用词样本-人工标注站内高频关键词', source=1)

+ 0 - 0
utils/__init__.py


+ 41 - 2
utils/feishu.py

@@ -1,3 +1,42 @@
-class FeiShu(object):
-    def get_tenant_access_token(self):
+from utils.utils import request_post, request_get
+from config import set_config
 
+config_, env = set_config()
+
+
+class FeiShuHelper(object):
+    """Thin client for the Feishu open-platform endpoints used by this project."""
+
+    @staticmethod
+    def get_tenant_access_token():
+        """Fetch the tenant_access_token of the self-built application.
+
+        :return: the token from the response, or None when the request failed
+                 or the response carries no ``tenant_access_token`` field
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
+        headers = {"Content-Type": "application/json; charset=utf-8"}
+        request_data = config_.FEISHU_TOKEN
+        data = request_post(request_url=url, headers=headers, request_data=request_data)
+        if data is None:
+            return None
+        return data.get('tenant_access_token')
+
+    def get_data(self, spreadsheet_token, sheet_id):
+        """Read the cell values of one sheet of a spreadsheet.
+
+        :param spreadsheet_token: token identifying the spreadsheet
+        :param sheet_id: id of the sheet, passed as the ``ranges`` query parameter
+        :return: the ``values`` of the first returned range; always a list,
+                 empty when the request failed or the response has another shape
+        """
+        tenant_access_token = self.get_tenant_access_token()
+        url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{spreadsheet_token}/values_batch_get"
+        headers = {
+            "Content-Type": "application/json; charset=utf-8",
+            "Authorization": f"Bearer {tenant_access_token}"
+        }
+        params = {
+            'ranges': sheet_id,
+        }
+        data = request_get(request_url=url, headers=headers, params=params)
+        if data is None:
+            return []
+        try:
+            values = data['data']['valueRanges'][0].get('values')
+        except (KeyError, IndexError, TypeError, AttributeError):
+            # Only shape mismatches are treated as "no data"; a bare except
+            # would also swallow KeyboardInterrupt and SystemExit.
+            return []
+        # A range without a 'values' key yields None; callers iterate the result.
+        return values or []
+
+
+# Manual run: read the '汉语常用词汇表' sheet; the returned values are discarded.
+if __name__ == '__main__':
+    sheet_info = config_.SHEET_INFO['汉语常用词汇表']
+    FeiShuHelper().get_data(spreadsheet_token=sheet_info.get('spreadsheet_token'), sheet_id=sheet_info.get('sheet_id'))

+ 45 - 0
utils/utils.py

@@ -0,0 +1,45 @@
+import requests
+import json
+import traceback
+from log import Log
+log_ = Log()
+
+
+def request_post(request_url, headers, request_data, timeout=30):
+    """
+    Send a POST request with a JSON body to an HTTP endpoint.
+    :param request_url: endpoint URL
+    :param headers: request headers
+    :param request_data: payload, sent as the JSON body
+    :param timeout: seconds to wait for the server before giving up
+    :return: the decoded JSON response, or None on a non-200 status or any exception
+    """
+    try:
+        # Without a timeout requests waits indefinitely on a stalled server.
+        response = requests.post(url=request_url, json=request_data, headers=headers, timeout=timeout)
+        if response.status_code != 200:
+            log_.error('url: {}, unexpected status code: {}'.format(request_url, response.status_code))
+            return None
+        return json.loads(response.text)
+    except Exception as e:
+        log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
+        return None
+
+
+def request_get(request_url, headers, params=None, timeout=30):
+    """
+    Send a GET request to an HTTP endpoint.
+    :param request_url: endpoint URL
+    :param headers: request headers
+    :param params: query-string parameters
+    :param timeout: seconds to wait for the server before giving up
+    :return: the decoded JSON response, or None on a non-200 status or any exception
+    """
+    try:
+        # Without a timeout requests waits indefinitely on a stalled server.
+        response = requests.get(url=request_url, headers=headers, params=params, timeout=timeout)
+        if response.status_code != 200:
+            log_.error('url: {}, unexpected status code: {}'.format(request_url, response.status_code))
+            return None
+        return json.loads(response.text)
+    except Exception as e:
+        log_.error('url: {}, exception: {}, traceback: {}'.format(request_url, e, traceback.format_exc()))
+        return None