liqian hai 2 anos
pai
achega
faf5d763a9
Modificáronse 5 ficheiros con 27 adicións e 16 borrados
  1. 1 1
      app.py
  2. 6 7
      config.py
  3. 12 7
      cut_words.py
  4. 7 0
      cut_words_task.sh
  5. 1 1
      utils/utils.py

+ 1 - 1
app.py

@@ -91,7 +91,7 @@ def update_words():
 
 # 获取热点词库中当天更新的所有热词
 @app.route('/hot/word/getTodayWords', methods=['GET', 'POST'])
-def get_all_words():
+def get_today_all_words():
     try:
         start_time = time.time()
         request_data = json.loads(request.get_data())

+ 6 - 7
config.py

@@ -44,12 +44,11 @@ class BaseConfig(object):
 
     # 站内外标题数据表
     TITLE_DATA = {
-        'project': '',
-        'table': ''
+        'project': 'loghubods',
+        'table': 'crawler_hot_title_1'
     }
 
 
-
 class DevelopmentConfig(BaseConfig):
     """开发环境配置"""
     # 报警内容 环境区分
@@ -64,7 +63,7 @@ class DevelopmentConfig(BaseConfig):
         'user': 'wx2016_longvideo',
         'password': 'wx2016_longvideoP@assword1234',
         'db': 'word',
-        'charset': 'utf8'
+        'charset': 'utf8mb4'
     }
 
     # 日志服务配置
@@ -90,7 +89,7 @@ class TestConfig(BaseConfig):
         'user': 'wx2016_longvideo',
         'password': 'wx2016_longvideoP@assword1234',
         'db': 'word',
-        'charset': 'utf8'
+        'charset': 'utf8mb4'
     }
 
     # 日志服务配置
@@ -116,7 +115,7 @@ class PreProductionConfig(BaseConfig):
         'user': 'word',
         'password': 'Piaoquan123@',
         'db': 'word',
-        'charset': 'utf8'
+        'charset': 'utf8mb4'
     }
 
     # 日志服务配置
@@ -142,7 +141,7 @@ class ProductionConfig(BaseConfig):
         'user': 'word',
         'password': 'Piaoquan123@',
         'db': 'word',
-        'charset': 'utf8'
+        'charset': 'utf8mb4'
     }
 
     # 日志服务配置

+ 12 - 7
cut_words.py

@@ -8,7 +8,7 @@ from log import Log
 from utils.utils import get_data_from_odps
 from words_func import word_cut
 from db_helper import MysqlHelper
-config_ = set_config()
+config_, env = set_config()
 log_ = Log()
 mysql_helper = MysqlHelper()
 features = ['title', 'source']
@@ -55,7 +55,7 @@ def data_check(project, table, now_date):
     )
 
     try:
-        dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
+        dt = datetime.datetime.strftime(now_date, '%Y%m%d')
         check_res = check_table_partition_exits(date=dt, project=project, table=table)
         if check_res:
             sql = f'select * from {project}.{table} where dt = {dt}'
@@ -97,12 +97,14 @@ def update_cut_words_result(text, source, words_list):
     res = mysql_helper.get_data(sql=select_sql)
     if res is None or len(res) == 0:
         # 不存在,插入
-        insert_sql = f"insert into word.cut_words_result (text, words, source) values '{text}', '{words}', {source};"
+        insert_sql = f"insert into word.cut_words_result (text, words, source) values ('{text}', '{words}', {source});"
+        log_.info(f"insert_sql = {insert_sql}")
         mysql_helper.add_data(sql=insert_sql)
     else:
         # 存在,更新
         update_sql = f"""update word.cut_words_result set words = '{words}', source = {source}, 
-        update_time = {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} where id = {res[0][0]};"""
+        update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
+        log_.info(f"update_sql = {update_sql}")
         mysql_helper.add_data(sql=update_sql)
 
 
@@ -121,7 +123,7 @@ def update_hot_word(words_list, source):
         res = mysql_helper.get_data(sql=select_sql)
         if res is None or len(res) == 0:
             # 不存在,插入
-            insert_sql = f"insert into word.hot_word (word, source) values '{word}', {source};"
+            insert_sql = f"insert into word.hot_word (word, source) values ('{word}', {source});"
             mysql_helper.add_data(sql=insert_sql)
         else:
             # 存在,更新
@@ -129,7 +131,7 @@ def update_hot_word(words_list, source):
                 source = 3
             update_sql = \
                 f"""update word.hot_word set source = {source}, 
-                update_time = {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")} where id = {res[0][0]};"""
+                update_time = '{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}' where id = {res[0][0]};"""
             mysql_helper.add_data(sql=update_sql)
 
 
@@ -137,15 +139,18 @@ def data_update(project, table, now_date):
     """数据更新"""
     # 获取站内外视频标题数据
     df = get_title_data(project=project, table=table, now_date=now_date)
-    df = df['tag'].astype(int)
+    df['source'] = df['source'].astype(int)
     for source in [1, 2]:
         df_temp = df[df['source'] == source]
         title_list = df_temp['title'].to_list()
+        log_.info(f"source = {source}, count = {len(title_list)}")
         for title in title_list:
+            log_.info(f"title: {title}")
             if len(title) == 0:
                 return
             # 1. 分词
             words_list = word_cut(text=title)
+            log_.info(f"words_list: {words_list}")
             # 2. 分词结果入库
             update_cut_words_result(text=title, source=source, words_list=words_list)
             # 3. 词入库

+ 7 - 0
cut_words_task.sh

@@ -0,0 +1,7 @@
+source /etc/profile
+echo $HOT_WORDS_ENV
+if [[ $HOT_WORDS_ENV == 'test' ]]; then
+    cd /data2/hot-words && /root/anaconda3/bin/python /data2/hot-words/cut_words.py
+elif [[ $HOT_WORDS_ENV == 'pro' ]]; then
+    cd /data/hot-words && /root/anaconda3/bin/python /data2/hot-words/cut_words.py
+fi

+ 1 - 1
utils/utils.py

@@ -5,7 +5,7 @@ from odps import ODPS
 from log import Log
 from config import set_config
 log_ = Log()
-config_ = set_config()
+config_, env = set_config()
 
 
 def request_post(request_url, headers, request_data):