Browse Source

update mini program info

luojunhui 3 months ago
parent
commit
5d989e6a1e

+ 22 - 0
sh/run_update_minigram_info_daily.sh

@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Current date in YYYY-MM-DD format
+CURRENT_DATE=$(date +%F)
+
+# Log file path; the date suffix gives one log file per day
+LOG_FILE="/root/luojunhui/logs/update_mini_program_info_task_log_$CURRENT_DATE.txt"
+
+# Append all output of this script (stdout and stderr) to the dated log file
+exec >> "$LOG_FILE" 2>&1
+if pgrep -f "python3 update_mini_info_v2.py" > /dev/null
+then
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - update_mini_info_v2.py is running"
+else
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - trying to restart update_mini_info_v2.py"
+    # Abort if the project directory is missing so python3 is never launched from the wrong place
+    cd /root/luojunhui/LongArticlesJob || exit 1
+    source /root/miniconda3/etc/profile.d/conda.sh
+    conda activate tasks
+    nohup python3 update_mini_info_v2.py >> "${LOG_FILE}" 2>&1 &
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - successfully restarted update_mini_info_v2.py"
+fi

+ 28 - 25
tasks/update_published_articles_minigram_detail.py

@@ -25,29 +25,28 @@ functions = Functions()
 
 TASK_NAME = "updateMinigramInfoDaily"
 ARTICLE_TABLE = "official_articles_v2"
-DETAIL_TABLE = "long_articles_detail_info_dev"
+DETAIL_TABLE = "long_articles_detail_info"
 EMPTY_LIST = []
+EMPTY_DICT = {}
 
 
-def get_root_source_id_list(mini_program: List[Dict]) -> List[str]:
+def extract_path(path: str) -> Dict[str, str]:
     """
-    校验是否存在文章是否存在root_source_id
+    Extract the video id and root source id from a mini program path.
+
+    Both ids are read from the URL-encoded ``jumpPage`` query parameter.
+    :param path: mini program path whose query string carries ``jumpPage``
     :return:
+        ``{"video_id": ..., "root_source_id": ...}``, or an empty dict when
+        ``jumpPage`` or either id is missing
     """
-    root_source_id_list = []
-    for item in mini_program:
-        path = item['path']
-        # 解析主URL的查询参数
-        params = parse_qs(urlparse(path).query)
-        # 提取 'jumpPage' 参数的值并解析它的查询参数
-        jump_page = params.get('jumpPage', [None])[0]
-        if jump_page:
-            params2 = parse_qs(jump_page)
-            # 提取 'rootSourceId' 参数的值
-            root_source_id = params2.get('rootSourceId', [None])[0]
-            if root_source_id:
-                root_source_id_list.append(root_source_id)
-    return root_source_id_list
+    params = parse_qs(urlparse(path).query)
+    jump_page = params.get('jumpPage', [None])[0]
+    if not jump_page:
+        # return a fresh dict so callers never share (and mutate) a module-level one
+        return {}
+    # jump_page looks like "pages/user-videos?id=..&rootSourceId=..", so
+    # parse_qs keeps the page prefix as part of the first key
+    jump_params = parse_qs(jump_page)
+    video_id = jump_params.get('pages/user-videos?id', [None])[0]
+    root_source_id = jump_params.get('rootSourceId', [None])[0]
+    if not (video_id and root_source_id):
+        return {}
+    return {
+        "video_id": video_id,
+        "root_source_id": root_source_id,
+    }
 
 
 def get_article_mini_program_info(content_url: str) -> List[Dict]:
@@ -105,9 +104,11 @@ class UpdatePublishedArticlesMinigramDetail(object):
         :return:
         """
         sql = f"""
-        SELECT ContentUrl, wx_sn 
-        FROM {ARTICLE_TABLE}
-        WHERE publish_timestamp IN {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};"""
+            SELECT ContentUrl, wx_sn 
+            FROM {ARTICLE_TABLE}
+            WHERE publish_timestamp IN {(const.DEFAULT_STATUS, const.REQUEST_FAIL_STATUS)};
+        """
+
         response = self.piaoquan_crawler_db_client.fetch(sql, cursor_type=DictCursor)
         return response
 
@@ -127,7 +128,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
         if result:
             return result[0]
         else:
-            return {}
+            return EMPTY_DICT
 
     def get_articles_published_yesterday(self, biz_date: str) -> List[Dict]:
         """
@@ -206,8 +207,10 @@ class UpdatePublishedArticlesMinigramDetail(object):
                     for video_index, mini_item in enumerate(article_mini_program_detail, 1):
                         image_url = mini_item['image_url']
                         nick_name = mini_item['nike_name']
-                        root_source_id = mini_item['path'].split("rootSourceId%3D")[-1]
-                        video_id = mini_item['path'].split("videos%3Fid%3D")[1].split("%26su%3D")[0]
+                        # extract video id and root_source_id
+                        id_info = extract_path(mini_item['path'])
+                        root_source_id = id_info['root_source_id']
+                        video_id = id_info['video_id']
                         kimi_title = mini_item['title']
                         self.insert_each_root_source_id(
                             wx_sn=wx_sn,
@@ -220,7 +223,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
                             publish_dt=publish_date.strftime('%Y-%m-%d'),
                             recall_dt=date_str
                         )
-                return {}
+                return EMPTY_DICT
             except Exception as e:
                 error_msg = traceback.format_exc()
                 log(
@@ -323,7 +326,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
 
     def update_mini_program_detail_job(self, biz_date=None):
         """
-        更新裂变信息
+        update mini program detail info
         :param biz_date:
         :return:
         """

+ 33 - 4
update_mini_info_v2.py

@@ -1,11 +1,40 @@
 """
 @author: luojunhui
 """
+import traceback
+
+from argparse import ArgumentParser
+
+from applications import bot
 from tasks.update_published_articles_minigram_detail import UpdatePublishedArticlesMinigramDetail
 
 
-U = UpdatePublishedArticlesMinigramDetail()
-U.init_database()
+def main():
+    """
+    Entry point for updating mini program detail info.
+
+    With ``--run-date`` both jobs are called with that date as ``biz_date``;
+    without it both jobs are called with no date argument.
+    If constructing the task or ``init_database`` raises, the error is
+    reported through ``bot`` and no job is run.
+    :return:
+    """
+    parser = ArgumentParser()
+    # argparse %-formats help text, so literal percent signs must be doubled
+    parser.add_argument("--run-date",
+                        help="Run only once for date in format of %%Y-%%m-%%d. \
+                            If no specified, run as daily jobs.")
+    args = parser.parse_args()
 
-# U.update_published_articles_job()
-U.update_mini_program_detail_job()
+    try:
+        task = UpdatePublishedArticlesMinigramDetail()
+        task.init_database()
+    except Exception as e:
+        bot(
+            title='update mini program detail init database error',
+            detail={
+                "error": str(e),
+                "traceback": traceback.format_exc()
+            }
+        )
+        return
+    if args.run_date:
+        task.update_published_articles_job(biz_date=args.run_date)
+        task.update_mini_program_detail_job(biz_date=args.run_date)
+    else:
+        task.update_published_articles_job()
+        task.update_mini_program_detail_job()
+
+
+# Without this guard the script only defines main() and exits without running any job
+if __name__ == '__main__':
+    main()