luojunhui 3 months ago
parent
commit
74988bfe9a
1 changed files with 58 additions and 23 deletions
  1. 58 23
      tasks/update_published_articles_minigram_detail.py

+ 58 - 23
tasks/update_published_articles_minigram_detail.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+import json
 import traceback
 
 from datetime import datetime, timedelta
@@ -49,24 +50,6 @@ def extract_path(path: str) -> Dict:
         return EMPTY_DICT
 
 
-def get_article_mini_program_info(content_url: str) -> List[Dict]:
-    """
-    获取文章的小程序信息
-    :return:
-    """
-    try:
-        article_detail = spider.get_article_text(content_url)
-    except Exception as e:
-        raise SpiderError(error=e, spider="detail", url=content_url)
-
-    response_code = article_detail['code']
-    if response_code == const.ARTICLE_SUCCESS_CODE:
-        mini_info = article_detail['data']['data']['mini_program']
-        return mini_info
-    else:
-        return EMPTY_LIST
-
-
 class UpdatePublishedArticlesMinigramDetail(object):
     """
     更新已发布文章数据
@@ -136,7 +119,7 @@ class UpdatePublishedArticlesMinigramDetail(object):
         :return:
         """
         sql = f"""
-             SELECT ContentUrl, wx_sn, publish_timestamp, accountName, title
+             SELECT ContentUrl, wx_sn, publish_timestamp, accountName, title, root_source_id_list
              FROM official_articles_v2
              WHERE FROM_UNIXTIME(publish_timestamp)
              BETWEEN DATE_SUB('{biz_date}', INTERVAL 1 DAY) AND DATE_SUB('{biz_date}', INTERVAL 1 SECOND);
@@ -181,8 +164,9 @@ class UpdatePublishedArticlesMinigramDetail(object):
         url = article_info['ContentUrl']
         publish_timestamp = article_info['publish_timestamp']
         wx_sn = article_info['wx_sn'].decode()
+        root_source_id_list = json.loads(article_info['root_source_id_list'])
 
-        article_mini_program_detail = get_article_mini_program_info(url)
+        article_mini_program_detail = self.get_article_mini_program_info(url, root_source_id_list)
         if article_mini_program_detail:
             log(
                 task=TASK_NAME,
@@ -208,9 +192,13 @@ class UpdatePublishedArticlesMinigramDetail(object):
                         image_url = mini_item['image_url']
                         nick_name = mini_item['nike_name']
                         # extract video id and root_source_id
-                        id_info = extract_path(mini_item['path'])
-                        root_source_id = id_info['root_source_id']
-                        video_id = id_info['video_id']
+                        if mini_item.get("root_source_id") and mini_item.get("video_id"):
+                            root_source_id = mini_item['root_source_id']
+                            video_id = mini_item['video_id']
+                        else:
+                            id_info = extract_path(mini_item['path'])
+                            root_source_id = id_info['root_source_id']
+                            video_id = id_info['video_id']
                         kimi_title = mini_item['title']
                         self.insert_each_root_source_id(
                             wx_sn=wx_sn,
@@ -237,6 +225,53 @@ class UpdatePublishedArticlesMinigramDetail(object):
         else:
             return EMPTY_DICT
 
+    def get_article_mini_program_info(self, content_url: str, root_source_id_list: list) -> List[Dict]:
+        """
+        Return the mini program cards of an article: built from long_articles_root_source_id
+        rows when root_source_id_list matches any, otherwise fetched through the spider.
+        :param content_url: article url handed to spider.get_article_text
+        :param root_source_id_list: root_source_id values already known for the article
+        :return: list of mini program dicts; EMPTY_LIST on a non-success spider code
+        :raises SpiderError: when the spider call raises
+        """
+        if root_source_id_list:
+            # root_source_ids are already known, so look their videos up in the database
+            fetch_sql = """
+                select video_id, root_source_id
+                from long_articles_root_source_id
+                where root_source_id in %s;
+            """
+            fetch_response = self.long_articles_db_client.fetch(
+                query=fetch_sql,
+                params=(tuple(root_source_id_list),),
+                cursor_type=DictCursor
+            )
+            if fetch_response:
+                # fresh list per call: appending to the shared EMPTY_LIST would leak cards across calls
+                return [
+                    {
+                        "app_id": "wx89e7eb06478361d7",
+                        "avatar": "https://rescdn.yishihui.com/0temp/logo.png",
+                        "image_url": "",
+                        "nike_name": "票圈 l 3亿人喜欢的视频平台",
+                        "root_source_id": item['root_source_id'],
+                        "video_id": item['video_id'],
+                        "service_type": "0",
+                        "title": "",
+                        "type": "card"
+                    }
+                    for item in fetch_response
+                ]
+
+        try:
+            article_detail = spider.get_article_text(content_url)
+        except Exception as e:
+            raise SpiderError(error=e, spider="detail", url=content_url) from e
+
+        if article_detail['code'] == const.ARTICLE_SUCCESS_CODE:
+            return article_detail['data']['data']['mini_program']
+        return EMPTY_LIST
+
     def get_root_source_id_for_three_days(self, biz_date: str) -> List[Dict]:
         """
         获取publish_dt在 biz_date前三天的root_source_id