Przeglądaj źródła

票圈视频接入每日抓取流程

luojunhui 2 miesięcy temu
rodzic
commit
9952c8ea8c

+ 1 - 0
applications/api/__init__.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 from .aigc_system_api import AigcSystemApi
+from .apollo_api import ApolloApi
 from .deep_seek_api_by_volcanoengine import fetch_deepseek_response
 from .moon_shot_api import fetch_moon_shot_response
 from .nlp_api import similarity_between_title_list

+ 24 - 0
applications/api/apollo_api.py

@@ -0,0 +1,24 @@
+import pyapollos
+
+
+class ApolloApi:  # wraps a pyapollos.ApolloClient bound to one app_id and one environment's config server
+    def __init__(self, app_id="LongArticlesJob", env="pre"):  # env picks the config server URL below; default is "pre"
+        match env:  # only the three literal values below are accepted
+            case "pre":
+                config_server_url = 'https://preapolloconfig-internal.piaoquantv.com/'
+            case "dev":
+                config_server_url = 'https://devapolloconfig-internal.piaoquantv.com/'
+            case "prod":
+                config_server_url = 'https://apolloconfig-internal.piaoquantv.com/'
+            case _:  # any other value is rejected before a client is created
+                raise ValueError("env must be 'pre' or 'dev' or 'prod'")
+
+        self.apollo_connection = pyapollos.ApolloClient(  # client handle reused by get_config_value
+            app_id=app_id,
+            config_server_url=config_server_url,
+            timeout=10  # forwarded to pyapollos as-is; presumably seconds — TODO confirm against pyapollos
+        )
+
+    def get_config_value(self, key):  # delegates to the client's get_value(key) and returns its result unchanged
+        return self.apollo_connection.get_value(key)
+

+ 0 - 2
applications/pipeline/crawler_pipeline.py

@@ -74,8 +74,6 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
         case "sohu":
             video_path = download_sohu_video(article_url)
         case "piaoquan":
-            oss_path = ""
-            video_item["video_oss_path"] = oss_path
             return video_item
         case _:
             return empty_dict

+ 2 - 0
applications/utils/item.py

@@ -24,6 +24,8 @@ default_single_video_table_fields = {
     "bad_status": 0,
     "tags": None,
     "video_oss_path": None,
+    "audit_status": 0,
+    "category_status": 0
 }
 
 

+ 4 - 2
applications/utils/save_to_db.py

@@ -12,9 +12,9 @@ def insert_into_single_video_source_table(db_client, video_item):
     """
     insert_sql = f"""
         INSERT INTO publish_single_video_source
-        (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account)
+        (content_trace_id, article_title, out_account_id, out_account_name, read_cnt, like_cnt, article_url, cover_url, video_oss_path, publish_timestamp, crawler_timestamp, url_unique_md5, category, tags, platform, source_account, category_status, audit_status)
         values
-        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
     """
     try:
         db_client.save(
@@ -36,6 +36,8 @@ def insert_into_single_video_source_table(db_client, video_item):
                 video_item["tags"],
                 video_item["platform"],
                 video_item["source_account"],
+                video_item["category_status"],
+                video_item["audit_status"]
             ),
         )
     except Exception as e:

+ 8 - 25
tasks/crawler_tasks/crawler_video/crawler_piaoquan_videos.py

@@ -8,6 +8,7 @@ from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
+from applications.api import ApolloApi
 from applications.api import fetch_piaoquan_video_list_detail
 from applications.const.crawler_video_const import CrawlerPiaoQuanVideosConst
 from applications.db import DatabaseConnector
@@ -19,29 +20,8 @@ from applications.utils import insert_into_single_video_source_table
 from config import long_articles_config
 
 const = CrawlerPiaoQuanVideosConst()
-
-category_map = {
-    "知识科普": "知识科普",
-    "生活技巧科普": "知识科普",
-    "老年相关法律科普": "知识科普",
-    "中国战争史": "军事历史",
-    "中国历史影像": "军事历史",
-    "正能量剧情": "家长里短",
-    "人财诈骗": "社会法治",
-    "贪污腐败": "社会法治",
-    "罕见画面": "奇闻趣事",
-    "惊奇事件": "奇闻趣事",
-    "动物萌宠": "奇闻趣事",
-    "老明星": "名人八卦",
-    "健康知识": "健康养生",
-    "饮食健康": "健康养生",
-    "人生忠告": "情感故事",
-    "老年生活": "情感故事",
-    "国际军事": "政治新闻",
-    "他国政策": "政治新闻",
-    "国际时政": "政治新闻",
-    "历史名人": "历史人物",
-}
+apollo_api = ApolloApi()
+pq_long_articles_category_mapping = json.loads(apollo_api.get_config_value("pq_long_articles_category_mapping"))
 
 
 class CrawlerPiaoQuanVideos:
@@ -105,9 +85,12 @@ class CrawlerPiaoQuanVideos:
         )
         video_item.add("source_account", const.NO_SOURCE_ACCOUNT)
         video_item.add("crawler_timestamp", int(time.time()))
-        video_item.add("oss_path", video_detail["ossVideoPath"])
+        video_item.add("video_oss_path", video_detail["ossVideoPath"])
         video_item.add("audit_status", video_detail["auditStatus"])
-        video_item.add("category", category_map.get(video_data["category"]))
+        category = pq_long_articles_category_mapping.get(video_data["category"])
+        if category:
+            video_item.add("category", category)
+            video_item.add("category_status", const.SUCCESS_STATUS)
 
         # check item before insert
         video_item.check(source="video")