luojunhui 1 هفته پیش
والد
کامیت
8a8fdc7462

+ 2 - 3
applications/pipeline/crawler_pipeline.py

@@ -7,6 +7,7 @@ import json
 
 from applications import log
 
+from applications.utils import download_sohu_video
 from applications.utils import download_gzh_video
 from applications.utils import download_toutiao_video
 from applications.utils import upload_to_oss
@@ -54,12 +55,10 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
     video_title = video_item["article_title"]
     # whether title sensitive
     if whether_title_sensitive(video_title):
-        print("title is sensitive")
         return empty_dict
 
     # whether duplicate video title
     if whether_duplicate_video_title(video_title, db_client):
-        print("duplicate video title")
         return empty_dict
 
     # download video
@@ -73,7 +72,7 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
         case "sph":
             video_path = ""
         case "sohu":
-            video_path = download_toutiao_video(article_url)
+            video_path = download_sohu_video(article_url)
         case _:
             return empty_dict
 

+ 1 - 0
applications/utils/__init__.py

@@ -6,6 +6,7 @@ from .cold_start import get_inner_account_set
 from .common import *
 from .download_video import download_gzh_video
 from .download_video import download_sph_video
+from .download_video import download_sohu_video
 from .download_video import download_toutiao_video
 from .item import Item
 from .save_to_db import insert_into_single_video_source_table

+ 13 - 0
applications/utils/download_video.py

@@ -154,3 +154,16 @@ def download_toutiao_video(video_url: str) -> str:
 
     return save_path
 
+
+def download_sohu_video(video_url: str) -> str:  # fetch video_url over HTTP and store the body as a local .mp4; returns that local path
+    """
+    download sohu video
+    """
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))  # file name comes from str_to_md5(video_url); helper is defined outside this hunk
+    response = requests.get(video_url, headers=headers, stream=True)  # streamed GET; `headers` is defined outside this hunk. NOTE(review): no timeout is passed and the status code is never checked, so an error response body would be written to the .mp4 - confirm intended
+    with open(save_path, "wb") as f:  # binary write; assumes the relative "static/" directory already exists - TODO confirm
+        for chunk in response.iter_content(chunk_size=8192):  # copy the body in pieces of up to 8192 bytes
+            if chunk:  # skip empty chunks
+                f.write(chunk)
+
+    return save_path  # always the local path, whatever the HTTP status was

+ 9 - 9
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -79,12 +79,12 @@ def get_recommendation_video_list(seed_url, author_id, article_id, page):
     return None
 
 
-# usage example
-if __name__ == "__main__":
-    res = get_recommendation_video_list(
-        seed_url="https://www.sohu.com/a/877214751_121141867",
-        author_id="121141867",
-        article_id="877214751",
-        page=2,
-    )
-    print(json.dumps(res, indent=4, ensure_ascii=False))
+# # usage example
+# if __name__ == "__main__":
+#     res = get_recommendation_video_list(
+#         seed_url="https://www.sohu.com/a/877214751_121141867",
+#         author_id="121141867",
+#         article_id="877214751",
+#         page=2,
+#     )
+#     print(json.dumps(res, indent=4, ensure_ascii=False))

+ 3 - 33
coldStartTasks/publish/publish_single_video_pool_videos.py

@@ -13,34 +13,7 @@ from config import long_articles_config, apolloConfig
 config = apolloConfig()
 const = SingleVideoPoolPublishTaskConst()
 
-# video_pool_config = json.loads(config.getConfigValue(key="video_pool_publish_config"))
-video_pool_config = {
-    "sph": {
-        "nick_name": "视频号",
-        "process_num_each_day": 218,
-        "generate_plan_id": "20250325025917853810062"
-    },
-    "gzh": {
-        "nick_name": "公众号",
-        "process_num_each_day": 201,
-        "generate_plan_id": "20250324132413116896899"
-    },
-    "toutiao": {
-        "nick_name": "头条号",
-        "process_num_each_day": 411,
-        "generate_plan_id": "20250324132226090387919"
-    },
-    "hksp": {
-        "nick_name": "好看视频",
-        "process_num_each_day": 165,
-        "generate_plan_id": "20250325025446821867933"
-    },
-    "sohu": {
-        "nick_name": "搜狐",
-        "process_num_each_day": 100,
-        "generate_plan_id": "20250409083938381788492"
-    }
-}
+video_pool_config = json.loads(config.getConfigValue(key="video_pool_publish_config"))
 
 
 class PublishSingleVideoPoolVideos:
@@ -79,7 +52,7 @@ class PublishSingleVideoPoolVideos:
         """
         entrance of this class
         """
-        platform_list = ["sohu"]
+        platform_list = ["sph", "gzh", "toutiao", "hksp", "sohu"]
         for platform in tqdm(platform_list, desc='process each platform'):
             task_list = self.get_task_list(platform)
             task_id_tuple = tuple([task['id'] for task in task_list])
@@ -141,7 +114,4 @@ class PublishSingleVideoPoolVideos:
                         'msg': '该平台无待发布视频,请关注供给的抓取'
                     },
                     mention=False
-                )
-
-if __name__ == '__main__':
-    PublishSingleVideoPoolVideos().deal()
+                )

+ 1 - 2
coldStartTasks/publish/publish_video_to_pq_for_audit.py

@@ -43,8 +43,7 @@ class PublishVideosForAudit(object):
         sql = f"""
             SELECT id, article_title, video_oss_path 
             FROM publish_single_video_source 
-            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS} and platform = 'sohu'
-                and score > 0.5
+            WHERE audit_status = {const.VIDEO_AUDIT_INIT_STATUS} and bad_status = {const.TITLE_DEFAULT_STATUS}
             ORDER BY score DESC
             LIMIT {limit_count};
             """