luojunhui 2 weeks ago
parent commit 82290ef6ee

+ 2 - 1
coldStartTasks/crawler/sohu/__init__.py

@@ -1,2 +1,3 @@
 from .get_recommedation import get_recommendation_video_list
-from.get_user_homepage import get_user_homepage_videos
+from .get_user_homepage import get_user_homepage_videos
+from .get_hot_point import get_hot_point_videos

+ 93 - 0
coldStartTasks/crawler/sohu/get_hot_point.py

@@ -0,0 +1,93 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import generate_random_digits
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_hot_point_videos():
+    url = "https://odin.sohu.com/odin/api/blockdata"
+    payload = json.dumps(
+        {
+            "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+            "pageId": f"{get_ms_timestamp()}_{generate_random_digits(11)}_{generate_random_strings(3)}",
+            "mainContent": {
+                "productType": "13",
+                "productId": "1650",
+                "secureScore": "50",
+                "categoryId": "47",
+                "adTags": "11111111",
+                "authorId": 121135924,
+            },
+            "resourceList": [
+                {
+                    "tplCompKey": "tpl-card-feed-pc-data",
+                    "isServerRender": False,
+                    "isSingleAd": False,
+                    "configSource": "mp",
+                    "content": {
+                        "productId": "449975",
+                        "productType": "15",
+                        "size": 200,
+                        "pro": "0,1,3,4,5",
+                        "feedType": "XTOPIC_SYNTHETICAL",
+                        "view": "feedMode",
+                        "innerTag": "news-slice",
+                        "spm": "smpc.channel_262.tpl-card-feed-pc",
+                        "page": 1,
+                        "requestId": f"{get_ms_timestamp()}{generate_random_strings(7)}_1650",
+                    },
+                    "adInfo": {
+                        "posCode": 10069,
+                        "rule": 2,
+                        "turn": 5,
+                        "number": 1,
+                        "begin": 6,
+                        "mergeType": 0,
+                    },
+                    "context": {"mkey": ""},
+                }
+            ],
+        }
+    )
+    headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh,zh-CN;q=0.9",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://www.sohu.com",
+        "Referer": "https://www.sohu.com/xchannel/TURBd01EQXhOalV3",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_point_videos",
+            message=f"API request failed: {e}",
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_point_videos",
+            message=f"Response parsing failed: {e}",
+        )
+    return None
+
+
+# if __name__ == "__main__":
+#     res = get_hot_point_videos()
+#     hot_point_videos = res["data"]["tpl-card-feed-pc-data"]["list"]
+#     for index, item in enumerate(hot_point_videos):
+#         print(index, item["title"])
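For context on the @retry(**retry_desc) decorator above: a minimal sketch, assuming request_retry simply packs tenacity keyword arguments to be splatted into the decorator (the real helper lives in applications.utils and may differ):

    from tenacity import stop_after_attempt, wait_exponential

    def request_retry(retry_times: int, min_retry_delay: int, max_retry_delay: int) -> dict:
        # Hypothetical reconstruction: kwargs meant for @retry(**...).
        return {
            "stop": stop_after_attempt(retry_times),                             # give up after N attempts
            "wait": wait_exponential(min=min_retry_delay, max=max_retry_delay),  # back off between tries
            "reraise": True,                                                      # surface the last exception
        }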

+ 6 - 6
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -59,7 +59,7 @@ def get_recommendation_video_list(seed_url, author_id, article_id, page):
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
     }
     try:
-        response = requests.post(url, headers=headers, data=payload)
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
         response.raise_for_status()
         return response.json()
     except requests.exceptions.RequestException as e:
@@ -80,11 +80,11 @@ def get_recommendation_video_list(seed_url, author_id, article_id, page):
 
 
 # usage example
-if __name__ == '__main__':
+if __name__ == "__main__":
     res = get_recommendation_video_list(
-        seed_url='https://www.sohu.com/a/877214751_121141867',
-        author_id='121141867',
-        article_id='877214751',
-        page=2
+        seed_url="https://www.sohu.com/a/877214751_121141867",
+        author_id="121141867",
+        article_id="877214751",
+        page=2,
     )
     print(json.dumps(res, indent=4, ensure_ascii=False))
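The only functional change in this file is routing the request through proxy(). A minimal sketch of what such a helper is assumed to return for requests (the real applications.utils.proxy likely pulls its endpoint and credentials from config):

    def proxy() -> dict:
        # Hypothetical placeholder values; requests expects a scheme -> proxy-URL mapping.
        endpoint = "http://user:password@proxy.example.com:8080"
        return {"http": endpoint, "https": endpoint}

Passing proxies=proxy() to requests.post then tunnels both HTTP and HTTPS traffic through that endpoint.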

+ 14 - 0
crawler_sohu_videos_task.py

@@ -0,0 +1,14 @@
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
+
+def main():
+    # step1, crawl sohu hot videos
+    crawler_sohu_hot_videos = CrawlerSohuHotVideos()
+    crawler_sohu_hot_videos.deal()
+
+    # step2, crawl sohu recommend videos
+    # crawler_sohu_recommend_videos = CrawlerSohuRecommendVideos()
+    # crawler_sohu_recommend_videos.deal()
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_account_videos.py


+ 0 - 113
tasks/crawler_tasks/crawler_video/crawler_sohu_recommend_videos.py

@@ -1,113 +0,0 @@
-from __future__ import annotations
-
-import time
-import traceback
-
-from pymysql.cursors import DictCursor
-from tqdm import tqdm
-
-from applications.db import DatabaseConnector
-from applications.pipeline import scrape_video_entities_process
-from applications.utils import Item
-from applications.utils import str_to_md5
-from applications.utils import insert_into_single_video_source_table
-from config import long_articles_config
-from coldStartTasks.crawler.sohu import get_recommendation_video_list
-
-
-class CrawlerSohuRecommendVideos:
-    def __init__(self):
-        self.db_client = DatabaseConnector(long_articles_config)
-        self.db_client.connect()
-        self.platform = 'sohu'
-
-    def fetch_seed_videos(self) -> list[dict]:
-        """
-        get seed videos from database
-        """
-        fetch_query = f"""
-            select id, out_account_id,  url_unique_md5, article_title, score
-            from publish_single_video_source 
-            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
-        """
-        seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
-        return seed_videos
-
-    def crawler_each_video(self, video_data):
-        """
-        crawler each video data
-        """
-        video_item = Item()
-        unique_id = f"{self.platform}-{video_data['id']}"
-
-        # add info into item
-        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
-        video_item.add("url_unique_md5", video_data["id"])
-        video_item.add("article_title", video_data["title"])
-        video_item.add("out_account_id", video_data['authorId'])
-        video_item.add("out_account_name", video_data["authorName"])
-        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
-        video_item.add("platform", self.platform)
-        video_item.add("article_url", video_data["videoUrl"])
-        video_item.add("source_account", 0)
-        video_item.add("crawler_timestamp", int(time.time()))
-
-        # check item before insert
-        video_item.check(source="video")
-        try:
-            item_with_oss_path = scrape_video_entities_process(
-                video_item=video_item.item, db_client=self.db_client
-            )
-            if item_with_oss_path:
-                insert_into_single_video_source_table(
-                    db_client=self.db_client, video_item=item_with_oss_path
-                )
-        except Exception as e:
-            detail = {
-                "video_item": video_item.item,
-                "error": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            print(detail)
-
-    def get_each_video_recommendation(self, seed_video: dict) -> None:
-        """
-        get each video recommendation
-        """
-        author_id = seed_video["out_account_id"]
-        article_id = seed_video["url_unique_md5"]
-        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
-        page_list = [i for i in range(1, 8)]
-        for page in page_list:
-            try:
-                response = get_recommendation_video_list(outside_url, author_id, article_id, page)
-                if response:
-                    video_list = response['data']['recommendVideoFeed']['list']
-                    for video in tqdm(video_list, desc=f"page: {page}"):
-                        self.crawler_each_video(video)
-
-            except Exception as e:
-                print(e)
-                print(traceback.format_exc())
-                continue
-
-    def update_seed_video_status(self, task_id: int) -> int:
-        """
-        update seed video status
-        """
-        update_query = f"""
-            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
-        """
-        return self.db_client.save(
-            query=update_query,
-            params=(1, task_id, 0)
-        )
-
-    def deal(self):
-        task_list = self.fetch_seed_videos()
-        for task in tqdm(task_list):
-            try:
-                self.get_each_video_recommendation(task)
-                self.update_seed_video_status(task_id=task["id"])
-            except Exception as e:
-                print(e)

+ 232 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_videos.py

@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+
+from coldStartTasks.crawler.sohu import get_hot_point_videos
+from coldStartTasks.crawler.sohu import get_recommendation_video_list
+from coldStartTasks.crawler.sohu import get_user_homepage_videos
+
+from config import long_articles_config
+
+
+class CrawlerSohuVideos:
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+        self.platform = "sohu"
+
+    def crawler_each_video(self, video_data):
+        """
+        crawler each video data
+        """
+        video_item = Item()
+        unique_id = f"{self.platform}-{video_data['id']}"
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
+        video_item.add("url_unique_md5", video_data["id"])
+        video_item.add("article_title", video_data["title"])
+        video_item.add("out_account_id", video_data["authorId"])
+        video_item.add("out_account_name", video_data["authorName"])
+        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
+        video_item.add("platform", self.platform)
+        video_item.add("article_url", video_data["videoUrl"])
+        video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    db_client=self.db_client, video_item=item_with_oss_path
+                )
+        except Exception as e:
+            detail = {
+                "video_item": video_item.item,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            print(detail)
+
+
+class CrawlerSohuHotVideos(CrawlerSohuVideos):
+    def deal(self):
+        """
+        crawler sohu hot videos every day
+        """
+        hot_point_video_response = get_hot_point_videos()
+        hot_point_video_list = hot_point_video_response["data"][
+            "tpl-card-feed-pc-data"
+        ]["list"]
+        for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
+            try:
+                self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_hot_videos",
+                    message="crawler_sohu_hot_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+
+class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
+
+    def fetch_seed_videos(self) -> list[dict]:
+        """
+        get seed videos from database
+        """
+        fetch_query = f"""
+            select id, out_account_id,  url_unique_md5, article_title, score
+            from publish_single_video_source 
+            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
+        """
+        seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return seed_videos
+
+    def get_each_video_recommendation(self, seed_video: dict) -> None:
+        """
+        get each video recommendation
+        """
+        author_id = seed_video["out_account_id"]
+        article_id = seed_video["url_unique_md5"]
+        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
+        page_list = [i for i in range(1, 8)]
+        for page in page_list:
+            try:
+                response = get_recommendation_video_list(
+                    outside_url, author_id, article_id, page
+                )
+                if response:
+                    video_list = response["data"]["recommendVideoFeed"]["list"]
+                    for video in tqdm(video_list, desc=f"page: {page}"):
+                        self.crawler_each_video(video)
+
+            except Exception as e:
+                print(e)
+                print(traceback.format_exc())
+                continue
+
+    def update_seed_video_status(self, task_id: int) -> int:
+        """
+        update seed video status
+        """
+        update_query = f"""
+            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
+        """
+        return self.db_client.save(query=update_query, params=(1, task_id, 0))
+
+    def deal(self):
+        task_list = self.fetch_seed_videos()
+        for task in tqdm(task_list):
+            try:
+                self.get_each_video_recommendation(task)
+                self.update_seed_video_status(task_id=task["id"])
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_recommend_videos",
+                    message="crawler_sohu_recommend_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": task,
+                    },
+                )
+
+
+class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
+    def get_author_list(self):
+        """
+        get author list (currently hard-coded; not yet fetched from the database)
+        """
+        return [121644888]
+
+    def process_each_page(self, response: dict):
+        """
+        process each page
+        """
+        video_list = response["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"][
+            "list"
+        ]
+        for video in tqdm(video_list, desc="crawler sohu user page videos"):
+            try:
+                self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="process_each_page",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+    def get_each_user_videos(self, author_id: int):
+        """
+        get each user videos
+        """
+        page_list = [i for i in range(1, 2)]
+        for page in page_list:
+            try:
+                response = get_user_homepage_videos(author_id, page)
+                self.process_each_page(response)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_user_videos",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_id": author_id,
+                        "page": page,
+                    },
+                )
+
+    def deal(self):
+        author_list = self.get_author_list()
+        for author_id in tqdm(author_list, desc="crawler sohu user videos"):
+            try:
+                self.get_each_user_videos(author_id)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_user_videos",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_od": author_id,
+                    },
+                )