luojunhui 2 weeks ago
parent commit 82290ef6ee

+ 2 - 1
coldStartTasks/crawler/sohu/__init__.py

@@ -1,2 +1,3 @@
 from .get_recommedation import get_recommendation_video_list
-from.get_user_homepage import get_user_homepage_videos
+from .get_user_homepage import get_user_homepage_videos
+from .get_hot_point import get_hot_point_videos

+ 93 - 0
coldStartTasks/crawler/sohu/get_hot_point.py

@@ -0,0 +1,93 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import generate_random_digits
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_hot_point_videos():
+    url = "https://odin.sohu.com/odin/api/blockdata"
+    payload = json.dumps(
+        {
+            "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+            "pageId": f"{get_ms_timestamp()}_{generate_random_digits(11)}_{generate_random_strings(3)}",
+            "mainContent": {
+                "productType": "13",
+                "productId": "1650",
+                "secureScore": "50",
+                "categoryId": "47",
+                "adTags": "11111111",
+                "authorId": 121135924,
+            },
+            "resourceList": [
+                {
+                    "tplCompKey": "tpl-card-feed-pc-data",
+                    "isServerRender": False,
+                    "isSingleAd": False,
+                    "configSource": "mp",
+                    "content": {
+                        "productId": "449975",
+                        "productType": "15",
+                        "size": 200,
+                        "pro": "0,1,3,4,5",
+                        "feedType": "XTOPIC_SYNTHETICAL",
+                        "view": "feedMode",
+                        "innerTag": "news-slice",
+                        "spm": "smpc.channel_262.tpl-card-feed-pc",
+                        "page": 1,
+                        "requestId": f"{get_ms_timestamp()}{generate_random_strings(7)}_1650",
+                    },
+                    "adInfo": {
+                        "posCode": 10069,
+                        "rule": 2,
+                        "turn": 5,
+                        "number": 1,
+                        "begin": 6,
+                        "mergeType": 0,
+                    },
+                    "context": {"mkey": ""},
+                }
+            ],
+        }
+    )
+    headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh,zh-CN;q=0.9",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://www.sohu.com",
+        "Referer": "https://www.sohu.com/xchannel/TURBd01EQXhOalV3",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_point_videos",
+            message=f"API request failed: {e}",
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_point_videos",
+            message=f"Response parsing failed: {e}",
+        )
+    return None
+
+
+# if __name__ == "__main__":
+#     res = get_hot_point_videos()
+#     hot_point_videos = res["data"]["tpl-card-feed-pc-data"]["list"]
+#     for index, item in enumerate(hot_point_videos):
+#         print(index, item["title"])
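For context on the @retry(**retry_desc) decorator above: a minimal sketch, assuming request_retry simply packs tenacity keyword arguments to be splatted into the decorator (the real helper lives in applications.utils and may differ):

    from tenacity import stop_after_attempt, wait_exponential

    def request_retry(retry_times: int, min_retry_delay: int, max_retry_delay: int) -> dict:
        # Hypothetical reconstruction: kwargs meant for @retry(**...).
        return {
            "stop": stop_after_attempt(retry_times),                             # give up after N attempts
            "wait": wait_exponential(min=min_retry_delay, max=max_retry_delay),  # back off between tries
            "reraise": True,                                                      # surface the last exception
        }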

+ 6 - 6
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -59,7 +59,7 @@ def get_recommendation_video_list(seed_url, author_id, article_id, page):
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
     }
     try:
-        response = requests.post(url, headers=headers, data=payload)
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
         response.raise_for_status()
         return response.json()
     except requests.exceptions.RequestException as e:
@@ -80,11 +80,11 @@ def get_recommendation_video_list(seed_url, author_id, article_id, page):
 
 
 # usage example
-if __name__ == '__main__':
+if __name__ == "__main__":
     res = get_recommendation_video_list(
-        seed_url='https://www.sohu.com/a/877214751_121141867',
-        author_id='121141867',
-        article_id='877214751',
-        page=2
+        seed_url="https://www.sohu.com/a/877214751_121141867",
+        author_id="121141867",
+        article_id="877214751",
+        page=2,
     )
     print(json.dumps(res, indent=4, ensure_ascii=False))
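The only functional change in this file is routing the request through proxy(). A minimal sketch of what such a helper is assumed to return for requests (the real applications.utils.proxy likely pulls its endpoint and credentials from config):

    def proxy() -> dict:
        # Hypothetical placeholder values; requests expects a scheme -> proxy-URL mapping.
        endpoint = "http://user:password@proxy.example.com:8080"
        return {"http": endpoint, "https": endpoint}

Passing proxies=proxy() to requests.post then tunnels both HTTP and HTTPS traffic through that endpoint.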

+ 14 - 0
crawler_sohu_videos_task.py

@@ -0,0 +1,14 @@
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
+
+def main():
+    # step1, crawl sohu hot videos
+    crawler_sohu_hot_videos = CrawlerSohuHotVideos()
+    crawler_sohu_hot_videos.deal()
+
+    # step2, crawl sohu recommend videos
+    # crawler_sohu_recommend_videos = CrawlerSohuRecommendVideos()
+    # crawler_sohu_recommend_videos.deal()
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_account_videos.py


+ 0 - 113
tasks/crawler_tasks/crawler_video/crawler_sohu_recommend_videos.py

@@ -1,113 +0,0 @@
-from __future__ import annotations
-
-import time
-import traceback
-
-from pymysql.cursors import DictCursor
-from tqdm import tqdm
-
-from applications.db import DatabaseConnector
-from applications.pipeline import scrape_video_entities_process
-from applications.utils import Item
-from applications.utils import str_to_md5
-from applications.utils import insert_into_single_video_source_table
-from config import long_articles_config
-from coldStartTasks.crawler.sohu import get_recommendation_video_list
-
-
-class CrawlerSohuRecommendVideos:
-    def __init__(self):
-        self.db_client = DatabaseConnector(long_articles_config)
-        self.db_client.connect()
-        self.platform = 'sohu'
-
-    def fetch_seed_videos(self) -> list[dict]:
-        """
-        get seed videos from database
-        """
-        fetch_query = f"""
-            select id, out_account_id,  url_unique_md5, article_title, score
-            from publish_single_video_source 
-            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
-        """
-        seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
-        return seed_videos
-
-    def crawler_each_video(self, video_data):
-        """
-        crawler each video data
-        """
-        video_item = Item()
-        unique_id = f"{self.platform}-{video_data['id']}"
-
-        # add info into item
-        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
-        video_item.add("url_unique_md5", video_data["id"])
-        video_item.add("article_title", video_data["title"])
-        video_item.add("out_account_id", video_data['authorId'])
-        video_item.add("out_account_name", video_data["authorName"])
-        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
-        video_item.add("platform", self.platform)
-        video_item.add("article_url", video_data["videoUrl"])
-        video_item.add("source_account", 0)
-        video_item.add("crawler_timestamp", int(time.time()))
-
-        # check item before insert
-        video_item.check(source="video")
-        try:
-            item_with_oss_path = scrape_video_entities_process(
-                video_item=video_item.item, db_client=self.db_client
-            )
-            if item_with_oss_path:
-                insert_into_single_video_source_table(
-                    db_client=self.db_client, video_item=item_with_oss_path
-                )
-        except Exception as e:
-            detail = {
-                "video_item": video_item.item,
-                "error": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            print(detail)
-
-    def get_each_video_recommendation(self, seed_video: dict) -> None:
-        """
-        get each video recommendation
-        """
-        author_id = seed_video["out_account_id"]
-        article_id = seed_video["url_unique_md5"]
-        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
-        page_list = [i for i in range(1, 8)]
-        for page in page_list:
-            try:
-                response = get_recommendation_video_list(outside_url, author_id, article_id, page)
-                if response:
-                    video_list = response['data']['recommendVideoFeed']['list']
-                    for video in tqdm(video_list, desc=f"page: {page}"):
-                        self.crawler_each_video(video)
-
-            except Exception as e:
-                print(e)
-                print(traceback.format_exc())
-                continue
-
-    def update_seed_video_status(self, task_id: int) -> int:
-        """
-        update seed video status
-        """
-        update_query = f"""
-            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
-        """
-        return self.db_client.save(
-            query=update_query,
-            params=(1, task_id, 0)
-        )
-
-    def deal(self):
-        task_list = self.fetch_seed_videos()
-        for task in tqdm(task_list):
-            try:
-                self.get_each_video_recommendation(task)
-                self.update_seed_video_status(task_id=task["id"])
-            except Exception as e:
-                print(e)

+ 232 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_videos.py

@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+
+from coldStartTasks.crawler.sohu import get_hot_point_videos
+from coldStartTasks.crawler.sohu import get_recommendation_video_list
+from coldStartTasks.crawler.sohu import get_user_homepage_videos
+
+from config import long_articles_config
+
+
+class CrawlerSohuVideos:
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+        self.platform = "sohu"
+
+    def crawler_each_video(self, video_data):
+        """
+        crawler each video data
+        """
+        video_item = Item()
+        unique_id = f"{self.platform}-{video_data['id']}"
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
+        video_item.add("url_unique_md5", video_data["id"])
+        video_item.add("article_title", video_data["title"])
+        video_item.add("out_account_id", video_data["authorId"])
+        video_item.add("out_account_name", video_data["authorName"])
+        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
+        video_item.add("platform", self.platform)
+        video_item.add("article_url", video_data["videoUrl"])
+        video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    db_client=self.db_client, video_item=item_with_oss_path
+                )
+        except Exception as e:
+            detail = {
+                "video_item": video_item.item,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            print(detail)
+
+
+class CrawlerSohuHotVideos(CrawlerSohuVideos):
+    def deal(self):
+        """
+        crawler sohu hot videos every day
+        """
+        hot_point_video_response = get_hot_point_videos()
+        hot_point_video_list = hot_point_video_response["data"][
+            "tpl-card-feed-pc-data"
+        ]["list"]
+        for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
+            try:
+                self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_hot_videos",
+                    message="crawler_sohu_hot_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+
+class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
+
+    def fetch_seed_videos(self) -> list[dict]:
+        """
+        get seed videos from database
+        """
+        fetch_query = f"""
+            select id, out_account_id,  url_unique_md5, article_title, score
+            from publish_single_video_source 
+            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
+        """
+        seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return seed_videos
+
+    def get_each_video_recommendation(self, seed_video: dict) -> None:
+        """
+        get each video recommendation
+        """
+        author_id = seed_video["out_account_id"]
+        article_id = seed_video["url_unique_md5"]
+        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
+        page_list = [i for i in range(1, 8)]
+        for page in page_list:
+            try:
+                response = get_recommendation_video_list(
+                    outside_url, author_id, article_id, page
+                )
+                if response:
+                    video_list = response["data"]["recommendVideoFeed"]["list"]
+                    for video in tqdm(video_list, desc=f"page: {page}"):
+                        self.crawler_each_video(video)
+
+            except Exception as e:
+                print(e)
+                print(traceback.format_exc())
+                continue
+
+    def update_seed_video_status(self, task_id: int) -> int:
+        """
+        update seed video status
+        """
+        update_query = f"""
+            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
+        """
+        return self.db_client.save(query=update_query, params=(1, task_id, 0))
+
+    def deal(self):
+        task_list = self.fetch_seed_videos()
+        for task in tqdm(task_list):
+            try:
+                self.get_each_video_recommendation(task)
+                self.update_seed_video_status(task_id=task["id"])
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_recommend_videos",
+                    message="crawler_sohu_recommend_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": task,
+                    },
+                )
+
+
+class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
+    def get_author_list(self):
+        """
+        get author list (currently hard-coded; not yet fetched from the database)
+        """
+        return [121644888]
+
+    def process_each_page(self, response: dict):
+        """
+        process each page
+        """
+        video_list = response["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"][
+            "list"
+        ]
+        for video in tqdm(video_list, desc="crawler sohu user page videos"):
+            try:
+                self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="process_each_page",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+    def get_each_user_videos(self, author_id: int):
+        """
+        get each user videos
+        """
+        page_list = [i for i in range(1, 2)]
+        for page in page_list:
+            try:
+                response = get_user_homepage_videos(author_id, page)
+                self.process_each_page(response)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_user_videos",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_id": author_id,
+                        "page": page,
+                    },
+                )
+
+    def deal(self):
+        author_list = self.get_author_list()
+        for author_id in tqdm(author_list, desc="crawler sohu user videos"):
+            try:
+                self.get_each_user_videos(author_id)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_user_videos",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_od": author_id,
+                    },
+                )