luojunhui il y a 2 semaines
Parent
commit
b1c5f8b5bd

+ 2 - 0
coldStartTasks/crawler/sohu/__init__.py

@@ -0,0 +1,2 @@
+from .get_recommedation import get_recommendation_video_list
+from.get_user_homepage import get_user_homepage_videos

+ 27 - 0
coldStartTasks/crawler/sohu/basic.py

@@ -0,0 +1,27 @@
+import time
+import random
+import string
+
+
+def generate_random_strings(length):
+    """
+    build a random string of `length` chars picked from ascii letters and digits
+    """
+    alphabet = string.ascii_letters + string.digits
+    picked = random.choices(alphabet, k=length)
+    return "".join(picked)
+
+
+def get_ms_timestamp():
+    """
+    current unix time in whole milliseconds
+    """
+    return time.time_ns() // 1_000_000
+
+
+def generate_random_digits(length):
+    """
+    build a random string of `length` decimal digits
+    """
+    picked = random.choices(string.digits, k=length)
+    return "".join(picked)

+ 96 - 64
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -1,74 +1,106 @@
 import requests
 import requests
 import json
 import json
+from tenacity import retry
 
 
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
 
 
-def get_recommendation_video_list(page_id, page_size):
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_recommendation_video_list(seed_url, author_id, article_id, page):
+    """
+    fetch one page of the "recommendVideoFeed" block for a sohu article
+
+    :param seed_url: article page url, sent as the Referer header and logged on failure
+    :param author_id: sohu author id placed in mainContent.authorId
+    :param article_id: sohu article id placed in mainContent.articleId and in the mkey
+    :param page: page number of the recommendation feed (the request asks for size 24)
+    :return: decoded JSON response, or None when the request or the JSON decoding fails
+
+    NOTE(review): failures are logged and turned into None inside this function, so
+    @retry only helps if request_retry also retries on something other than these
+    exceptions -- confirm against applications.utils.request_retry.
+    """
     url = "https://odin.sohu.com/odin/api/a/blockdata?origin=article"
     url = "https://odin.sohu.com/odin/api/a/blockdata?origin=article"
-    payload = json.dumps({
-        "url": "//odin.sohu.com/odin/api/a/blockdata?origin=article",
-        "pageId": "1744186073720NK8",
-        "pvId": "1744186073657DQHXa2g",
-        "mainContent": {
-            "productId": "",
-            "productType": "",
-            "secureScore": "100",
-            "categoryId": "13",
-            "authorId": "121141867",
-            "articleId": "877216434"
-        },
-        "resourceList": [
-            {
-                "tplCompKey": "recommendVideoFeed",
-                "content": {
-                    "page": page_id,
-                    "requestId": "17441860918364TZ",
-                    "size": page_size,
-                    "productId": 1558,
-                    "productType": 13,
-                    "spm": "smpc.vd-land.end-rec"
-                },
-                "context": {
-                    "page_refer_url": "",
-                    "mkey": "channelId_13--mpid_877216434"
-                },
-                "adInfo": {},
-                "spmCCode": "end-rec",
-                "resourceId": "000000000000000000"
-            }
-        ]
-    })
+    payload = json.dumps(
+        {
+            "url": "//odin.sohu.com/odin/api/a/blockdata?origin=article",
+            "pageId": f"{get_ms_timestamp()}_{generate_random_strings(3)}",
+            "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+            "mainContent": {
+                "productId": "",
+                "productType": "",
+                "secureScore": "100",
+                "categoryId": "13",
+                "authorId": author_id,
+                "articleId": article_id,
+            },
+            "resourceList": [
+                {
+                    "tplCompKey": "recommendVideoFeed",
+                    "content": {
+                        "page": page,
+                        "requestId": f"{get_ms_timestamp()}_{generate_random_strings(3)}",
+                        "size": 24,
+                        "productId": 1558,
+                        "productType": 13,
+                        "spm": "smpc.vd-land.end-rec",
+                    },
+                    "context": {
+                        "page_refer_url": "",
+                        "mkey": "channelId_13--mpid_{}".format(article_id),
+                    },
+                    "adInfo": {},
+                    "spmCCode": "end-rec",
+                    "resourceId": "000000000000000000",
+                }
+            ],
+        }
+    )
     headers = {
     headers = {
-        'Accept': 'application/json, text/plain, */*',
-        'Accept-Language': 'zh,zh-CN;q=0.9',
-        'Connection': 'keep-alive',
-        'Content-Type': 'application/json',
-        'Origin': 'https://www.sohu.com',
-        'Referer': 'https://www.sohu.com/a/877216434_121141867?scm=10001.325_13-109000.0.0.5_32',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'same-site',
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
-        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"macOS"',
-        'Cookie': 'IPLOC=CN1200; SUV=250319174303GUDT; cto_bidid=DbraLl80TnZBSiUyRjd5Y3R3d3BPQ2gwNkhCQzFYcTR3cERUelpzdkVsOHIwbUx4VVB2Nm9obktXS1JicW00ZkZJbkY4MWtWTXJHajdKeEdIUEpnQ3ZNM2NNWFJRJTNEJTNE; _cc_id=16be057c82f6c7b9487f8e9de87093f8; cto_bundle=G-1cul95czh2RVh4SnRnZXRlUllFb0hyZFhKRkFiSHpWaU5JZDNNM0pKc25UMlUyQk9FcDYwRVNWcmc4VjdadmxDVyUyQmNhN3NydzJ6NVpJZSUyRjdHSnlhNVViSVUySDl0SCUyQk52blJFNk9VJTJCNTJZaVZHYzdUdUkwcHltWmkzR2d6aTI1TzNheFhkWiUyQjlvaGJkUldEQlElMkJOWTUlMkJTQSUzRCUzRA; gidinf=x099980107ee1a664f21e8892000bfb0cb568460d4f7; FCNEC=%5B%5B%22AKsRol-M9pfdhPRV6WdT0_UpWwGGHATDkhGhu3WhCRwchHNYyaiiIzdgxL07iMyYWnsT_EtmqDWtsEXTVncdSYtqnSPa5geKzsupz1uaDinhxC5vtZ5VYpyP2ce9ihomBxnBnoeGfP_Lbib3u5FF1RQacGdUubuKpg%3D%3D%22%5D%5D; clt=1743410197; cld=20250331163637; _ga=GA1.1.954524343.1743578691; reqtype=pc; _ga_DFBWYFE6Q0=GS1.1.1743578690.1.1.1743578734.16.0.0; cityIpLocation=61.48.133.26; beans_dmp=%7B%2210191%22%3A1744104695%2C%22admaster%22%3A1744104695%2C%22shunfei%22%3A1744104695%2C%22reachmax%22%3A1744104695%2C%22lingji%22%3A1744104695%2C%22yoyi%22%3A1744104695%2C%22ipinyou%22%3A1744104695%2C%22ipinyou_admaster%22%3A1744104695%2C%22miaozhen%22%3A1744104695%2C%22diantong%22%3A1744104695%2C%22huayang%22%3A1744104695%2C%22precisionS%22%3A1744104695%2C%22qunyi%22%3A1744104695%7D; _dfp=q4xXm1uuBqdI3QKRaKHbjDocPoUeGdyjTp29AM1Eak4%3D; hideAddDesktop=true; t=1744186073675'
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json",
+        "Origin": "https://www.sohu.com",
+        "Referer": seed_url,
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
     }
     }
-
-    response = requests.request("POST", url, headers=headers, data=payload)
-    recommend_list = response.json()['data']['recommendVideoFeed']['list']
-    for item in recommend_list:
-        L.append(item)
-
-L = []
-for i in range(1, 20):
     try:
     try:
-        get_recommendation_video_list(i, 30)
-    except Exception as e:
-        print(e)
-        print("page: ", i)
-        continue
+        # a timeout keeps a stalled connection from blocking the crawler forever
+        response = requests.post(url, headers=headers, data=payload, timeout=30)
+        response.raise_for_status()
+        return response.json()
+    except json.JSONDecodeError as e:
+        # checked first: newer requests raise a JSONDecodeError that is also a
+        # RequestException, which would otherwise be reported as a request failure
+        log(
+            task="sohu_recommendation",
+            function="get_recommendation_video_list",
+            message=f"响应解析失败: {e}",
+            data={"url": seed_url},
+        )
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_recommendation",
+            function="get_recommendation_video_list",
+            message=f"API请求失败: {e}",
+            data={"url": seed_url},
+        )
+    return None
 
 
-with open("877216434.json", "w") as f:
-    f.write(json.dumps(L, ensure_ascii=False, indent=4))
-    print("done")
 
 
+# usage example
+if __name__ == '__main__':
+    res = get_recommendation_video_list(
+        seed_url='https://www.sohu.com/a/877214751_121141867',
+        author_id='121141867',
+        article_id='877214751',
+        page=2
+    )
+    print(json.dumps(res, indent=4, ensure_ascii=False))

+ 114 - 0
coldStartTasks/crawler/sohu/get_user_homepage.py

@@ -0,0 +1,114 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import generate_random_digits
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_user_homepage_videos(author_id, page):
+    """
+    fetch one page of an author's homepage feed from sohu
+
+    :param author_id: sohu author id, sent as context.mkey and mainContent.authorId
+    :param page: page number of the feed (the request asks for size 20)
+    :return: decoded JSON response, or None when the request or the JSON decoding fails
+
+    NOTE(review): failures are logged and turned into None inside this function, so
+    @retry only helps if request_retry also retries on something other than these
+    exceptions -- confirm against applications.utils.request_retry.
+    """
+    url = "https://odin.sohu.com/odin/api/blockdata"
+    # mainContent.authorId used to be a constant copied from one captured request;
+    # send the requested author instead, as an int when the id is numeric so the
+    # JSON type on the wire stays a number
+    try:
+        main_author_id = int(author_id)
+    except (TypeError, ValueError):
+        main_author_id = author_id
+    payload = {
+        "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+        "pageId": f"{get_ms_timestamp()}_{generate_random_digits(13)}_{get_ms_timestamp()}",
+        "mainContent": {
+            "productType": "13",
+            "productId": "324",
+            "secureScore": "5",
+            "categoryId": "47",
+            "adTags": "11111111",
+            "authorId": main_author_id,
+        },
+        "resourceList": [
+            {
+                "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
+                "isServerRender": False,
+                "isSingleAd": False,
+                "configSource": "mp",
+                "content": {
+                    "productId": "325",
+                    "productType": "13",
+                    "size": 20,
+                    "pro": "0,1,3,4,5",
+                    "feedType": "XTOPIC_SYNTHETICAL",
+                    "view": "operateFeedMode",
+                    "innerTag": "work",
+                    "spm": "smpc.channel_248.block3_308_hHsK47_2_fd",
+                    "page": page,
+                    "requestId": f"{get_ms_timestamp()}{generate_random_strings(7)}_324",
+                },
+                "adInfo": {},
+                "context": {"mkey": author_id},
+            }
+        ],
+    }
+    headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://mp.sohu.com",
+        "Referer": "https://mp.sohu.com",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(
+            url,
+            headers=headers,
+            data=json.dumps(payload),
+            proxies=proxy(),
+            # a timeout keeps a stalled connection from blocking the crawler forever
+            timeout=30,
+        )
+        response.raise_for_status()
+        return response.json()
+    except json.JSONDecodeError as e:
+        # checked first: newer requests raise a JSONDecodeError that is also a
+        # RequestException, which would otherwise be reported as a request failure
+        log(
+            task="sohu_author_homepage",
+            function="get_user_homepage_videos",
+            message=f"响应解析失败: {e}",
+            data={"author_id": author_id},
+        )
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_author_homepage",
+            function="get_user_homepage_videos",
+            message=f"API请求失败: {e}",
+            data={"author_id": author_id},
+        )
+    return None
+
+
+# # usage example
+# if __name__ == '__main__':
+#     response_ = get_user_homepage_videos(
+#         author_id="121141867",
+#         page=2
+#     )
+#     print(json.dumps(response_, indent=4, ensure_ascii=False))

+ 0 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_account_videos.py


+ 141 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_recommend_videos.py

@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from config import long_articles_config
+from coldStartTasks.crawler.sohu import get_recommendation_video_list
+
+
+class CrawlerSohuRecommendVideos:
+    """
+    crawl the sohu videos recommended next to seed videos stored in publish_single_video_source
+    """
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+        self.platform = 'sohu'
+
+    def fetch_seed_videos(self) -> list[dict]:
+        """
+        get seed videos from database
+
+        :return: rows (as dicts) of sohu videos that pass the score/audit filters and still have source_account = 0
+        """
+        # plain string: the query has no placeholders, so no f-string is needed
+        fetch_query = """
+            select id, out_account_id,  url_unique_md5, article_title, score
+            from publish_single_video_source
+            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
+        """
+        return self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+
+    def crawler_each_video(self, video_data):
+        """
+        crawler each video data
+
+        :param video_data: one entry of the recommendation list; the keys id, title, authorId,
+            authorName, postTime and videoUrl are read
+        """
+        video_item = Item()
+        unique_id = f"{self.platform}-{video_data['id']}"
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
+        video_item.add("url_unique_md5", video_data["id"])
+        video_item.add("article_title", video_data["title"])
+        video_item.add("out_account_id", video_data['authorId'])
+        video_item.add("out_account_name", video_data["authorName"])
+        # postTime presumably is in milliseconds -- TODO confirm
+        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
+        video_item.add("platform", self.platform)
+        video_item.add("article_url", video_data["videoUrl"])
+        video_item.add("source_account", 0)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    db_client=self.db_client, video_item=item_with_oss_path
+                )
+        except Exception as e:
+            detail = {
+                "video_item": video_item.item,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            print(detail)
+
+    def get_each_video_recommendation(self, seed_video: dict) -> None:
+        """
+        get each video recommendation (pages 1 to 7 of the feed)
+
+        :param seed_video: row from fetch_seed_videos; out_account_id and url_unique_md5 are read
+        """
+        author_id = seed_video["out_account_id"]
+        article_id = seed_video["url_unique_md5"]
+        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
+        for page in range(1, 8):
+            try:
+                response = get_recommendation_video_list(outside_url, author_id, article_id, page)
+                if not response:
+                    continue
+                video_list = response['data']['recommendVideoFeed']['list'] or []
+            except Exception as e:
+                print(e)
+                print(traceback.format_exc())
+                continue
+
+            for video in tqdm(video_list):
+                # handle failures per video so one malformed entry does not
+                # drop the remaining videos of the page
+                try:
+                    self.crawler_each_video(video)
+                except Exception as e:
+                    print(e)
+                    print(traceback.format_exc())
+
+    def update_seed_video_status(self, task_id: int) -> int:
+        """
+        update seed video status
+
+        :param task_id: id of the seed row; its source_account is set from 0 to 1
+        :return: the result of db_client.save
+        """
+        update_query = """
+            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
+        """
+        return self.db_client.save(
+            query=update_query,
+            params=(1, task_id, 0)
+        )
+
+    def deal(self, max_seed_videos: int | None = 1):
+        """
+        crawl the recommendations of the fetched seed videos and mark each seed as handled
+
+        :param max_seed_videos: number of seed videos processed per run; the default 1 keeps
+            the previous hard-coded limit, None processes every fetched seed
+        """
+        task_list = self.fetch_seed_videos()
+        for task in tqdm(task_list[:max_seed_videos]):
+            try:
+                self.get_each_video_recommendation(task)
+                self.update_seed_video_status(task_id=task["id"])
+            except Exception as e:
+                print(e)
+                print(traceback.format_exc())