
Merge branch '2025-04-08-sohu-video-crawler' of luojunhui/LongArticlesJob into master

luojunhui 3 days ago
parent commit 6ba8706912

+ 36 - 0
applications/const/__init__.py

@@ -379,6 +379,42 @@ class ToutiaoVideoCrawlerConst:
     SLEEP_SECOND = 3
 
 
+class SohuVideoCrawlerConst:
+    """
+    const for sohu video crawler
+    """
+    # platform
+    PLATFORM = "sohu"
+
+    # recommendation fetch status for seed videos (stored in the source_account field)
+    GET_RECOMMEND_INIT_STATUS = 0
+    GET_RECOMMEND_SUCCESS_STATUS = 1
+    GET_RECOMMEND_FAIL_STATUS = 99
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep seconds
+    SLEEP_SECOND = 3
+
+    # minimum relevance score required to fetch recommendations for a seed video
+    GET_RECOMMEND_THRESHOLD_SCORE = 0.6
+
+    # audit status
+    AUDIT_SUCCESS_STATUS = 1
+
+    # video status
+    VIDEO_NOT_BAD_STATUS = 0
+
+    # recommendation pages to crawl (1-7)
+    PAGE_LIST = list(range(1, 8))
+
+
+
+
 class SingleVideoPoolPublishTaskConst:
     """
     const for single video pool publish task

+ 3 - 0
applications/pipeline/crawler_pipeline.py

@@ -7,6 +7,7 @@ import json
 
 from applications import log
 
+from applications.utils import download_sohu_video
 from applications.utils import download_gzh_video
 from applications.utils import download_toutiao_video
 from applications.utils import upload_to_oss
@@ -70,6 +71,8 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
             video_path = ""
         case "sph":
             video_path = ""
+        case "sohu":
+            video_path = download_sohu_video(article_url)
         case _:
             return empty_dict
 

+ 1 - 0
applications/utils/__init__.py

@@ -6,6 +6,7 @@ from .cold_start import get_inner_account_set
 from .common import *
 from .download_video import download_gzh_video
 from .download_video import download_sph_video
+from .download_video import download_sohu_video
 from .download_video import download_toutiao_video
 from .item import Item
 from .save_to_db import insert_into_single_video_source_table

+ 13 - 0
applications/utils/download_video.py

@@ -154,3 +154,16 @@ def download_toutiao_video(video_url: str) -> str:
 
     return save_path
 
+
+def download_sohu_video(video_url: str) -> str:
+    """
+    download a sohu video to local disk (streamed in chunks) and return the local path
+    """
+    save_path = "static/{}.mp4".format(str_to_md5(video_url))
+    response = requests.get(video_url, headers=headers, stream=True)
+    with open(save_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+
+    return save_path
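
A minimal usage sketch of the new helper (the sample URL below is hypothetical; the import path follows the applications/utils/__init__.py change above):

    from applications.utils import download_sohu_video

    # hypothetical direct .mp4 url, e.g. the "video_url" field returned by get_video_detail
    local_path = download_sohu_video("https://example-cdn.sohu.com/clip.mp4")
    print(local_path)  # e.g. static/<md5-of-url>.mp4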

+ 4 - 0
coldStartTasks/crawler/sohu/__init__.py

@@ -0,0 +1,4 @@
+from .get_detail import get_video_detail
+from .get_recommedation import get_recommendation_video_list
+from .get_user_homepage import get_user_homepage_videos
+from .get_hot_point import get_hot_point_videos

+ 17 - 0
coldStartTasks/crawler/sohu/basic.py

@@ -0,0 +1,17 @@
+import time
+import random
+import string
+
+
+def generate_random_strings(length):
+    """generate a random alphanumeric string of the given length"""
+    chars = string.ascii_letters + string.digits
+    return "".join(random.choices(chars, k=length))
+
+
+def get_ms_timestamp():
+    """current unix timestamp in milliseconds"""
+    ms = time.time_ns() // 1000000
+    return ms
+
+
+def generate_random_digits(length):
+    """generate a random string of digits of the given length"""
+    return "".join(random.choices(string.digits, k=length))

+ 96 - 0
coldStartTasks/crawler/sohu/get_detail.py

@@ -0,0 +1,96 @@
+import re
+import json
+import requests
+from datetime import datetime
+from lxml import html
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+def extract_video_url(html_text: str) -> str | None:
+    """
+    extract video url from html text
+    """
+    patterns = [
+        r'<source\s+src=["\'](.*?\.mp4)["\']',
+        r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
+        r"(https?://\S+?\.mp4(?:\?\S+)?)",
+    ]
+    video_urls = []
+    for pattern in patterns:
+        match = re.findall(pattern, html_text, re.IGNORECASE)
+        video_urls.extend(match)
+
+    if video_urls:
+        return video_urls[0]
+    else:
+        return None
+
+
+def extract_video_info(html_text: str) -> dict | None:
+    """
+    extract video metadata (publish time, account name/id, article id, video url, title) from html text
+    """
+    tree = html.fromstring(html_text)
+    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
+    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
+    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]
+    article_id = sub_url.split("/")[-1].split("_")[0]
+    account_id = sub_url.split("/")[-1].split("_")[1]
+    title = tree.xpath("//meta[@name='description']/@content")[0]
+    response = {
+        "publish_timestamp": int(
+            datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M").timestamp() * 1000
+        ),
+        "account_name": account_name,
+        "article_id": article_id,
+        "account_id": account_id,
+        "video_url": extract_video_url(html_text),
+        "title": title,
+    }
+    return response
+
+
+@retry(**retry_desc)
+def get_video_detail(article_url: str) -> dict | None:
+    """
+    fetch the article page and extract its video detail info
+    """
+    payload = {}
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.get(article_url, headers=headers, data=payload, proxies=proxy())
+        response.raise_for_status()
+        video_info = extract_video_info(response.text)
+        return video_info
+
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_detail_video",
+            function="get_detail_video_url",
+            message=f"API请求失败: {e}",
+        )
+
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_detail_video",
+            function="get_detail_video_url",
+            message=f"响应解析失败: {e}",
+        )
+    return None
+
+
+# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
+# res = get_video_detail(url)
+#
+# print(res)

+ 93 - 0
coldStartTasks/crawler/sohu/get_hot_point.py

@@ -0,0 +1,93 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import generate_random_digits
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_hot_point_videos():
+    """
+    fetch the hot-point video feed from Sohu's odin blockdata API
+    """
+    url = "https://odin.sohu.com/odin/api/blockdata"
+    payload = json.dumps(
+        {
+            "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+            "pageId": f"{get_ms_timestamp()}_{generate_random_digits(11)}_{generate_random_strings(3)}",
+            "mainContent": {
+                "productType": "13",
+                "productId": "1650",
+                "secureScore": "50",
+                "categoryId": "47",
+                "adTags": "11111111",
+                "authorId": 121135924,
+            },
+            "resourceList": [
+                {
+                    "tplCompKey": "tpl-card-feed-pc-data",
+                    "isServerRender": False,
+                    "isSingleAd": False,
+                    "configSource": "mp",
+                    "content": {
+                        "productId": "449975",
+                        "productType": "15",
+                        "size": 200,
+                        "pro": "0,1,3,4,5",
+                        "feedType": "XTOPIC_SYNTHETICAL",
+                        "view": "feedMode",
+                        "innerTag": "news-slice",
+                        "spm": "smpc.channel_262.tpl-card-feed-pc",
+                        "page": 1,
+                        "requestId": f"{get_ms_timestamp()}{generate_random_strings(7)}_1650",
+                    },
+                    "adInfo": {
+                        "posCode": 10069,
+                        "rule": 2,
+                        "turn": 5,
+                        "number": 1,
+                        "begin": 6,
+                        "mergeType": 0,
+                    },
+                    "context": {"mkey": ""},
+                }
+            ],
+        }
+    )
+    headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh,zh-CN;q=0.9",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://www.sohu.com",
+        "Referer": "https://www.sohu.com/xchannel/TURBd01EQXhOalV3",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_video_list",
+            message=f"API请求失败: {e}",
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_hot_videos",
+            function="get_hot_video_list",
+            message=f"响应解析失败: {e}",
+        )
+    return None
+
+
+# if __name__ == "__main__":
+#     res = get_hot_point_videos()
+#     hot_point_videos = res["data"]["tpl-card-feed-pc-data"]["list"]
+#     for index, item in enumerate(hot_point_videos):
+#         print(index, item["title"])

+ 90 - 0
coldStartTasks/crawler/sohu/get_recommedation.py

@@ -0,0 +1,90 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_recommendation_video_list(seed_url, author_id, article_id, page):
+    """
+    fetch one page of recommended videos for a seed article from Sohu's odin blockdata API
+    """
+    url = "https://odin.sohu.com/odin/api/a/blockdata?origin=article"
+    payload = json.dumps(
+        {
+            "url": "//odin.sohu.com/odin/api/a/blockdata?origin=article",
+            "pageId": f"{get_ms_timestamp()}_{generate_random_strings(3)}",
+            "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+            "mainContent": {
+                "productId": "",
+                "productType": "",
+                "secureScore": "100",
+                "categoryId": "13",
+                "authorId": author_id,
+                "articleId": article_id,
+            },
+            "resourceList": [
+                {
+                    "tplCompKey": "recommendVideoFeed",
+                    "content": {
+                        "page": page,
+                        "requestId": f"{get_ms_timestamp()}_{generate_random_strings(3)}",
+                        "size": 24,
+                        "productId": 1558,
+                        "productType": 13,
+                        "spm": "smpc.vd-land.end-rec",
+                    },
+                    "context": {
+                        "page_refer_url": "",
+                        "mkey": "channelId_13--mpid_{}".format(article_id),
+                    },
+                    "adInfo": {},
+                    "spmCCode": "end-rec",
+                    "resourceId": "000000000000000000",
+                }
+            ],
+        }
+    )
+    headers = {
+        "Accept": "application/json, text/plain, */*",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json",
+        "Origin": "https://www.sohu.com",
+        "Referer": seed_url,
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(url, headers=headers, data=payload, proxies=proxy())
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_recommendation",
+            function="get_recommendation_video_list",
+            message=f"API请求失败: {e}",
+            data={"url": seed_url},
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_recommendation",
+            function="get_recommendation_video_list",
+            message=f"响应解析失败: {e}",
+            data={"url": seed_url},
+        )
+    return None
+
+
+# # usage example
+# if __name__ == "__main__":
+#     res = get_recommendation_video_list(
+#         seed_url="https://www.sohu.com/a/877214751_121141867",
+#         author_id="121141867",
+#         article_id="877214751",
+#         page=2,
+#     )
+#     print(json.dumps(res, indent=4, ensure_ascii=False))

+ 89 - 0
coldStartTasks/crawler/sohu/get_user_homepage.py

@@ -0,0 +1,89 @@
+import requests
+import json
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+from coldStartTasks.crawler.sohu.basic import generate_random_strings
+from coldStartTasks.crawler.sohu.basic import generate_random_digits
+from coldStartTasks.crawler.sohu.basic import get_ms_timestamp
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+@retry(**retry_desc)
+def get_user_homepage_videos(author_id, page):
+    """
+    fetch one page of videos from a Sohu author's homepage via the odin blockdata API
+    """
+    url = "https://odin.sohu.com/odin/api/blockdata"
+    payload = {
+        "pvId": f"{get_ms_timestamp()}_{generate_random_strings(7)}",
+        "pageId": f"{get_ms_timestamp()}_{generate_random_digits(13)}_{get_ms_timestamp()}",
+        "mainContent": {
+            "productType": "13",
+            "productId": "324",
+            "secureScore": "5",
+            "categoryId": "47",
+            "adTags": "11111111",
+            "authorId": 121135924,
+        },
+        "resourceList": [
+            {
+                "tplCompKey": "FeedSlideloadAuthor_2_0_pc_1655965929143_data2",
+                "isServerRender": False,
+                "isSingleAd": False,
+                "configSource": "mp",
+                "content": {
+                    "productId": "325",
+                    "productType": "13",
+                    "size": 20,
+                    "pro": "0,1,3,4,5",
+                    "feedType": "XTOPIC_SYNTHETICAL",
+                    "view": "operateFeedMode",
+                    "innerTag": "work",
+                    "spm": "smpc.channel_248.block3_308_hHsK47_2_fd",
+                    "page": page,
+                    "requestId": f"{get_ms_timestamp()}{generate_random_strings(7)}_324",
+                },
+                "adInfo": {},
+                "context": {"mkey": author_id},
+            }
+        ],
+    }
+    headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Content-Type": "application/json;charset=UTF-8",
+        "Origin": "https://mp.sohu.com",
+        "Referer": "https://mp.sohu.com",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.post(
+            url, headers=headers, data=json.dumps(payload), proxies=proxy()
+        )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_author_homepage",
+            function="get_homepage_video_list",
+            message=f"API请求失败: {e}",
+            data={"author_id": author_id},
+        )
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_author_homepage",
+            function="get_homepage_video_list",
+            message=f"响应解析失败: {e}",
+            data={"author_id": author_id},
+        )
+    return None
+
+
+# # usage example
+# if __name__ == '__main__':
+#     response_ = get_user_homepage_videos(
+#         author_id="121141867",
+#         page=2
+#     )
+#     print(json.dumps(response_, indent=4, ensure_ascii=False))

+ 1 - 1
coldStartTasks/publish/publish_single_video_pool_videos.py

@@ -52,7 +52,7 @@ class PublishSingleVideoPoolVideos:
         """
         entrance of this class
         """
-        platform_list = ["sph", "gzh", "toutiao", "hksp"]
+        platform_list = ["sph", "gzh", "toutiao", "hksp", "sohu"]
         for platform in tqdm(platform_list, desc='process each platform'):
             task_list = self.get_task_list(platform)
             task_id_tuple = tuple([task['id'] for task in task_list])

+ 14 - 0
crawler_sohu_videos_task.py

@@ -0,0 +1,14 @@
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
+from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
+
+def main():
+    # step1, crawl sohu hot videos
+    crawler_sohu_hot_videos = CrawlerSohuHotVideos()
+    crawler_sohu_hot_videos.deal()
+
+    # step2, crawl sohu recommend videos
+    crawler_sohu_recommend_videos = CrawlerSohuRecommendVideos()
+    crawler_sohu_recommend_videos.deal()
+
+if __name__ == '__main__':
+    main()

+ 6 - 2
requirements.txt

@@ -17,9 +17,13 @@ setuptools
 tqdm~=4.66.4
 pyapollos~=0.1.5
 protobuf~=3.20.3
-openai~=1.17.0
+openai~=1.72.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
 playwright~=1.49.1
 volcengine-python-sdk[ark]
-tenacity~=9.0.0
+tenacity~=9.0.0
+scikit-learn~=1.6.1
+google~=3.0.0
+cffi~=1.17.1
+lxml~=5.3.2

+ 282 - 0
tasks/crawler_tasks/crawler_video/crawler_sohu_videos.py

@@ -0,0 +1,282 @@
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import SohuVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+
+from coldStartTasks.crawler.sohu import get_video_detail
+from coldStartTasks.crawler.sohu import get_hot_point_videos
+from coldStartTasks.crawler.sohu import get_recommendation_video_list
+from coldStartTasks.crawler.sohu import get_user_homepage_videos
+
+from config import long_articles_config
+
+const = SohuVideoCrawlerConst()
+
+
+class CrawlerSohuVideos:
+    def __init__(self):
+        self.db_client = DatabaseConnector(long_articles_config)
+        self.db_client.connect()
+
+    def crawler_each_video(self, video_data):
+        """
+        crawl one video item and insert it into the single-video source table
+        """
+        video_item = Item()
+        unique_id = f"{const.PLATFORM}-{video_data['id']}"
+
+        # add info into item
+        video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
+        video_item.add("url_unique_md5", video_data["id"])
+        video_item.add("article_title", video_data["title"])
+        video_item.add("out_account_id", video_data["authorId"])
+        video_item.add("out_account_name", video_data["authorName"])
+        video_item.add("publish_timestamp", video_data["postTime"] / 1000)
+        video_item.add("platform", const.PLATFORM)
+        video_item.add("article_url", video_data["videoUrl"])
+        video_item.add("source_account", const.GET_RECOMMEND_INIT_STATUS)
+        video_item.add("crawler_timestamp", int(time.time()))
+
+        # check item before insert
+        video_item.check(source="video")
+        try:
+            item_with_oss_path = scrape_video_entities_process(
+                video_item=video_item.item, db_client=self.db_client
+            )
+            if item_with_oss_path:
+                insert_into_single_video_source_table(
+                    db_client=self.db_client, video_item=item_with_oss_path
+                )
+        except Exception as e:
+            detail = {
+                "video_item": video_item.item,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            log(
+                task="crawler_sohu_videos",
+                function="crawler_each_video",
+                message="crawler_sohu_videos failed",
+                status="failed",
+                data=detail,
+            )
+
+
+class CrawlerSohuHotVideos(CrawlerSohuVideos):
+
+    # process hot video obj to satisfy video item
+    def process_hot_video_obj(self, video_obj):
+        """
+        process hot video obj
+        """
+        article_url = f"https://www.sohu.com{video_obj['url']}"
+        video_detail_response = get_video_detail(article_url=article_url)
+        item_obj = {
+            "id": video_obj["id"],
+            "title": video_obj["title"],
+            "authorId": video_detail_response["account_id"],
+            "authorName": video_detail_response["account_name"],
+            "postTime": video_detail_response["publish_timestamp"],
+            "videoUrl": video_detail_response["video_url"],
+        }
+        self.crawler_each_video(item_obj)
+
+    def deal(self):
+        """
+        crawl sohu hot videos (intended to run once a day)
+        """
+        hot_point_video_response = get_hot_point_videos()
+        hot_point_video_list = hot_point_video_response["data"][
+            "tpl-card-feed-pc-data"
+        ]["list"]
+        for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
+            try:
+                self.process_hot_video_obj(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_hot_videos",
+                    message="crawler_sohu_hot_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+
+class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
+
+    def fetch_seed_videos(self) -> list[dict]:
+        """
+        get seed videos from database
+        """
+        fetch_query = f"""
+            select id, out_account_id,  url_unique_md5, article_title, score
+            from publish_single_video_source 
+            where platform = '{const.PLATFORM}' 
+                and source_account = {const.GET_RECOMMEND_INIT_STATUS} 
+                and score > {const.GET_RECOMMEND_THRESHOLD_SCORE} 
+                and audit_status = {const.AUDIT_SUCCESS_STATUS} 
+                and bad_status = {const.VIDEO_NOT_BAD_STATUS};
+        """
+        seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return seed_videos
+
+    def get_each_video_recommendation(self, seed_video: dict) -> None:
+        """
+        get each video recommendation
+        """
+        author_id = seed_video["out_account_id"]
+        article_id = seed_video["url_unique_md5"]
+        outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
+        for page in const.PAGE_LIST:
+            try:
+                response = get_recommendation_video_list(
+                    outside_url, author_id, article_id, page
+                )
+                if response:
+                    video_list = response["data"]["recommendVideoFeed"]["list"]
+                    for video in tqdm(video_list, desc=f"page: {page}"):
+                        self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_video_recommendation",
+                    message="get_each_video_recommendation failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "page": page,
+                    },
+                )
+
+    def update_seed_video_status(self, task_id: int) -> int:
+        """
+        update seed video status
+        """
+        update_query = """
+            update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
+        """
+        return self.db_client.save(
+            query=update_query,
+            params=(
+                const.GET_RECOMMEND_SUCCESS_STATUS,
+                task_id,
+                const.GET_RECOMMEND_INIT_STATUS,
+            ),
+        )
+
+    def deal(self):
+        task_list = self.fetch_seed_videos()
+        for task in tqdm(task_list):
+            try:
+                self.get_each_video_recommendation(task)
+                self.update_seed_video_status(task_id=task["id"])
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_hot_videos",
+                    message="crawler_sohu_hot_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": task,
+                    },
+                )
+
+
+class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
+    """
+    Author homepages are mostly articles and the account system is not built yet, so this release skips crawling them; revisit later if needed.
+    """
+
+    def get_author_list(self):
+        """
+        get author list from database
+        """
+        return []
+
+    def process_each_page(self, response: dict):
+        """
+        process each page
+        """
+        video_list = response["data"]["FeedSlideloadAuthor_2_0_pc_1655965929143_data2"][
+            "list"
+        ]
+        for video in tqdm(video_list, desc="crawler sohu user page videos"):
+            try:
+                self.crawler_each_video(video)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="process_each_page",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "video": video,
+                    },
+                )
+
+    def get_each_user_videos(self, author_id: int):
+        """
+        get each user videos
+        """
+        page_list = [i for i in range(1, 2)]
+        for page in page_list:
+            try:
+                response = get_user_homepage_videos(author_id, page)
+                self.process_each_page(response)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_user_videos",
+                    message="crawler_sohu_user_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_id": author_id,
+                        "page": page,
+                    },
+                )
+
+    def deal(self):
+        author_list = self.get_author_list()
+        for author_id in tqdm(author_list, desc="crawler sohu user videos"):
+            try:
+                self.get_each_user_videos(author_id)
+
+            except Exception as e:
+                log(
+                    task="crawler_sohu_videos",
+                    function="crawler_sohu_hot_videos",
+                    message="crawler_sohu_hot_videos failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "author_od": author_id,
+                    },
+                )