luojunhui 1 week ago
parent
commit
bf4c879345

+ 36 - 0
applications/const/__init__.py

@@ -379,6 +379,42 @@ class ToutiaoVideoCrawlerConst:
     SLEEP_SECOND = 3
 
 
+class SohuVideoCrawlerConst:
+    """
+    const for sohu video crawler
+    """
+    # platform
+    PLATFORM = "sohu"
+
+    # source_account status for the recommend-fetch task
+    GET_RECOMMEND_INIT_STATUS = 0
+    GET_RECOMMEND_SUCCESS_STATUS = 1
+    GET_RECOMMEND_FAIL_STATUS = 99
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep seconds
+    SLEEP_SECOND = 3
+
+    # minimum relevance score required to fetch recommendations
+    GET_RECOMMEND_THRESHOLD_SCORE = 0.6
+
+    # audit status
+    AUDIT_SUCCESS_STATUS = 1
+
+    # video status
+    VIDEO_NOT_BAD_STATUS = 0
+
+    # recommendation pages to crawl (1 through 7)
+    PAGE_LIST = list(range(1, 8))
+
+
+
+
 class SingleVideoPoolPublishTaskConst:
     """
     const for single video pool publish task
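
A minimal sketch (not part of this commit) of how MIN_TITLE_LENGTH and MAX_VIDEO_LENGTH might be applied when filtering candidate videos; the "title" and "duration" field names are assumptions, since this diff does not show where those two constants are used.

    from applications.const import SohuVideoCrawlerConst

    const = SohuVideoCrawlerConst()

    def is_valid_candidate(video: dict) -> bool:
        # drop candidates with too-short titles or overly long durations (assumed field names)
        title_ok = len(video.get("title", "")) >= const.MIN_TITLE_LENGTH
        length_ok = video.get("duration", 0) <= const.MAX_VIDEO_LENGTH
        return title_ok and length_ok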

+ 2 - 1
coldStartTasks/crawler/sohu/__init__.py

@@ -1,3 +1,4 @@
+from .get_detail import get_video_detail
 from .get_recommedation import get_recommendation_video_list
-from.get_user_homepage import get_user_homepage_videos
+from .get_user_homepage import get_user_homepage_videos
 from .get_hot_point import get_hot_point_videos

+ 96 - 0
coldStartTasks/crawler/sohu/get_detail.py

@@ -0,0 +1,96 @@
+import re
+import json
+import requests
+from datetime import datetime
+from lxml import html
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+def extract_video_url(html_text: str) -> str | None:
+    """
+    extract video url from html text
+    """
+    patterns = [
+        r'<source\s+src=["\'](.*?\.mp4)["\']',
+        r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
+        r"(https?://\S+?\.mp4(?:\?\S+)?)",
+    ]
+    video_urls = []
+    for pattern in patterns:
+        match = re.findall(pattern, html_text, re.IGNORECASE)
+        video_urls.extend(match)
+
+    if video_urls:
+        return video_urls[0]
+    else:
+        return None
+
+
+def extract_video_info(html_text: str) -> dict | None:
+    """
+    extract video metadata (publish time, account, ids, video url, title) from html text
+    """
+    tree = html.fromstring(html_text)
+    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
+    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
+    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]
+    article_id = sub_url.split("/")[-1].split("_")[0]
+    account_id = sub_url.split("/")[-1].split("_")[1]
+    title = tree.xpath("//meta[@name='description']/@content")[0]
+    response = {
+        "publish_timestamp": int(
+            datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M").timestamp() * 1000
+        ),
+        "account_name": account_name,
+        "article_id": article_id,
+        "account_id": account_id,
+        "video_url": extract_video_url(html_text),
+        "title": title,
+    }
+    return response
+
+
+@retry(**retry_desc)
+def get_video_detail(article_url: str) -> dict | None:
+    """
+    get video detail info from a sohu article page
+    """
+    payload = {}
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.get(article_url, headers=headers, data=payload, proxies=proxy)
+        response.raise_for_status()
+        video_info = extract_video_info(response.text)
+        return video_info
+
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_detail_video",
+            function="get_video_detail",
+            message=f"API request failed: {e}",
+        )
+
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_detail_video",
+            function="get_video_detail",
+            message=f"response parsing failed: {e}",
+        )
+    return None
+
+
+# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
+# res = get_video_detail(url)
+#
+# print(res)
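
A self-contained sketch of the markup that extract_video_info relies on; the tag values below are invented for illustration, and the import assumes the module is reachable from the project root.

    from coldStartTasks.crawler.sohu.get_detail import extract_video_info

    # made-up page fragment containing the meta tags the parser reads
    sample_html = """
    <html><head>
      <meta property="og:release_date" content="2025-04-01 12:30"/>
      <meta name="mediaid" content="ExampleAccount"/>
      <meta property="og:url" content="https://www.sohu.com/a/877211651_121141867"/>
      <meta name="description" content="Example video title"/>
    </head><body>
      <video><source src="https://example.com/clip.mp4"></video>
    </body></html>
    """

    info = extract_video_info(sample_html)
    # info["article_id"] -> "877211651", info["account_id"] -> "121141867"
    # info["video_url"]  -> "https://example.com/clip.mp4"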

+ 1 - 1
crawler_sohu_videos_task.py

@@ -1,5 +1,5 @@
 from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
-from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
+# from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
 
 def main():
     # step1, crawl sohu hot videos
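
For context, a minimal sketch (an assumption, since the rest of main() is not shown in this diff) of the entry point with only the hot-videos task enabled:

    from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos

    def main():
        # step1, crawl sohu hot videos
        task = CrawlerSohuHotVideos()
        task.deal()

    if __name__ == "__main__":
        main()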

+ 64 - 14
tasks/crawler_tasks/crawler_video/crawler_sohu_videos.py

@@ -7,31 +7,34 @@ from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
+from applications.const import SohuVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
 from applications.utils import str_to_md5
 from applications.utils import insert_into_single_video_source_table
 
+from coldStartTasks.crawler.sohu import get_video_detail
 from coldStartTasks.crawler.sohu import get_hot_point_videos
 from coldStartTasks.crawler.sohu import get_recommendation_video_list
 from coldStartTasks.crawler.sohu import get_user_homepage_videos
 
 from config import long_articles_config
 
+const = SohuVideoCrawlerConst()
+
 
 class CrawlerSohuVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(long_articles_config)
         self.db_client.connect()
-        self.platform = "sohu"
 
     def crawler_each_video(self, video_data):
         """
         crawler each video data
         """
         video_item = Item()
-        unique_id = f"{self.platform}-{video_data['id']}"
+        unique_id = f"{const.PLATFORM}-{video_data['id']}"
 
         # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
@@ -40,9 +43,9 @@ class CrawlerSohuVideos:
         video_item.add("out_account_id", video_data["authorId"])
         video_item.add("out_account_name", video_data["authorName"])
         video_item.add("publish_timestamp", video_data["postTime"] / 1000)
-        video_item.add("platform", self.platform)
+        video_item.add("platform", const.PLATFORM)
         video_item.add("article_url", video_data["videoUrl"])
-        video_item.add("source_account", 0)
+        video_item.add("source_account", const.GET_RECOMMEND_INIT_STATUS)
         video_item.add("crawler_timestamp", int(time.time()))
 
         # check item before insert
@@ -61,10 +64,34 @@ class CrawlerSohuVideos:
                 "error": str(e),
                 "traceback": traceback.format_exc(),
             }
-            print(detail)
+            log(
+                task="crawler_sohu_videos",
+                function="crawler_each_video",
+                message="crawler_sohu_videos failed",
+                status="failed",
+                data=detail,
+            )
 
 
 class CrawlerSohuHotVideos(CrawlerSohuVideos):
+
+    def process_hot_video_obj(self, video_obj):
+        """
+        fetch detail info for a hot-point video and reshape it into the item schema
+        """
+        article_url = f"https://www.sohu.com{video_obj['url']}"
+        video_detail_response = get_video_detail(article_url=article_url)
+        item_obj = {
+            "id": video_obj["id"],
+            "title": video_obj["title"],
+            "authorId": video_detail_response["account_id"],
+            "authorName": video_detail_response["account_name"],
+            "postTime": video_detail_response["publish_timestamp"],
+            "videoUrl": video_detail_response["video_url"],
+        }
+        self.crawler_each_video(item_obj)
+
     def deal(self):
         """
         crawler sohu hot videos every day
@@ -75,7 +102,7 @@ class CrawlerSohuHotVideos(CrawlerSohuVideos):
         ]["list"]
         for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
             try:
-                self.crawler_each_video(video)
+                self.process_hot_video_obj(video)
 
             except Exception as e:
                 log(
@@ -100,7 +127,11 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         fetch_query = f"""
             select id, out_account_id,  url_unique_md5, article_title, score
             from publish_single_video_source 
-            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
+            where platform = '{const.PLATFORM}' 
+                and source_account = {const.GET_RECOMMEND_INIT_STATUS} 
+                and score > {const.GET_RECOMMEND_THRESHOLD_SCORE} 
+                and audit_status = {const.AUDIT_SUCCESS_STATUS} 
+                and bad_status = {const.VIDEO_NOT_BAD_STATUS};
         """
         seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
         return seed_videos
@@ -112,8 +143,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         author_id = seed_video["out_account_id"]
         article_id = seed_video["url_unique_md5"]
         outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
-        page_list = [i for i in range(1, 8)]
-        for page in page_list:
+        for page in const.PAGE_LIST:
             try:
                 response = get_recommendation_video_list(
                     outside_url, author_id, article_id, page
@@ -124,9 +154,17 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
                         self.crawler_each_video(video)
 
             except Exception as e:
-                print(e)
-                print(traceback.format_exc())
-                continue
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_video_recommendation",
+                    message="get_each_video_recommendation failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "page": page,
+                    },
+                )
 
     def update_seed_video_status(self, task_id: int) -> int:
         """
@@ -135,7 +173,14 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         update_query = f"""
             update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
         """
-        return self.db_client.save(query=update_query, params=(1, task_id, 0))
+        return self.db_client.save(
+            query=update_query,
+            params=(
+                const.GET_RECOMMEND_SUCCESS_STATUS,
+                task_id,
+                const.GET_RECOMMEND_INIT_STATUS,
+            ),
+        )
 
     def deal(self):
         task_list = self.fetch_seed_videos()
@@ -143,6 +188,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
             try:
                 self.get_each_video_recommendation(task)
                 self.update_seed_video_status(task_id=task["id"])
+
             except Exception as e:
                 log(
                     task="crawler_sohu_videos",
@@ -158,11 +204,15 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
 
 
 class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
+    """
+    Most user homepage content is articles and the account system is not built yet, so this release skips crawling user pages; revisit later if needed.
+    """
+
     def get_author_list(self):
         """
         get author list from database
         """
-        return [121644888]
+        return []
 
     def process_each_page(self, response: dict):
         """