luojunhui 1 week ago
parent
commit
bf4c879345

+ 36 - 0
applications/const/__init__.py

@@ -379,6 +379,42 @@ class ToutiaoVideoCrawlerConst:
     SLEEP_SECOND = 3
 
 
+class SohuVideoCrawlerConst:
+    """
+    const for sohu video crawler
+    """
+    # platform
+    PLATFORM = "sohu"
+
+    # source_account status for the recommend-fetch task
+    GET_RECOMMEND_INIT_STATUS = 0
+    GET_RECOMMEND_SUCCESS_STATUS = 1
+    GET_RECOMMEND_FAIL_STATUS = 99
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 600
+
+    # sleep seconds
+    SLEEP_SECOND = 3
+
+    # minimum relevance score required to fetch recommendations
+    GET_RECOMMEND_THRESHOLD_SCORE = 0.6
+
+    # audit status
+    AUDIT_SUCCESS_STATUS = 1
+
+    # video status
+    VIDEO_NOT_BAD_STATUS = 0
+
+    # recommendation pages to crawl (1 through 7)
+    PAGE_LIST = list(range(1, 8))
+
+
+
+
 class SingleVideoPoolPublishTaskConst:
     """
     const for single video pool publish task
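
A minimal sketch (not part of this commit) of how MIN_TITLE_LENGTH and MAX_VIDEO_LENGTH might be applied when filtering candidate videos; the "title" and "duration" field names are assumptions, since this diff does not show where those two constants are used.

    from applications.const import SohuVideoCrawlerConst

    const = SohuVideoCrawlerConst()

    def is_valid_candidate(video: dict) -> bool:
        # drop candidates with too-short titles or overly long durations (assumed field names)
        title_ok = len(video.get("title", "")) >= const.MIN_TITLE_LENGTH
        length_ok = video.get("duration", 0) <= const.MAX_VIDEO_LENGTH
        return title_ok and length_ok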

+ 2 - 1
coldStartTasks/crawler/sohu/__init__.py

@@ -1,3 +1,4 @@
+from .get_detail import get_video_detail
 from .get_recommedation import get_recommendation_video_list
-from.get_user_homepage import get_user_homepage_videos
+from .get_user_homepage import get_user_homepage_videos
 from .get_hot_point import get_hot_point_videos

+ 96 - 0
coldStartTasks/crawler/sohu/get_detail.py

@@ -0,0 +1,96 @@
+import re
+import json
+import requests
+from datetime import datetime
+from lxml import html
+from tenacity import retry
+
+from applications import log
+from applications.utils import proxy, request_retry
+
+retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
+
+
+def extract_video_url(html_text: str) -> str | None:
+    """
+    extract video url from html text
+    """
+    patterns = [
+        r'<source\s+src=["\'](.*?\.mp4)["\']',
+        r'videoUrl\s*=\s*["\'](.*?\.mp4)["\']',
+        r"(https?://\S+?\.mp4(?:\?\S+)?)",
+    ]
+    video_urls = []
+    for pattern in patterns:
+        match = re.findall(pattern, html_text, re.IGNORECASE)
+        video_urls.extend(match)
+
+    if video_urls:
+        return video_urls[0]
+    else:
+        return None
+
+
+def extract_video_info(html_text: str) -> dict | None:
+    """
+    extract video metadata (publish time, account, ids, video url, title) from html text
+    """
+    tree = html.fromstring(html_text)
+    publish_time_str = tree.xpath("//meta[@property='og:release_date']/@content")[0]
+    account_name = tree.xpath("//meta[@name='mediaid']/@content")[0]
+    sub_url = tree.xpath("//meta[@property='og:url']/@content")[0]
+    article_id = sub_url.split("/")[-1].split("_")[0]
+    account_id = sub_url.split("/")[-1].split("_")[1]
+    title = tree.xpath("//meta[@name='description']/@content")[0]
+    response = {
+        "publish_timestamp": int(
+            datetime.strptime(publish_time_str, "%Y-%m-%d %H:%M").timestamp() * 1000
+        ),
+        "account_name": account_name,
+        "article_id": article_id,
+        "account_id": account_id,
+        "video_url": extract_video_url(html_text),
+        "title": title,
+    }
+    return response
+
+
+@retry(**retry_desc)
+def get_video_detail(article_url: str) -> dict | None:
+    """
+    get video detail info from a sohu article page
+    """
+    payload = {}
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
+    }
+    try:
+        response = requests.get(article_url, headers=headers, data=payload, proxies=proxy)
+        response.raise_for_status()
+        video_info = extract_video_info(response.text)
+        return video_info
+
+    except requests.exceptions.RequestException as e:
+        log(
+            task="sohu_detail_video",
+            function="get_video_detail",
+            message=f"API request failed: {e}",
+        )
+
+    except json.JSONDecodeError as e:
+        log(
+            task="sohu_detail_video",
+            function="get_video_detail",
+            message=f"response parsing failed: {e}",
+        )
+    return None
+
+
+# url = 'https://www.sohu.com/a/877211651_121141867?scm=10001.325_13-109000.0.0.5_32&spm=smpc.channel_248.block3_308_NDdFbm_1_fd.25.1743578768825Lv6rTp1_324'
+# res = get_video_detail(url)
+#
+# print(res)
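
A self-contained sketch of the markup that extract_video_info relies on; the tag values below are invented for illustration, and the import assumes the module is reachable from the project root.

    from coldStartTasks.crawler.sohu.get_detail import extract_video_info

    # made-up page fragment containing the meta tags the parser reads
    sample_html = """
    <html><head>
      <meta property="og:release_date" content="2025-04-01 12:30"/>
      <meta name="mediaid" content="ExampleAccount"/>
      <meta property="og:url" content="https://www.sohu.com/a/877211651_121141867"/>
      <meta name="description" content="Example video title"/>
    </head><body>
      <video><source src="https://example.com/clip.mp4"></video>
    </body></html>
    """

    info = extract_video_info(sample_html)
    # info["article_id"] -> "877211651", info["account_id"] -> "121141867"
    # info["video_url"]  -> "https://example.com/clip.mp4"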

+ 1 - 1
crawler_sohu_videos_task.py

@@ -1,5 +1,5 @@
 from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos
-from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
+# from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuRecommendVideos
 
 def main():
     # step1, crawl sohu hot videos
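
For context, a minimal sketch (an assumption, since the rest of main() is not shown in this diff) of the entry point with only the hot-videos task enabled:

    from tasks.crawler_tasks.crawler_video.crawler_sohu_videos import CrawlerSohuHotVideos

    def main():
        # step1, crawl sohu hot videos
        task = CrawlerSohuHotVideos()
        task.deal()

    if __name__ == "__main__":
        main()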

+ 64 - 14
tasks/crawler_tasks/crawler_video/crawler_sohu_videos.py

@@ -7,31 +7,34 @@ from pymysql.cursors import DictCursor
 from tqdm import tqdm
 
 from applications import log
+from applications.const import SohuVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
 from applications.utils import str_to_md5
 from applications.utils import insert_into_single_video_source_table
 
+from coldStartTasks.crawler.sohu import get_video_detail
 from coldStartTasks.crawler.sohu import get_hot_point_videos
 from coldStartTasks.crawler.sohu import get_recommendation_video_list
 from coldStartTasks.crawler.sohu import get_user_homepage_videos
 
 from config import long_articles_config
 
+const = SohuVideoCrawlerConst()
+
 
 class CrawlerSohuVideos:
     def __init__(self):
         self.db_client = DatabaseConnector(long_articles_config)
         self.db_client.connect()
-        self.platform = "sohu"
 
     def crawler_each_video(self, video_data):
         """
         crawler each video data
         """
         video_item = Item()
-        unique_id = f"{self.platform}-{video_data['id']}"
+        unique_id = f"{const.PLATFORM}-{video_data['id']}"
 
         # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(unique_id)))
@@ -40,9 +43,9 @@ class CrawlerSohuVideos:
         video_item.add("out_account_id", video_data["authorId"])
         video_item.add("out_account_name", video_data["authorName"])
         video_item.add("publish_timestamp", video_data["postTime"] / 1000)
-        video_item.add("platform", self.platform)
+        video_item.add("platform", const.PLATFORM)
         video_item.add("article_url", video_data["videoUrl"])
-        video_item.add("source_account", 0)
+        video_item.add("source_account", const.GET_RECOMMEND_INIT_STATUS)
         video_item.add("crawler_timestamp", int(time.time()))
 
         # check item before insert
@@ -61,10 +64,34 @@ class CrawlerSohuVideos:
                 "error": str(e),
                 "traceback": traceback.format_exc(),
             }
-            print(detail)
+            log(
+                task="crawler_sohu_videos",
+                function="crawler_each_video",
+                message="crawler_sohu_videos failed",
+                status="failed",
+                data=detail,
+            )
 
 
 class CrawlerSohuHotVideos(CrawlerSohuVideos):
+
+    def process_hot_video_obj(self, video_obj):
+        """
+        fetch detail info for a hot-point video and reshape it into the item schema
+        """
+        article_url = f"https://www.sohu.com{video_obj['url']}"
+        video_detail_response = get_video_detail(article_url=article_url)
+        item_obj = {
+            "id": video_obj["id"],
+            "title": video_obj["title"],
+            "authorId": video_detail_response["account_id"],
+            "authorName": video_detail_response["account_name"],
+            "postTime": video_detail_response["publish_timestamp"],
+            "videoUrl": video_detail_response["video_url"],
+        }
+        self.crawler_each_video(item_obj)
+
     def deal(self):
         """
         crawler sohu hot videos every day
@@ -75,7 +102,7 @@ class CrawlerSohuHotVideos(CrawlerSohuVideos):
         ]["list"]
         for video in tqdm(hot_point_video_list, desc="crawler sohu hot videos"):
             try:
-                self.crawler_each_video(video)
+                self.process_hot_video_obj(video)
 
             except Exception as e:
                 log(
@@ -100,7 +127,11 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         fetch_query = f"""
             select id, out_account_id,  url_unique_md5, article_title, score
             from publish_single_video_source 
-            where platform = 'sohu' and source_account = 0 and score > 0.6 and audit_status = 1 and bad_status = 0;
+            where platform = '{const.PLATFORM}' 
+                and source_account = {const.GET_RECOMMEND_INIT_STATUS} 
+                and score > {const.GET_RECOMMEND_THRESHOLD_SCORE} 
+                and audit_status = {const.AUDIT_SUCCESS_STATUS} 
+                and bad_status = {const.VIDEO_NOT_BAD_STATUS};
         """
         seed_videos = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
         return seed_videos
@@ -112,8 +143,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         author_id = seed_video["out_account_id"]
         article_id = seed_video["url_unique_md5"]
         outside_url = f"https://www.sohu.com/a/{article_id}_{author_id}"
-        page_list = [i for i in range(1, 8)]
-        for page in page_list:
+        for page in const.PAGE_LIST:
             try:
                 response = get_recommendation_video_list(
                     outside_url, author_id, article_id, page
@@ -124,9 +154,17 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
                         self.crawler_each_video(video)
 
             except Exception as e:
-                print(e)
-                print(traceback.format_exc())
-                continue
+                log(
+                    task="crawler_sohu_videos",
+                    function="get_each_video_recommendation",
+                    message="get_each_video_recommendation failed",
+                    status="failed",
+                    data={
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                        "page": page,
+                    },
+                )
 
     def update_seed_video_status(self, task_id: int) -> int:
         """
@@ -135,7 +173,14 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
         update_query = f"""
             update publish_single_video_source set source_account = %s where id = %s and source_account = %s;
         """
-        return self.db_client.save(query=update_query, params=(1, task_id, 0))
+        return self.db_client.save(
+            query=update_query,
+            params=(
+                const.GET_RECOMMEND_SUCCESS_STATUS,
+                task_id,
+                const.GET_RECOMMEND_INIT_STATUS,
+            ),
+        )
 
     def deal(self):
         task_list = self.fetch_seed_videos()
@@ -143,6 +188,7 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
             try:
                 self.get_each_video_recommendation(task)
                 self.update_seed_video_status(task_id=task["id"])
+
             except Exception as e:
                 log(
                     task="crawler_sohu_videos",
@@ -158,11 +204,15 @@ class CrawlerSohuRecommendVideos(CrawlerSohuVideos):
 
 
 class CrawlerSohuUserPageVideos(CrawlerSohuVideos):
+    """
+    Most user homepage content is articles and the account system is not built yet, so this release skips crawling user pages; revisit later if needed.
+    """
+
     def get_author_list(self):
         """
         get author list from database
         """
-        return [121644888]
+        return []
 
     def process_each_page(self, response: dict):
         """