浏览代码

视频号抓取任务

luojunhui 4 月之前
父节点
当前提交
0a88f27906
共有 2 个文件被更改，包括 20 次插入和 7 次删除
  1. 5 0
      applications/utils/download_video.py
  2. 15 7
      tasks/crawler_channel_account_videos.py

+ 5 - 0
applications/utils/download_video.py

@@ -92,6 +92,11 @@ def download_sph_video(download_url, key):
                     if chunk:  # filter out keep-alive chunks
                         f.write(chunk)
 
+        # 判断视频的大小
+        if os.path.getsize(encrypted_path) > 100 * 1024 * 1024:
+            os.remove(encrypted_path)
+            raise ValueError("Video size is too large")
+
         decrypt_sph_video(encrypted_path, key, decrypted_path)
         os.remove(encrypted_path)
         return decrypted_path

+ 15 - 7
tasks/crawler_channel_account_videos.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 @tool: pycharm && deepseek
 """
+import re
 import os
 import traceback
 import time
@@ -61,16 +62,24 @@ class CrawlerChannelAccountVideos:
         object_desc = video["objectDesc"]
         title = object_desc["description"]
         if self.whether_video_exists(title):
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title exists",
+                data={"video_id": video["id"], "title": title}
+            )
             return
 
-        if not title:
+        cleaned_title = re.sub(r'[^\u4e00-\u9fff]', '', title)
+        if len(cleaned_title) < 10:
+            log(
+                task="crawler_channel_account_videos",
+                function="crawler_each_video",
+                message="video title is too short",
+                data={"video_id": video["id"], "title": title}
+            )
             return
 
-        if len(title) < 10:
-            bad_status = 4
-        else:
-            bad_status = 0
-
         video_item = Item()
         video_id = video["id"]
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
@@ -80,7 +89,6 @@ class CrawlerChannelAccountVideos:
         video_item.add("out_account_name", video["nickname"])
         video_item.add("publish_timestamp", video["createtime"])
         video_item.add("platform", 'sph')
-        video_item.add("bad_status", bad_status)
         media = object_desc["media"][0]
         url = media["Url"]
         decode_key = media["decodeKey"]