4 月之前 · 0a88f27906
--- a/applications/utils/download_video.py
+++ b/applications/utils/download_video.py
@@ -92,6 +92,11 @@ def download_sph_video(download_url, key):
 
				                     if chunk:  # filter out keep-alive chunks
			
 
				                         f.write(chunk)
			
 
				 
			
 
				+        # Reject downloads larger than 100 MiB before decrypting
			
 
				+        if os.path.getsize(encrypted_path) > 100 * 1024 * 1024:
			
 
				+            os.remove(encrypted_path)
			
 
				+            raise ValueError("Video size is too large")
			
 
				+
			
 
				         decrypt_sph_video(encrypted_path, key, decrypted_path)
			
 
				         os.remove(encrypted_path)
			
 
				         return decrypted_path
			
--- a/tasks/crawler_channel_account_videos.py
+++ b/tasks/crawler_channel_account_videos.py
@@ -2,6 +2,7 @@
 
				 @author: luojunhui
			
 
				 @tool: pycharm && deepseek
			
 
				 """
			
 
				+import re
			
 
				 import os
			
 
				 import traceback
			
 
				 import time
			
@@ -61,16 +62,24 @@ class CrawlerChannelAccountVideos:
 
				         object_desc = video["objectDesc"]
			
 
				         title = object_desc["description"]
			
 
				         if self.whether_video_exists(title):
			
 
				+            log(
			
 
				+                task="crawler_channel_account_videos",
			
 
				+                function="crawler_each_video",
			
 
				+                message="video title exists",
			
 
				+                data={"video_id": video["id"], "title": title}
			
 
				+            )
			
 
				             return
			
 
				 
			
 
				-        if not title:
			
 
				+        cleaned_title = re.sub(r'[^\u4e00-\u9fff]', '', title)
			
 
				+        if len(cleaned_title) < 10:
			
 
				+            log(
			
 
				+                task="crawler_channel_account_videos",
			
 
				+                function="crawler_each_video",
			
 
				+                message="video title is too short",
			
 
				+                data={"video_id": video["id"], "title": title}
			
 
				+            )
			
 
				             return
			
 
				 
			
 
				-        if len(title) < 10:
			
 
				-            bad_status = 4
			
 
				-        else:
			
 
				-            bad_status = 0
			
 
				-
			
 
				         video_item = Item()
			
 
				         video_id = video["id"]
			
 
				         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
			
@@ -80,7 +89,6 @@ class CrawlerChannelAccountVideos:
 
				         video_item.add("out_account_name", video["nickname"])
			
 
				         video_item.add("publish_timestamp", video["createtime"])
			
 
				         video_item.add("platform", 'sph')
			
 
				-        video_item.add("bad_status", bad_status)
			
 
				         media = object_desc["media"][0]
			
 
				         url = media["Url"]
			
 
				         decode_key = media["decodeKey"]