|
@@ -2,6 +2,7 @@
|
|
|
@author: luojunhui
|
|
|
@tool: pycharm && deepseek
|
|
|
"""
|
|
|
+import re
|
|
|
import os
|
|
|
import traceback
|
|
|
import time
|
|
@@ -61,16 +62,24 @@ class CrawlerChannelAccountVideos:
|
|
|
object_desc = video["objectDesc"]
|
|
|
title = object_desc["description"]
|
|
|
if self.whether_video_exists(title):
|
|
|
+ log(
|
|
|
+ task="crawler_channel_account_videos",
|
|
|
+ function="crawler_each_video",
|
|
|
+ message="video title exists",
|
|
|
+ data={"video_id": video["id"], "title": title}
|
|
|
+ )
|
|
|
return
|
|
|
|
|
|
- if not title:
|
|
|
+ cleaned_title = re.sub(r'[^\u4e00-\u9fff]', '', title)
|
|
|
+ if len(cleaned_title) < 10:
|
|
|
+ log(
|
|
|
+ task="crawler_channel_account_videos",
|
|
|
+ function="crawler_each_video",
|
|
|
+ message="video title is too short",
|
|
|
+ data={"video_id": video["id"], "title": title}
|
|
|
+ )
|
|
|
return
|
|
|
|
|
|
- if len(title) < 10:
|
|
|
- bad_status = 4
|
|
|
- else:
|
|
|
- bad_status = 0
|
|
|
-
|
|
|
video_item = Item()
|
|
|
video_id = video["id"]
|
|
|
video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
|
|
@@ -80,7 +89,6 @@ class CrawlerChannelAccountVideos:
|
|
|
video_item.add("out_account_name", video["nickname"])
|
|
|
video_item.add("publish_timestamp", video["createtime"])
|
|
|
video_item.add("platform", 'sph')
|
|
|
- video_item.add("bad_status", bad_status)
|
|
|
media = object_desc["media"][0]
|
|
|
url = media["Url"]
|
|
|
decode_key = media["decodeKey"]
|