
Test crawling of Toutiao videos

luojunhui 3 months ago
parent commit bd6ec141e7

+ 7 - 6
applications/pipeline/crawler_pipeline.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+
 import os
 
 from applications.utils import download_gzh_video
@@ -29,15 +30,15 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
     video crawler pipeline
     """
     # whether duplicate video title
-    video_title = video_item['article_title']
+    video_title = video_item["article_title"]
     if whether_duplicate_video_title(video_title, db_client):
         return empty_dict
 
     # video title sensitive words filter
 
     # download video
-    article_url = video_item['article_url']
-    platform = video_item['platform']
+    article_url = video_item["article_url"]
+    platform = video_item["platform"]
 
     match platform:
         case "toutiao":
@@ -45,16 +46,16 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
         case "gzh":
             video_path = download_gzh_video(article_url)
         case "hksp":
-            video_path = ''
+            video_path = ""
         case "sph":
-            video_path = ''
+            video_path = ""
         case _:
             return empty_dict
 
     if video_path:
         # upload video to oss
         oss_path = upload_to_oss(video_path)
-        video_item['video_oss_path'] = oss_path
+        video_item["video_oss_path"] = oss_path
         os.remove(video_path)
         return video_item
     else:

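Side note: `scrape_video_entities_process` routes each crawled item through a Python 3.10+ `match` statement on `platform`, falling through to an empty result for unknown platforms. Below is a minimal runnable sketch of that dispatch pattern, not the project's actual code: `download_stub` and `upload_stub` are hypothetical placeholders standing in for helpers like `download_gzh_video` and `upload_to_oss`.

```python
import os
import tempfile

empty_dict: dict = {}


def download_stub(url: str) -> str:
    """Placeholder downloader: writes a temp file and returns its path."""
    fd, path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)
    return path


def upload_stub(path: str) -> str:
    """Placeholder uploader: returns a fake OSS object path."""
    return "oss/" + os.path.basename(path)


def process(video_item: dict) -> dict:
    """Route an item to a platform-specific downloader, as in the diff above."""
    match video_item["platform"]:
        case "toutiao" | "gzh":
            video_path = download_stub(video_item["article_url"])
        case "hksp" | "sph":
            video_path = ""  # not implemented yet, mirroring the diff
        case _:
            return empty_dict  # unknown platform: skip the item

    if not video_path:
        return empty_dict

    video_item["video_oss_path"] = upload_stub(video_path)
    os.remove(video_path)  # drop the local file once it is "uploaded"
    return video_item


print(process({"platform": "toutiao", "article_url": "https://example.com/v"}))
```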
+ 21 - 19
tasks/crawler_toutiao_account_videos.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+
 import time
 
 from tqdm import tqdm
@@ -43,20 +44,20 @@ class CrawlerToutiaoAccountVideos:
 
         while has_more:
             response = get_toutiao_account_video_list(
-                account_id=account_id, cookie=cookie,
-                max_behot_time=current_cursor)
-            if response['message'] != 'success':
+                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
+            )
+            if response["message"] != "success":
                 print("error")
                 break
 
-            video_list = response['data']
-            has_more = response['has_more']
-            current_cursor = response['next']['max_behot_time']
+            video_list = response["data"]
+            has_more = response["has_more"]
+            current_cursor = response["next"]["max_behot_time"]
 
             if not video_list:
                 break
 
-            max_timestamp_in_this_group = video_list[0]['publish_time']
+            max_timestamp_in_this_group = video_list[0]["publish_time"]
             if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
                 break
 
@@ -75,20 +76,20 @@ class CrawlerToutiaoAccountVideos:
         crawler each video data
         """
         video_item = Item()
-        video_id = video_data['video_id']
-        title = video_data['title']
-        media = video_data['video']
-        url = media["download_addr"]['url_list'][0]
+        video_id = video_data["video_id"]
+        title = video_data["title"]
+        media = video_data["video"]
+        url = media["download_addr"]["url_list"][0]
 
         # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
         video_item.add("url_unique_md5", video_id)
         video_item.add("article_title", title)
-        video_item.add("out_account_id", video_data['user']['user_id'])
-        video_item.add("out_account_name", video_data['source'])
-        video_item.add("publish_timestamp", video_data['publish_time'])
+        video_item.add("out_account_id", video_data["user"]["user_id"])
+        video_item.add("out_account_name", video_data["source"])
+        video_item.add("publish_timestamp", video_data["publish_time"])
         video_item.add("platform", "toutiao")
-        video_item.add("read_cnt", video_data['read_count'])
+        video_item.add("read_cnt", video_data["read_count"])
         video_item.add("article_url", url)
         video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
         video_item.add("crawler_timestamp", int(time.time()))
@@ -97,11 +98,12 @@ class CrawlerToutiaoAccountVideos:
         video_item.check(source="video")
         try:
             item_with_oss_path = scrape_video_entities_process(
-                video_item=video_item.item,
-                db_client=self.db_client
+                video_item=video_item.item, db_client=self.db_client
             )
             if item_with_oss_path:
-                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+                insert_into_single_video_source_table(
+                    self.db_client, item_with_oss_path
+                )
         except Exception as e:
             print(e)
 
@@ -109,5 +111,5 @@ class CrawlerToutiaoAccountVideos:
         """
         class entrance
         """
-        account_id = 'MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi'
+        account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
         self.crawler_each_account_video_list(account_id)
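Side note: `crawler_each_account_video_list` pages through an account feed with a cursor. Each response carries `has_more` plus a `max_behot_time` cursor for the next request, and the loop also stops once the newest `publish_time` in a page falls below `const.DEFAULT_CURSOR`. A minimal sketch of that loop under assumed response shapes; `fetch_page`, `PAGES`, and the `DEFAULT_CURSOR` value are hypothetical stand-ins for `get_toutiao_account_video_list` and the `const` module.

```python
DEFAULT_CURSOR = 1_700_000_000  # assumed cutoff timestamp, illustrative only

# Pre-canned responses standing in for the real Toutiao API.
PAGES = [
    {"message": "success", "data": [{"publish_time": 1_710_000_000, "video_id": "a"}],
     "has_more": True, "next": {"max_behot_time": 1_710_000_000}},
    {"message": "success", "data": [{"publish_time": 1_690_000_000, "video_id": "b"}],
     "has_more": True, "next": {"max_behot_time": 1_690_000_000}},
]


def fetch_page(max_behot_time: int) -> dict:
    """Fake paged API: pops the next canned response."""
    if PAGES:
        return PAGES.pop(0)
    return {"message": "success", "data": [], "has_more": False,
            "next": {"max_behot_time": 0}}


def crawl_account() -> list:
    videos, has_more, cursor = [], True, 0
    while has_more:
        response = fetch_page(max_behot_time=cursor)
        if response["message"] != "success":
            break
        video_list = response["data"]
        has_more = response["has_more"]
        cursor = response["next"]["max_behot_time"]
        if not video_list:
            break
        # stop once the newest item in this page predates the cutoff
        if video_list[0]["publish_time"] < DEFAULT_CURSOR:
            break
        videos.extend(video_list)
    return videos


print(crawl_account())  # -> only the first page's video makes the cut
```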