فهرست منبع

头条视频测试抓取

luojunhui 7 ماه پیش
والد
کامیت
bd6ec141e7
2 فایل تغییر یافته به همراه 28 افزوده شده و 25 حذف شده
  1. 7 6
      applications/pipeline/crawler_pipeline.py
  2. 21 19
      tasks/crawler_toutiao_account_videos.py

+ 7 - 6
applications/pipeline/crawler_pipeline.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+
 import os
 
 from applications.utils import download_gzh_video
@@ -29,15 +30,15 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
     video crawler pipeline
     """
     # whether duplicate video title
-    video_title = video_item['article_title']
+    video_title = video_item["article_title"]
     if whether_duplicate_video_title(video_title, db_client):
         return empty_dict
 
     # video title sensitive words filter
 
     # download video
-    article_url = video_item['article_url']
-    platform = video_item['platform']
+    article_url = video_item["article_url"]
+    platform = video_item["platform"]
 
     match platform:
         case "toutiao":
@@ -45,16 +46,16 @@ def scrape_video_entities_process(video_item, db_client) -> dict:
         case "gzh":
             video_path = download_gzh_video(article_url)
         case "hksp":
-            video_path = ''
+            video_path = ""
         case "sph":
-            video_path = ''
+            video_path = ""
         case _:
             return empty_dict
 
     if video_path:
         # upload video to oss
         oss_path = upload_to_oss(video_path)
-        video_item['video_oss_path'] = oss_path
+        video_item["video_oss_path"] = oss_path
         os.remove(video_path)
         return video_item
     else:

+ 21 - 19
tasks/crawler_toutiao_account_videos.py

@@ -1,6 +1,7 @@
 """
 @author: luojunhui
 """
+
 import time
 
 from tqdm import tqdm
@@ -43,20 +44,20 @@ class CrawlerToutiaoAccountVideos:
 
         while has_more:
             response = get_toutiao_account_video_list(
-                account_id=account_id, cookie=cookie,
-                max_behot_time=current_cursor)
-            if response['message'] != 'success':
+                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
+            )
+            if response["message"] != "success":
                 print("error")
                 break
 
-            video_list = response['data']
-            has_more = response['has_more']
-            current_cursor = response['next']['max_behot_time']
+            video_list = response["data"]
+            has_more = response["has_more"]
+            current_cursor = response["next"]["max_behot_time"]
 
             if not video_list:
                 break
 
-            max_timestamp_in_this_group = video_list[0]['publish_time']
+            max_timestamp_in_this_group = video_list[0]["publish_time"]
             if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
                 break
 
@@ -75,20 +76,20 @@ class CrawlerToutiaoAccountVideos:
         crawler each video data
         """
         video_item = Item()
-        video_id = video_data['video_id']
-        title = video_data['title']
-        media = video_data['video']
-        url = media["download_addr"]['url_list'][0]
+        video_id = video_data["video_id"]
+        title = video_data["title"]
+        media = video_data["video"]
+        url = media["download_addr"]["url_list"][0]
 
         # add info into item
         video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
         video_item.add("url_unique_md5", video_id)
         video_item.add("article_title", title)
-        video_item.add("out_account_id", video_data['user']['user_id'])
-        video_item.add("out_account_name", video_data['source'])
-        video_item.add("publish_timestamp", video_data['publish_time'])
+        video_item.add("out_account_id", video_data["user"]["user_id"])
+        video_item.add("out_account_name", video_data["source"])
+        video_item.add("publish_timestamp", video_data["publish_time"])
         video_item.add("platform", "toutiao")
-        video_item.add("read_cnt", video_data['read_count'])
+        video_item.add("read_cnt", video_data["read_count"])
         video_item.add("article_url", url)
         video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
         video_item.add("crawler_timestamp", int(time.time()))
@@ -97,11 +98,12 @@ class CrawlerToutiaoAccountVideos:
         video_item.check(source="video")
         try:
             item_with_oss_path = scrape_video_entities_process(
-                video_item=video_item.item,
-                db_client=self.db_client
+                video_item=video_item.item, db_client=self.db_client
             )
             if item_with_oss_path:
-                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
+                insert_into_single_video_source_table(
+                    self.db_client, item_with_oss_path
+                )
         except Exception as e:
             print(e)
 
@@ -109,5 +111,5 @@ class CrawlerToutiaoAccountVideos:
         """
         class entrance
         """
-        account_id = 'MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi'
+        account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
         self.crawler_each_account_video_list(account_id)