浏览代码

头条视频测试抓取

luojunhui 3 月之前
父节点
当前提交
7626ae815c
共有 2 个文件被更改,包括 33 次插入6 次删除
  1. 24 0
      applications/const/__init__.py
  2. 9 6
      tasks/crawler_toutiao_account_videos.py

+ 24 - 0
applications/const/__init__.py

@@ -319,6 +319,30 @@ class ChannelVideoCrawlerConst:
     SLEEP_SECOND = 2
 
 
+class ToutiaoVideoCrawlerConst:
+    """
+    Constants for the Toutiao video crawler.
+    """
+    # account status
+    CHANNEL_ACCOUNT_GOOD_STATUS = 1
+    CHANNEL_ACCOUNT_BAD_STATUS = 0
+
+    # earliest cursor: 2021-01-01 00:00:00 UTC+8 (Unix timestamp, seconds)
+    DEFAULT_CURSOR = 1609430400
+
+    # no source account
+    NO_SOURCE_ACCOUNT_STATUS = 0
+
+    # minimum title length
+    MIN_TITLE_LENGTH = 10
+
+    # max video length (seconds)
+    MAX_VIDEO_LENGTH = 600
+
+    # seconds to sleep between paginated requests
+    SLEEP_SECOND = 3
+
+
 
 
 

+ 9 - 6
tasks/crawler_toutiao_account_videos.py

@@ -2,8 +2,10 @@
 @author: luojunhui
 """
 import time
+
 from tqdm import tqdm
 
+from applications.const import ToutiaoVideoCrawlerConst
 from applications.db import DatabaseConnector
 from applications.pipeline import scrape_video_entities_process
 from applications.utils import Item
@@ -12,6 +14,7 @@ from applications.utils import insert_into_single_video_source_table
 from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
 from config import apolloConfig, long_articles_config
 
+const = ToutiaoVideoCrawlerConst()
 config = apolloConfig()
 cookie = config.getConfigValue("toutiao_blogger_cookie")
 
@@ -35,13 +38,13 @@ class CrawlerToutiaoAccountVideos:
         """
         get each account video list
         """
-        min_behot_time = 1609430400
         current_cursor = max_behot_time
         has_more = True
 
         while has_more:
-            response = get_toutiao_account_video_list(account_id=account_id, cookie=cookie,
-                                                      max_behot_time=current_cursor)
+            response = get_toutiao_account_video_list(
+                account_id=account_id, cookie=cookie,
+                max_behot_time=current_cursor)
             if response['message'] != 'success':
                 print("error")
                 break
@@ -54,7 +57,7 @@ class CrawlerToutiaoAccountVideos:
                 break
 
             max_timestamp_in_this_group = video_list[0]['publish_time']
-            if max_timestamp_in_this_group < min_behot_time:
+            if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
                 break
 
             crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
@@ -63,7 +66,7 @@ class CrawlerToutiaoAccountVideos:
                 self.crawler_each_video(video)
 
             if has_more:
-                time.sleep(3)
+                time.sleep(const.SLEEP_SECOND)
             else:
                 break
 
@@ -87,7 +90,7 @@ class CrawlerToutiaoAccountVideos:
         video_item.add("platform", "toutiao")
         video_item.add("read_cnt", video_data['read_count'])
         video_item.add("article_url", url)
-        video_item.add("source_account", 0)
+        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
         video_item.add("crawler_timestamp", int(time.time()))
 
         # check item before insert