|
@@ -2,8 +2,10 @@
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
"""
|
|
"""
|
|
import time
|
|
import time
|
|
|
|
+
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
+from applications.const import ToutiaoVideoCrawlerConst
|
|
from applications.db import DatabaseConnector
|
|
from applications.db import DatabaseConnector
|
|
from applications.pipeline import scrape_video_entities_process
|
|
from applications.pipeline import scrape_video_entities_process
|
|
from applications.utils import Item
|
|
from applications.utils import Item
|
|
@@ -12,6 +14,7 @@ from applications.utils import insert_into_single_video_source_table
|
|
from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
|
|
from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
|
|
from config import apolloConfig, long_articles_config
|
|
from config import apolloConfig, long_articles_config
|
|
|
|
|
|
|
|
+const = ToutiaoVideoCrawlerConst()
|
|
config = apolloConfig()
|
|
config = apolloConfig()
|
|
cookie = config.getConfigValue("toutiao_blogger_cookie")
|
|
cookie = config.getConfigValue("toutiao_blogger_cookie")
|
|
|
|
|
|
@@ -35,13 +38,13 @@ class CrawlerToutiaoAccountVideos:
|
|
"""
|
|
"""
|
|
get each account video list
|
|
get each account video list
|
|
"""
|
|
"""
|
|
- min_behot_time = 1609430400
|
|
|
|
current_cursor = max_behot_time
|
|
current_cursor = max_behot_time
|
|
has_more = True
|
|
has_more = True
|
|
|
|
|
|
while has_more:
|
|
while has_more:
|
|
- response = get_toutiao_account_video_list(account_id=account_id, cookie=cookie,
|
|
|
|
- max_behot_time=current_cursor)
|
|
|
|
|
|
+ response = get_toutiao_account_video_list(
|
|
|
|
+ account_id=account_id, cookie=cookie,
|
|
|
|
+ max_behot_time=current_cursor)
|
|
if response['message'] != 'success':
|
|
if response['message'] != 'success':
|
|
print("error")
|
|
print("error")
|
|
break
|
|
break
|
|
@@ -54,7 +57,7 @@ class CrawlerToutiaoAccountVideos:
|
|
break
|
|
break
|
|
|
|
|
|
max_timestamp_in_this_group = video_list[0]['publish_time']
|
|
max_timestamp_in_this_group = video_list[0]['publish_time']
|
|
- if max_timestamp_in_this_group < min_behot_time:
|
|
|
|
|
|
+ if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
|
|
break
|
|
break
|
|
|
|
|
|
crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
|
|
crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
|
|
@@ -63,7 +66,7 @@ class CrawlerToutiaoAccountVideos:
|
|
self.crawler_each_video(video)
|
|
self.crawler_each_video(video)
|
|
|
|
|
|
if has_more:
|
|
if has_more:
|
|
- time.sleep(3)
|
|
|
|
|
|
+ time.sleep(const.SLEEP_SECOND)
|
|
else:
|
|
else:
|
|
break
|
|
break
|
|
|
|
|
|
@@ -87,7 +90,7 @@ class CrawlerToutiaoAccountVideos:
|
|
video_item.add("platform", "toutiao")
|
|
video_item.add("platform", "toutiao")
|
|
video_item.add("read_cnt", video_data['read_count'])
|
|
video_item.add("read_cnt", video_data['read_count'])
|
|
video_item.add("article_url", url)
|
|
video_item.add("article_url", url)
|
|
- video_item.add("source_account", 0)
|
|
|
|
|
|
+ video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
|
|
video_item.add("crawler_timestamp", int(time.time()))
|
|
video_item.add("crawler_timestamp", int(time.time()))
|
|
|
|
|
|
# check item before insert
|
|
# check item before insert
|