|
@@ -1,6 +1,7 @@
|
|
"""
|
|
"""
|
|
@author: luojunhui
|
|
@author: luojunhui
|
|
"""
|
|
"""
|
|
|
|
+
|
|
import time
|
|
import time
|
|
|
|
|
|
from tqdm import tqdm
|
|
from tqdm import tqdm
|
|
@@ -43,20 +44,20 @@ class CrawlerToutiaoAccountVideos:
|
|
|
|
|
|
while has_more:
|
|
while has_more:
|
|
response = get_toutiao_account_video_list(
|
|
response = get_toutiao_account_video_list(
|
|
- account_id=account_id, cookie=cookie,
|
|
|
|
- max_behot_time=current_cursor)
|
|
|
|
- if response['message'] != 'success':
|
|
|
|
|
|
+ account_id=account_id, cookie=cookie, max_behot_time=current_cursor
|
|
|
|
+ )
|
|
|
|
+ if response["message"] != "success":
|
|
print("error")
|
|
print("error")
|
|
break
|
|
break
|
|
|
|
|
|
- video_list = response['data']
|
|
|
|
- has_more = response['has_more']
|
|
|
|
- current_cursor = response['next']['max_behot_time']
|
|
|
|
|
|
+ video_list = response["data"]
|
|
|
|
+ has_more = response["has_more"]
|
|
|
|
+ current_cursor = response["next"]["max_behot_time"]
|
|
|
|
|
|
if not video_list:
|
|
if not video_list:
|
|
break
|
|
break
|
|
|
|
|
|
- max_timestamp_in_this_group = video_list[0]['publish_time']
|
|
|
|
|
|
+ max_timestamp_in_this_group = video_list[0]["publish_time"]
|
|
if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
|
|
if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
|
|
break
|
|
break
|
|
|
|
|
|
@@ -75,20 +76,20 @@ class CrawlerToutiaoAccountVideos:
|
|
crawler each video data
|
|
crawler each video data
|
|
"""
|
|
"""
|
|
video_item = Item()
|
|
video_item = Item()
|
|
- video_id = video_data['video_id']
|
|
|
|
- title = video_data['title']
|
|
|
|
- media = video_data['video']
|
|
|
|
- url = media["download_addr"]['url_list'][0]
|
|
|
|
|
|
+ video_id = video_data["video_id"]
|
|
|
|
+ title = video_data["title"]
|
|
|
|
+ media = video_data["video"]
|
|
|
|
+ url = media["download_addr"]["url_list"][0]
|
|
|
|
|
|
# add info into item
|
|
# add info into item
|
|
video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
|
|
video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
|
|
video_item.add("url_unique_md5", video_id)
|
|
video_item.add("url_unique_md5", video_id)
|
|
video_item.add("article_title", title)
|
|
video_item.add("article_title", title)
|
|
- video_item.add("out_account_id", video_data['user']['user_id'])
|
|
|
|
- video_item.add("out_account_name", video_data['source'])
|
|
|
|
- video_item.add("publish_timestamp", video_data['publish_time'])
|
|
|
|
|
|
+ video_item.add("out_account_id", video_data["user"]["user_id"])
|
|
|
|
+ video_item.add("out_account_name", video_data["source"])
|
|
|
|
+ video_item.add("publish_timestamp", video_data["publish_time"])
|
|
video_item.add("platform", "toutiao")
|
|
video_item.add("platform", "toutiao")
|
|
- video_item.add("read_cnt", video_data['read_count'])
|
|
|
|
|
|
+ video_item.add("read_cnt", video_data["read_count"])
|
|
video_item.add("article_url", url)
|
|
video_item.add("article_url", url)
|
|
video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
|
|
video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
|
|
video_item.add("crawler_timestamp", int(time.time()))
|
|
video_item.add("crawler_timestamp", int(time.time()))
|
|
@@ -97,11 +98,12 @@ class CrawlerToutiaoAccountVideos:
|
|
video_item.check(source="video")
|
|
video_item.check(source="video")
|
|
try:
|
|
try:
|
|
item_with_oss_path = scrape_video_entities_process(
|
|
item_with_oss_path = scrape_video_entities_process(
|
|
- video_item=video_item.item,
|
|
|
|
- db_client=self.db_client
|
|
|
|
|
|
+ video_item=video_item.item, db_client=self.db_client
|
|
)
|
|
)
|
|
if item_with_oss_path:
|
|
if item_with_oss_path:
|
|
- insert_into_single_video_source_table(self.db_client, item_with_oss_path)
|
|
|
|
|
|
+ insert_into_single_video_source_table(
|
|
|
|
+ self.db_client, item_with_oss_path
|
|
|
|
+ )
|
|
except Exception as e:
|
|
except Exception as e:
|
|
print(e)
|
|
print(e)
|
|
|
|
|
|
@@ -109,5 +111,5 @@ class CrawlerToutiaoAccountVideos:
|
|
"""
|
|
"""
|
|
class entrance
|
|
class entrance
|
|
"""
|
|
"""
|
|
- account_id = 'MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi'
|
|
|
|
|
|
+ account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
|
|
self.crawler_each_account_video_list(account_id)
|
|
self.crawler_each_account_video_list(account_id)
|