  1. """
  2. @author: luojunhui
  3. """
import time

from tqdm import tqdm

from applications.db import DatabaseConnector
from applications.pipeline import video_crawler_pipeline
from applications.utils import Item
from applications.utils import str_to_md5
from applications.utils import insert_into_single_video_source_table
from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
from config import apolloConfig, long_articles_config

config = apolloConfig()
cookie = config.getConfigValue("toutiao_blogger_cookie")
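# NOTE: the blogger cookie is read once from Apollo config at import time
# and reused for every video-list request below.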

class CrawlerToutiaoAccountVideos:
    """
    toutiao blogger crawler
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        get account list
        """
        # TODO: fetch the toutiao blogger account list; not implemented yet
        return
    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        get each account video list
        """
        # only crawl videos published after 2024-01-01 00:00:00 (UTC+8)
        min_behot_time = 1704038400
        current_cursor = max_behot_time
        has_more = True
        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id, cookie=cookie, max_behot_time=current_cursor
            )
            if response['message'] != 'success':
                print("get account video list failed, response: {}".format(response))
                break
            video_list = response['data']
            has_more = response['has_more']
            current_cursor = response['next']['max_behot_time']
            if not video_list:
                break
            # each page is newest-first, so if even the newest item on this
            # page predates min_behot_time, all remaining pages do as well
            max_timestamp_in_this_group = video_list[0]['publish_time']
            if max_timestamp_in_this_group < min_behot_time:
                break
            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)
            if has_more:
                # throttle between pages to avoid hammering the API
                time.sleep(3)
            else:
                break
    def crawler_each_video(self, video_data):
        """
        crawler each video data
        """
        video_item = Item()
        video_id = video_data['video_id']
        title = video_data['title']
        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data['user']['user_id'])
        video_item.add("out_account_name", video_data['source'])
        video_item.add("publish_timestamp", video_data['publish_time'])
        video_item.add("platform", "toutiao")
        video_item.add("read_cnt", video_data['read_count'])
        # take the first candidate download url for the video file
        media = video_data['video']
        url = media["download_addr"]['url_list'][0]
        video_item.add("article_url", url)
        video_item.add("source_account", 0)
        video_item.check(source="video")
        try:
            item_with_oss_path = video_crawler_pipeline(
                video_item=video_item.item,
                db_client=self.db_client
            )
            insert_into_single_video_source_table(self.db_client, item_with_oss_path)
        except Exception as e:
            print("save video failed, video_id: {}, error: {}".format(video_id, e))
    def deal(self):
        """
        class entrance
        """
        account_id = 'MS4wLjABAAAAXp7v7A9VfXh-Pfo1TwejlJViATS7aqxuLnBHjaEb8tx1nDTLe7jF7KsNAR9RoVWk'
        self.crawler_each_account_video_list(account_id)
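
# Minimal usage sketch (an assumption, not part of the original file): running
# the module directly crawls the single demo account hardcoded in deal().
if __name__ == "__main__":
    task = CrawlerToutiaoAccountVideos()
    task.deal()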