crawler_toutiao_account_videos.py

  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. from tqdm import tqdm
  6. from applications.const import ToutiaoVideoCrawlerConst
  7. from applications.db import DatabaseConnector
  8. from applications.pipeline import scrape_video_entities_process
  9. from applications.utils import Item
  10. from applications.utils import str_to_md5
  11. from applications.utils import insert_into_single_video_source_table
  12. from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
  13. from config import apolloConfig, long_articles_config
  14. const = ToutiaoVideoCrawlerConst()
  15. config = apolloConfig()
  16. cookie = config.getConfigValue("toutiao_blogger_cookie")

class CrawlerToutiaoAccountVideos:
    """
    Toutiao blogger crawler
    """

    def __init__(self):
        self.db_client = DatabaseConnector(db_config=long_articles_config)
        self.db_client.connect()

    def get_account_list(self):
        """
        get account list
        """
        # TODO: not implemented yet; deal() currently uses a hard-coded account id
        return

    def crawler_each_account_video_list(self, account_id, max_behot_time=0):
        """
        fetch one account's video list, paging backwards with max_behot_time as the cursor
        """
        current_cursor = max_behot_time
        has_more = True
        while has_more:
            response = get_toutiao_account_video_list(
                account_id=account_id,
                cookie=cookie,
                max_behot_time=current_cursor,
            )
            if response["message"] != "success":
                print("fetch video list failed for account_id: {}".format(account_id))
                break
            video_list = response["data"]
            has_more = response["has_more"]
            current_cursor = response["next"]["max_behot_time"]
            if not video_list:
                break
            # videos come back newest first, so once the newest item on this
            # page is older than the default cursor, all remaining pages are too
            max_timestamp_in_this_group = video_list[0]["publish_time"]
            if max_timestamp_in_this_group < const.DEFAULT_CURSOR:
                break
            crawler_video_list_bar = tqdm(video_list, desc="crawler videos")
            for video in crawler_video_list_bar:
                crawler_video_list_bar.set_postfix({"video_id": video["id"]})
                self.crawler_each_video(video)
            # throttle between pages
            if has_more:
                time.sleep(const.SLEEP_SECOND)

    def crawler_each_video(self, video_data):
        """
        crawl a single video's metadata and push it through the scrape pipeline
        """
        video_item = Item()
        video_id = video_data["video_id"]
        title = video_data["title"]
        media = video_data["video"]
        url = media["download_addr"]["url_list"][0]
        # add info into item
        video_item.add("content_trace_id", "video{}".format(str_to_md5(str(video_id))))
        video_item.add("url_unique_md5", video_id)
        video_item.add("article_title", title)
        video_item.add("out_account_id", video_data["user"]["user_id"])
        video_item.add("out_account_name", video_data["source"])
        video_item.add("publish_timestamp", video_data["publish_time"])
        video_item.add("platform", "toutiao")
        video_item.add("read_cnt", video_data["read_count"])
        video_item.add("article_url", url)
        video_item.add("source_account", const.NO_SOURCE_ACCOUNT_STATUS)
        video_item.add("crawler_timestamp", int(time.time()))
        # check item before insert
        video_item.check(source="video")
        try:
            # run the scrape pipeline; it returns the item with an OSS path on success
            item_with_oss_path = scrape_video_entities_process(
                video_item=video_item.item,
                db_client=self.db_client,
            )
            if item_with_oss_path:
                insert_into_single_video_source_table(self.db_client, item_with_oss_path)
        except Exception as e:
            print("failed to process video {}: {}".format(video_id, e))

    def deal(self):
        """
        class entrance
        """
        account_id = "MS4wLjABAAAAaQYyWZTkidUVmt1tivdSY5UZdGD02UfW9yRlLfrxNGwVltHSwvInIauOKyra-HCi"
        self.crawler_each_account_video_list(account_id)
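

# Minimal entry point so the module can be run directly. This is a sketch
# only; the original file does not define one, and the task may instead be
# launched by an external scheduler.
if __name__ == "__main__":
    task = CrawlerToutiaoAccountVideos()
    task.deal()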