# crawler_toutiao_accounts.py (5.3 KB)
"""
@author: luojunhui
@description: crawl Toutiao account videos via associated recommendations
"""
  5. from __future__ import annotations
  6. import time
  7. import traceback
  8. from pymysql.cursors import DictCursor
  9. from tqdm import tqdm
  10. from applications import log
  11. from applications.const import ToutiaoVideoCrawlerConst
  12. from applications.db import DatabaseConnector
  13. from coldStartTasks.crawler.toutiao import get_associated_recommendation
  14. from config import apolloConfig, long_articles_config
  15. const = ToutiaoVideoCrawlerConst()
  16. config = apolloConfig()
  17. cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
  18. class CrawlerToutiaoAccounts:
  19. """
  20. toutiao blogger crawler
  21. """
  22. def __init__(self):
  23. self.db_client = DatabaseConnector(db_config=long_articles_config)
  24. self.db_client.connect()
  25. def get_seed_video_with_strategy(self, strategy: str = 'basic'):
  26. """
  27. 采用策略获取种子视频
  28. """
  29. match strategy:
  30. case 'basic':
  31. sql = "select id, article_title, out_account_name, url_unique_md5 from publish_single_video_source where platform = 'toutiao' and audit_status = 1 and bad_status = 0;"
  32. seed_video_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
  33. return seed_video_list
  34. case _:
  35. return []
  36. def get_exist_account_list(self) -> set:
  37. """
  38. get already exist account list
  39. """
  40. sql = f"""
  41. select account_id
  42. from video_meta_accounts
  43. where platform = 'toutiao' and status = {const.TOUTIAO_ACCOUNT_GOOD_STATUS};
  44. """
  45. account_list = self.db_client.fetch(query=sql, cursor_type=DictCursor)
  46. account_id_set = set(
  47. [i['account_id'] for i in account_list]
  48. )
  49. return account_id_set
  50. def insert_video_into_video_association_table(self, video_info: dict):
  51. """
  52. insert video into video_association table
  53. """
  54. select_sql = f"""select id from video_association where recommend_video_id = %s"""
  55. video_id = self.db_client.fetch(query=select_sql, params=(video_info['recommend_video_id'],), cursor_type=DictCursor)
  56. if video_id:
  57. print(f'duplicated video id: {video_id}')
  58. return
  59. sql = f"""
  60. insert into video_association
  61. (account_name, account_id, recommend_video_id, title, read_cnt, duration, seed_account, seed_title, recommend_date, platform)
  62. values
  63. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  64. """
  65. self.db_client.save(
  66. query=sql,
  67. params=(
  68. video_info['author'],
  69. video_info['account_id'],
  70. video_info['recommend_video_id'],
  71. video_info['title'],
  72. video_info['read_cnt'],
  73. video_info['duration'],
  74. video_info['seed_account'],
  75. video_info['seed_title'],
  76. video_info['recommend_date'],
  77. video_info['platform'],
  78. )
  79. )
  80. def get_recommend_video_list(self, seed_video: dict, exist_account_set: set):
  81. """
  82. group_id: toutiao group id
  83. """
  84. group_id = seed_video['url_unique_md5']
  85. seed_title = seed_video['article_title']
  86. seed_account = seed_video['out_account_name']
  87. response = get_associated_recommendation(article_id=group_id, cookie=cookie)
  88. recommend_video_list = response['data']
  89. recommend_video_list_bar = tqdm(recommend_video_list, desc="get recommend video list")
  90. for video in recommend_video_list_bar:
  91. try:
  92. account_info = video["user_info"]
  93. author = account_info["name"]
  94. account_id = account_info["user_id"]
  95. if account_id in exist_account_set:
  96. print("exists account:\t", author, )
  97. continue
  98. video_obj = {
  99. "author": author,
  100. "account_id": account_id,
  101. "title": video["title"],
  102. "read_cnt": video.get("read_count", 0),
  103. "duration": video.get("video_duration", 0),
  104. "recommend_video_id": video["group_id"],
  105. "seed_account": seed_account,
  106. "seed_title": seed_title,
  107. "recommend_date": time.strftime("%Y-%m-%d", time.localtime()),
  108. "platform": 'toutiao'
  109. }
  110. self.insert_video_into_video_association_table(video_obj)
  111. except Exception as e:
  112. print(e)
  113. def deal(self) -> None:
  114. """
  115. class entrance
  116. """
  117. # get exist account id set
  118. exist_account_id_set = self.get_exist_account_list()
  119. # get seed video list
  120. seed_video_list = self.get_seed_video_with_strategy()
  121. seed_video_list_bar = tqdm(seed_video_list, desc="get recommend video list")
  122. for video in seed_video_list_bar:
  123. seed_title = video['article_title']
  124. seed_video_list_bar.set_postfix({"seed_title": seed_title})
  125. try:
  126. self.get_recommend_video_list(
  127. seed_video=video,
  128. exist_account_set=exist_account_id_set
  129. )
  130. except Exception as e:
  131. print(e)