# crawler_accounts.py
  1. """
  2. @author: luojunhui
  3. """
  4. from __future__ import annotations
  5. import time
  6. import traceback
  7. from pymysql.cursors import DictCursor
  8. from tqdm import tqdm
  9. from applications import log
  10. from applications.const import ToutiaoVideoCrawlerConst
  11. from applications.db import DatabaseConnector
  12. from applications.pipeline import scrape_video_entities_process
  13. from applications.utils import Item
  14. from applications.utils import str_to_md5
  15. from applications.utils import insert_into_single_video_source_table
  16. from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
  17. from config import apolloConfig, long_articles_config
# Shared instance of the Toutiao video crawler constants.
const = ToutiaoVideoCrawlerConst()
# Apollo configuration client; used below to look up the crawler cookie.
config = apolloConfig()
# Cookie stored under the "toutiao_detail_recommend_cookie" config key.
# Read once, when this module is imported.
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
  21. class CrawlerAccounts:
  22. def __init__(self):
  23. self.db_client = DatabaseConnector(db_config=long_articles_config)
  24. self.db_client.connect()
  25. class ChannelAccountCrawler(CrawlerAccounts):
  26. """
  27. crawler channel accounts
  28. strategy:
  29. 1. try to get search keys and titles from database
  30. 2. try to get hot_points from web
  31. 2. use search api to get accounts
  32. """
  33. def get_seed_keys(self):
  34. """
  35. get search keys from database
  36. """
  37. sql = "select * from datastat_sort_strategy limit 100;"
  38. result = self.db_client.fetch(sql)
  39. return result
  40. class ToutiaoAccountCrawler(CrawlerAccounts):
  41. def get_seed_videos(self):
  42. fetch_query = f"""
  43. select article_title, url_unique_md5
  44. from publish_single_video_source
  45. where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
  46. order by score desc limit 100;
  47. """
  48. seed_video_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
  49. return seed_video_list
  50. def get_recommend_videos(self, seed_video_id: str):
  51. # get recommend videos for each video
  52. recommend_video_list = get_toutiao_account_video_list(seed_video_id, cookie)
  53. return recommend_video_list