|
@@ -0,0 +1,69 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications import log
|
|
|
+from applications.const import ToutiaoVideoCrawlerConst
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from applications.pipeline import scrape_video_entities_process
|
|
|
+from applications.utils import Item
|
|
|
+from applications.utils import str_to_md5
|
|
|
+from applications.utils import insert_into_single_video_source_table
|
|
|
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
|
|
|
+from config import apolloConfig, long_articles_config
|
|
|
+
|
|
|
# Module-level objects shared by the crawlers below; built once at import time.
const = ToutiaoVideoCrawlerConst()
config = apolloConfig()
# Cookie string looked up from the Apollo config; it is passed along with every
# call to get_toutiao_account_video_list.
cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
|
|
|
+
|
|
|
+
|
|
|
class CrawlerAccounts:
    """Common base for the account crawlers: owns the database client."""

    def __init__(self):
        """Create a connector from ``long_articles_config`` and connect it."""
        connector = DatabaseConnector(db_config=long_articles_config)
        connector.connect()
        # Subclasses run their queries through this client.
        self.db_client = connector
|
|
|
+
|
|
|
+
|
|
|
class ChannelAccountCrawler(CrawlerAccounts):
    """
    Crawl channel accounts.

    Strategy:
        1. try to get search keys and titles from the database
        2. try to get hot points from the web
        3. use the search API to get accounts

    Only the first step (``get_seed_keys``) is implemented in this class so far.
    """

    def get_seed_keys(self):
        """
        Get search keys from the database.

        Runs a fixed query for at most 100 rows of ``datastat_sort_strategy``
        and returns whatever ``self.db_client.fetch`` yields for it.
        """
        seed_key_query = "select * from datastat_sort_strategy limit 100;"
        return self.db_client.fetch(seed_key_query)
|
|
|
+
|
|
|
+
|
|
|
class ToutiaoAccountCrawler(CrawlerAccounts):
    """Crawler that works from Toutiao seed videos stored in the database."""

    def get_seed_videos(self):
        """
        Fetch the seed videos used as starting points.

        Selects ``article_title`` and ``url_unique_md5`` for up to 100 rows of
        ``publish_single_video_source`` where the platform is 'toutiao',
        ``video_pool_audit_status`` is 1 and ``bad_status`` is 0, ordered by
        ``score`` descending.

        Returns:
            The result of ``self.db_client.fetch``. ``DictCursor`` is
            requested, so rows are presumably dicts keyed by column name —
            confirm against ``DatabaseConnector.fetch``.
        """
        # Plain string literal: the query has no placeholders, so the
        # original f-prefix was unnecessary.
        fetch_query = """
            select article_title, url_unique_md5
            from publish_single_video_source
            where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
            order by score desc limit 100;
        """
        return self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)

    def get_recommend_videos(self, seed_video_id: str):
        """
        Get the recommended videos for one seed video.

        Args:
            seed_video_id: identifier forwarded unchanged to
                ``get_toutiao_account_video_list``.

        Returns:
            Whatever ``get_toutiao_account_video_list`` returns for the given
            id and the module-level ``cookie``.
        """
        # NOTE(review): the helper's name suggests it lists an account's
        # videos; confirm that it accepts a video id as its first argument.
        return get_toutiao_account_video_list(seed_video_id, cookie)
|
|
|
+
|
|
|
+
|