3 miesięcy temu · b7ce7f7195
--- a/tasks/crawler_accounts.py
+++ b/tasks/crawler_accounts.py
@@ -0,0 +1,69 @@
 
				+"""
			
 
				+@author: luojunhui
			
 
				+"""
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import time
			
 
				+import traceback
			
 
				+
			
 
				+from pymysql.cursors import DictCursor
			
 
				+from tqdm import tqdm
			
 
				+
			
 
				+from applications import log
			
 
				+from applications.const import ToutiaoVideoCrawlerConst
			
 
				+from applications.db import DatabaseConnector
			
 
				+from applications.pipeline import scrape_video_entities_process
			
 
				+from applications.utils import Item
			
 
				+from applications.utils import str_to_md5
			
 
				+from applications.utils import insert_into_single_video_source_table
			
 
				+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
			
 
				+from config import apolloConfig, long_articles_config
			
 
				+
			
 
				+const = ToutiaoVideoCrawlerConst()
			
 
				+config = apolloConfig()
			
 
				+cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
			
 
				+
			
 
				+
			
 
				+class CrawlerAccounts:
			
 
				+    """Shared base: builds a DatabaseConnector from long_articles_config and connects it."""
			
 
				+    def __init__(self):
			
 
				+        self.db_client = DatabaseConnector(db_config=long_articles_config)
			
 
				+        self.db_client.connect()
			
 
				+
			
 
				+
			
 
				+class ChannelAccountCrawler(CrawlerAccounts):
			
 
				+    """
			
 
				+    Crawl channel accounts.
			
 
				+
			
 
				+    Strategy noted by the author (only step 1 is implemented in this class):
			
 
				+        1. try to get search keys and titles from database
			
 
				+        2. try to get hot_points from web
			
 
				+        3. use search api to get accounts
			
 
				+    """
			
 
				+
			
 
				+    def get_seed_keys(self):
			
 
				+        """
			
 
				+        Return the connector's result for the first 100 rows of datastat_sort_strategy.
			
 
				+        """
			
 
				+        seed_query = "select * from datastat_sort_strategy limit 100;"
			
 
				+        return self.db_client.fetch(seed_query)
			
 
				+
			
 
				+
			
 
				+class ToutiaoAccountCrawler(CrawlerAccounts):
			
 
				+    """Toutiao crawler: loads seed videos from the database and queries the crawler helper."""
			
 
				+    def get_seed_videos(self):
			
 
				+        """Fetch up to 100 top-scored toutiao seed videos via DictCursor."""
			
 
				+        fetch_query = """
			
 
				+            select article_title, url_unique_md5 
			
 
				+            from publish_single_video_source
			
 
				+            where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
			
 
				+            order by score desc limit 100;
			
 
				+        """
			
 
				+        return self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
			
 
				+
			
 
				+    def get_recommend_videos(self, seed_video_id: str):
			
 
				+        """Return the helper's video list for seed_video_id, using the module-level cookie."""
			
 
				+        return get_toutiao_account_video_list(seed_video_id, cookie)
			
 
				+
			
 
				+
			
--- a/tasks/crawler_channel_accounts.py
+++ b/tasks/crawler_channel_accounts.py
@@ -1,42 +0,0 @@
 
				-"""
			
 
				-@author: luojunhui
			
 
				-@description: crawler channel accounts
			
 
				-"""
			
 
				-
			
 
				-from applications.api import WechatChannelAPI
			
 
				-from applications.db import DatabaseConnector
			
 
				-from applications.pipeline import scrape_account_entities_process
			
 
				-from applications.utils import Item
			
 
				-from applications.utils import insert_into_video_meta_accounts_table
			
 
				-from config import long_articles_config
			
 
				-
			
 
				-class ChannelAccountCrawler:
			
 
				-    """
			
 
				-    crawler channel accounts
			
 
				-    strategy:
			
 
				-        1. try to get search keys and titles from database
			
 
				-        2. try to get hot_points from web
			
 
				-        2. use search api to get accounts
			
 
				-    """
			
 
				-    def __init__(self):
			
 
				-        self.db_client = DatabaseConnector(db_config=long_articles_config)
			
 
				-        self.db_client.connect()
			
 
				-
			
 
				-    def get_seed_keys_from_db(self):
			
 
				-        """
			
 
				-        get search keys from database
			
 
				-        """
			
 
				-        sql = "select * from datastat_sort_strategy limit 100;"
			
 
				-        result = self.db_client.fetch(sql)
			
 
				-        return result
			
 
				-
			
 
				-
			
 
				-CA = ChannelAccountCrawler()
			
 
				-result_list = CA.get_seed_keys_from_db()
			
 
				-for item in result_list:
			
 
				-    print(item)
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-