Przeglądaj źródła

develop publish_single_video_pool_videos.py

luojunhui 3 miesięcy temu
rodzic
commit
b7ce7f7195
2 zmienionych plików z 69 dodań i 42 usunięć
  1. 69 0
      tasks/crawler_accounts.py
  2. 0 42
      tasks/crawler_channel_accounts.py

+ 69 - 0
tasks/crawler_accounts.py

@@ -0,0 +1,69 @@
+"""
+@author: luojunhui
+"""
+
+from __future__ import annotations
+
+import time
+import traceback
+
+from pymysql.cursors import DictCursor
+from tqdm import tqdm
+
+from applications import log
+from applications.const import ToutiaoVideoCrawlerConst
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_video_entities_process
+from applications.utils import Item
+from applications.utils import str_to_md5
+from applications.utils import insert_into_single_video_source_table
+from coldStartTasks.crawler.toutiao import get_toutiao_account_video_list
+from config import apolloConfig, long_articles_config
+
+const = ToutiaoVideoCrawlerConst()
+config = apolloConfig()
+cookie = config.getConfigValue("toutiao_detail_recommend_cookie")
+
+
+class CrawlerAccounts:
+
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+
+class ChannelAccountCrawler(CrawlerAccounts):
+    """
+    crawl channel accounts
+    strategy:
+        1. try to get search keys and titles from database
+        2. try to get hot_points from web
+        3. use search api to get accounts
+    """
+
+    def get_seed_keys(self):
+        """
+        get search keys from database
+        """
+        sql = "select * from datastat_sort_strategy limit 100;"
+        result = self.db_client.fetch(sql)
+        return result
+
+
+class ToutiaoAccountCrawler(CrawlerAccounts):
+    def get_seed_videos(self):
+        fetch_query = f"""
+            select article_title, url_unique_md5 
+            from publish_single_video_source
+            where platform = 'toutiao' and video_pool_audit_status = 1 and bad_status = 0
+            order by score desc limit 100;
+        """
+        seed_video_list = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
+        return seed_video_list
+
+    def get_recommend_videos(self, seed_video_id: str):
+        # get recommended videos for the given seed video
+        recommend_video_list = get_toutiao_account_video_list(seed_video_id, cookie)
+        return recommend_video_list
+
+

+ 0 - 42
tasks/crawler_channel_accounts.py

@@ -1,42 +0,0 @@
-"""
-@author: luojunhui
-@description: crawler channel accounts
-"""
-
-from applications.api import WechatChannelAPI
-from applications.db import DatabaseConnector
-from applications.pipeline import scrape_account_entities_process
-from applications.utils import Item
-from applications.utils import insert_into_video_meta_accounts_table
-from config import long_articles_config
-
-class ChannelAccountCrawler:
-    """
-    crawler channel accounts
-    strategy:
-        1. try to get search keys and titles from database
-        2. try to get hot_points from web
-        2. use search api to get accounts
-    """
-    def __init__(self):
-        self.db_client = DatabaseConnector(db_config=long_articles_config)
-        self.db_client.connect()
-
-    def get_seed_keys_from_db(self):
-        """
-        get search keys from database
-        """
-        sql = "select * from datastat_sort_strategy limit 100;"
-        result = self.db_client.fetch(sql)
-        return result
-
-
-CA = ChannelAccountCrawler()
-result_list = CA.get_seed_keys_from_db()
-for item in result_list:
-    print(item)
-
-
-
-
-