
Merge branch '2025-01-10-toutiao-recommend-crawler-v2' of luojunhui/LongArticlesJob into master

luojunhui, 3 months ago
Parent
Commit
30aecfe200

+ 9 - 6
coldStartTasks/crawler/toutiao_get_bogus.py

@@ -14,6 +14,9 @@ from config import long_articles_config
 
 
 class ToutiaoBogus:
+    """
+    Fetch Toutiao request parameters
+    """
     def __init__(self):
         """
         Initialize an instance of the ToutiaoBogus class.
@@ -28,12 +31,12 @@ class ToutiaoBogus:
 
     def on_request(self, request, category):
         if "https://www.toutiao.com/api/pc/list/feed?" in request.url:
-            request_info = {
-                'method': request.method,
-                'url': request.url,
-                'headers': request.headers if request.headers else {},
-                'postData': request.post_data if request.post_data else {}
-            }
+            # request_info = {
+            #     'method': request.method,
+            #     'url': request.url,
+            #     'headers': request.headers if request.headers else {},
+            #     'postData': request.post_data if request.post_data else {}
+            # }
             insert_sql = f"""
                 INSERT INTO toutiao_request_params
                 (request_method, request_url, request_headers, post_data, category) 
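
The on_request hook above follows the Playwright network-event callback style (request.method, request.url, request.headers, request.post_data), with the feed category threaded in as a second argument. As a rough sketch only, assuming the Playwright sync API and a hypothetical save_request stand-in for the real INSERT INTO toutiao_request_params, the per-category wiring could look like this:

# Rough sketch, not part of this commit. Assumes the Playwright sync API;
# save_request is a hypothetical placeholder for the real SQL insert.
from playwright.sync_api import sync_playwright

FEED_URL_PREFIX = "https://www.toutiao.com/api/pc/list/feed?"


def save_request(request, category):
    # Placeholder: the real code stores method/url/headers/post_data in
    # toutiao_request_params together with the category.
    print(category, request.method, request.url)


def capture_feed_request(category):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        def on_request(request):
            # Only the PC feed endpoint carries the signed parameters we need
            if FEED_URL_PREFIX in request.url:
                save_request(request, category)

        page.on("request", on_request)
        page.goto("https://www.toutiao.com/")
        page.wait_for_timeout(5000)  # give the feed XHR time to fire
        browser.close()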

+ 37 - 26
coldStartTasks/crawler/toutiao_recommend_crawler.py

@@ -18,10 +18,12 @@ from config import long_articles_config
 
 functions = Functions()
 
+
 class ToutiaoRecommendCrawler(object):
     """
     Toutiao recommendation feed
     """
+
     def __init__(self) -> None:
         self.db_client = None
 
@@ -42,18 +44,18 @@ class ToutiaoRecommendCrawler(object):
                 }
             )
 
-    def get_history_recommendation(self) -> Dict:
+    def get_request_params(self, category) -> Dict:
         """
-        Fetch history recommendation feed articles
+        Fetch request parameters
         :return:
         """
         select_sql = f"""
-            SELECT request_method, request_url, request_headers, post_data
-            FROM toutiao_request_params
-            WHERE category = 'history' and expire_flag = 0
-            ORDER BY id
-            LIMIT 1;
-        """
+                    SELECT request_method, request_url, request_headers, post_data
+                    FROM toutiao_request_params
+                    WHERE category = '{category}' and expire_flag = 0
+                    ORDER BY id
+                    LIMIT 1;
+                """
         result = self.db_client.fetch(
             query=select_sql,
             cursor_type=DictCursor
@@ -61,26 +63,32 @@ class ToutiaoRecommendCrawler(object):
         if not result:
             print("cookie没了报警")
             return {}
-        cookie_obj = result[0]
+        else:
+            return result[0]
+
+    def get_recommendation_article_list(self, category) -> Dict:
+        """
+        Fetch recommendation feed articles for the given category
+        :return:
+        """
+        cookie_obj = self.get_request_params(category)
+        if not cookie_obj:
+            return {}
         response = requests.request(
             method=cookie_obj['request_method'],
             url=cookie_obj['request_url'],
             headers=json.loads(cookie_obj['request_headers']),
             proxies=functions.proxy()
         )
+        if response.text is None:
+            print("{}: cookie  失效".format(category))
         return response.json()
 
-    def get_tech_recommendation(self) -> Dict:
-        """
-        获取科技推荐流文章
-        :return:
-        """
-        return
-
-    def insert_each_article(self, item: Dict) -> Dict:
+    def insert_each_article(self, category, item: Dict) -> None:
         """
         Extract and insert article information
-        :param article_info:
+        :param item
+        :param category
         :return:
         """
         item_id = item.get('item_id')
@@ -103,7 +111,7 @@ class ToutiaoRecommendCrawler(object):
             params=(
                 "toutiao",
                 "recommend",
-                "history",
+                category,
                 user_id,
                 title,
                 article_url,
@@ -117,10 +125,11 @@ class ToutiaoRecommendCrawler(object):
             )
         )
 
-    def process_recommendation(self, recommendation) -> Dict:
+    def process_recommendation(self, category, recommendation) -> Dict:
         """
         Process recommendation feed articles
-        :param recommendation:
+        :param recommendation
+        :param category
         :return:
         """
         for item in tqdm(recommendation['data']):
@@ -128,13 +137,14 @@ class ToutiaoRecommendCrawler(object):
                 video_flag = item.get('has_video')
                 if not video_flag:
                     try:
-                        self.insert_each_article(item)
+                        self.insert_each_article(category=category, item=item)
                     except Exception as e:
                         error_data = {
                             "error": str(e),
                             "error_stack": traceback.format_exc()
                         }
                         log(
+                            function='toutiao_recommend_crawler',
                             task='toutiao_recommend',
                             message='Toutiao recommendation feed article insert failed',
                             data=error_data,
@@ -144,16 +154,16 @@ class ToutiaoRecommendCrawler(object):
                     print("视频文章跳过")
             else:
                 print("无链接文章跳过")
-    
-    def run(self) -> None:
+
+    def run(self, category) -> None:
         """
         Main entry point
         :return:
         """
         for i in range(10):
             try:
-                article_list = self.get_history_recommendation()
-                self.process_recommendation(article_list)
+                article_list = self.get_recommendation_article_list(category=category)
+                self.process_recommendation(category=category, recommendation=article_list)
                 time.sleep(3)
             except Exception as e:
                 error_data = {
@@ -161,3 +171,4 @@ class ToutiaoRecommendCrawler(object):
                     "error_stack": traceback.format_exc()
                 }
                 print(error_data)
+
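
Two details in the new crawler code are worth flagging. get_request_params builds its WHERE clause by f-string interpolation of category, where a parameterized query is the safer pattern; and requests never sets response.text to None, so the `if response.text is None` check cannot fire, while inspecting the decoded payload for a data field is a more reliable expiry signal. A rough sketch of both ideas, assuming db_client.fetch accepts a params argument (not shown in this diff) and reusing DictCursor, json, requests, and functions from the module's existing imports:

# Rough sketch, not the committed code. Assumes db_client.fetch(query, params=...)
# exists; DictCursor, json, requests, and functions come from the module's imports.
def get_request_params(self, category):
    select_sql = """
        SELECT request_method, request_url, request_headers, post_data
        FROM toutiao_request_params
        WHERE category = %s AND expire_flag = 0
        ORDER BY id
        LIMIT 1;
    """
    result = self.db_client.fetch(
        query=select_sql,
        params=(category,),  # parameterized instead of f-string interpolation
        cursor_type=DictCursor
    )
    return result[0] if result else {}


def get_recommendation_article_list(self, category):
    cookie_obj = self.get_request_params(category)
    if not cookie_obj:
        return {}
    response = requests.request(
        method=cookie_obj['request_method'],
        url=cookie_obj['request_url'],
        headers=json.loads(cookie_obj['request_headers']),
        proxies=functions.proxy()
    )
    payload = response.json()
    if not payload.get('data'):
        # An empty feed usually means the stored cookie has expired
        print("{}: cookie expired".format(category))
    return payload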

+ 7 - 1
run_toutiao_recommend.py

@@ -1,7 +1,13 @@
+"""
+@author: luojunhui
+@description: Toutiao recommendation feed article crawling task
+"""
 from coldStartTasks.crawler.toutiao_recommend_crawler import ToutiaoRecommendCrawler
 
 
 if __name__ == "__main__":
     toutiao_recommend_crawler = ToutiaoRecommendCrawler()
     toutiao_recommend_crawler.init_database()
-    toutiao_recommend_crawler.run()
+    category_list = ['finance', 'tech', 'history', 'entertainment']
+    for category in category_list:
+        toutiao_recommend_crawler.run(category=category)

+ 16 - 0
sh/run_toutiao_recommend.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Get the current date in YYYY-MM-DD format
+CURRENT_DATE=$(date +%F)
+
+# Log file path, including the date
+LOG_FILE="/root/luojunhui/logs/toutiao_crawler_task_log_$CURRENT_DATE.txt"
+
+# Redirect all script output to the dated log file
+exec >> "$LOG_FILE" 2>&1
+
+cd /root/luojunhui/LongArticlesJob
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate tasks
+
+nohup python3 run_toutiao_recommend.py >> "${LOG_FILE}" 2>&1 &