
Add Toutiao account search mode

luojunhui, 1 month ago
commit fae3665e9e

+ 17 - 1
applications/pipeline/crawler_pipeline.py

@@ -58,7 +58,7 @@ class CrawlerPipeline(AsyncApolloApi):
         sql = f"INSERT INTO `{table_name}` ({columns}) VALUES ({placeholders})"
         await self.pool.async_save(sql, tuple(insert_data.values()))
 
-    async def save_item_to_database(self, media_type: str, item: dict):
+    async def save_item_to_database(self, media_type: str, item: dict, trace_id: str):
         """deal function"""
         match media_type:
             case "video":
@@ -101,6 +101,14 @@ class CrawlerPipeline(AsyncApolloApi):
                     item["title_sensitive"] = 1
 
                 await self.save_single_record(media_type, item)
+                await self.log_client.log(
+                    contents={
+                        "trace_id": trace_id,
+                        "function": "save_article",
+                        "data": item,
+                        "message": "save article successfully",
+                    }
+                )
 
             case "account":
                 if await self.whether_account_exist(
@@ -109,6 +117,14 @@ class CrawlerPipeline(AsyncApolloApi):
                     return
 
                 await self.save_single_record(media_type, item)
+                await self.log_client.log(
+                    contents={
+                        "trace_id": trace_id,
+                        "function": "save_account",
+                        "data": item,
+                        "message": "save account successfully",
+                    }
+                )
 
             case _:
                 raise Exception("Unknown media type")
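
For reference, a minimal caller-side sketch of the new three-argument save_item_to_database; FakePipeline and the trace id format are illustrative stand-ins, not part of the commit:

    import asyncio
    import uuid

    class FakePipeline:
        # stand-in mirroring only the new CrawlerPipeline signature
        async def save_item_to_database(self, media_type: str, item: dict, trace_id: str):
            print(media_type, item, trace_id)

    async def main():
        trace_id = f"toutiao-{uuid.uuid4().hex}"  # hypothetical trace id format
        await FakePipeline().save_item_to_database(
            media_type="article",
            item={"title": "example"},
            trace_id=trace_id,
        )

    asyncio.run(main())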

+ 18 - 23
applications/tasks/crawler_tasks/crawler_toutiao.py

@@ -182,7 +182,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
             else:
                 break
 
-    async def crawler_each_account(self, account_name, account_id, media_type):
+    async def crawler_each_account(self, account_name, account_id, media_type, cookie):
         """
         get toutiao account info
         """
@@ -193,17 +193,19 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
             "crawler_date": datetime.now().strftime("%Y-%m-%d"),
             "media_type": media_type,
         }
-        await self.log_client.log(
-            contents={
-                "task": "crawler_toutiao",
-                "function": "crawler_each_account",
-                "trace_id": self.trace_id,
-                "message": "抓取账号成功",
-                "status": "success",
-                "data": new_account_item,
-            }
+        # fetch the account's recent article titles
+        response = await get_toutiao_account_info_list(
+            account_id=account_id,
+            cookie=cookie,
+            media_type="article",
         )
-        await self.save_item_to_database(media_type="account", item=new_account_item)
+        if not response:
+            return
+
+        article_raw_data = response["data"]
+        title_list = [i["title"] for i in article_raw_data]
+        new_account_item["title_list"] = json.dumps(title_list, ensure_ascii=False)
+        await self.save_item_to_database(media_type="account", item=new_account_item, trace_id=self.trace_id)
 
     async def crawler_each_article(self, method, article_raw_data, category=None):
         """
@@ -240,17 +242,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
                 }
             case _:
                 raise Exception(f"unknown method: {method}")
-        await self.log_client.log(
-            contents={
-                "task": "crawler_toutiao",
-                "function": "crawler_each_article",
-                "trace_id": self.trace_id,
-                "message": "抓取文章成功",
-                "status": "success",
-                "data": new_article_item,
-            }
-        )
-        await self.save_item_to_database(media_type="article", item=new_article_item)
+        await self.save_item_to_database(media_type="article", item=new_article_item, trace_id=self.trace_id)
 
     async def crawler_each_video(self, video_raw_data):
         pass
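
The crawler_each_account hunk above assumes the info-list response carries a "data" list of article dicts; a standalone sketch of the title_list round-trip under that assumption:

    import json

    # shape assumed from the diff: response["data"] is a list of article dicts
    response = {"data": [{"title": "标题一"}, {"title": "标题二"}]}
    title_list = [article["title"] for article in response["data"]]

    # ensure_ascii=False keeps Chinese titles readable in the stored column
    stored = json.dumps(title_list, ensure_ascii=False)
    assert json.loads(stored) == title_list
    print(stored)  # ["标题一", "标题二"]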
@@ -416,6 +408,9 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
     # crawl candidate accounts via search
     async def search_candidate_accounts(self):
         top_title_list = await get_top_article_title_list(pool=self.pool)
+        cookie = await self.get_config_value(
+            key="toutiao_blogger_cookie", output_type="string"
+        )
         for article in top_title_list:
             title = article["title"]
             try:
@@ -446,7 +441,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
                         )
                         if account_id:
                             await self.crawler_each_account(
-                                account_name, account_id, self.ARTICLE_TYPE
+                                account_name, account_id, self.ARTICLE_TYPE, cookie
                             )
 
                         await asyncio.sleep(1)