
Add Toutiao account search mode

luojunhui, 1 month ago
commit fae3665e9e

+ 17 - 1
applications/pipeline/crawler_pipeline.py

@@ -58,7 +58,7 @@ class CrawlerPipeline(AsyncApolloApi):
         sql = f"INSERT INTO `{table_name}` ({columns}) VALUES ({placeholders})"
         await self.pool.async_save(sql, tuple(insert_data.values()))
 
-    async def save_item_to_database(self, media_type: str, item: dict):
+    async def save_item_to_database(self, media_type: str, item: dict, trace_id: str):
         """deal function"""
         match media_type:
             case "video":
@@ -101,6 +101,14 @@ class CrawlerPipeline(AsyncApolloApi):
                     item["title_sensitive"] = 1
 
                 await self.save_single_record(media_type, item)
+                await self.log_client.log(
+                    contents={
+                        "trace_id": trace_id,
+                        "function": "save_article",
+                        "data": item,
+                        "message": "save article successfully",
+                    }
+                )
 
             case "account":
                 if await self.whether_account_exist(
@@ -109,6 +117,14 @@ class CrawlerPipeline(AsyncApolloApi):
                     return
 
                 await self.save_single_record(media_type, item)
+                await self.log_client.log(
+                    contents={
+                        "trace_id": trace_id,
+                        "function": "save_account",
+                        "data": item,
+                        "message": "save account successfully",
+                    }
+                )
 
             case _:
                 raise Exception("Unknown media type")
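
For reference, a minimal caller-side sketch of the new three-argument save_item_to_database; FakePipeline and the trace id format are illustrative stand-ins, not part of the commit:

    import asyncio
    import uuid

    class FakePipeline:
        # stand-in mirroring only the new CrawlerPipeline signature
        async def save_item_to_database(self, media_type: str, item: dict, trace_id: str):
            print(media_type, item, trace_id)

    async def main():
        trace_id = f"toutiao-{uuid.uuid4().hex}"  # hypothetical trace id format
        await FakePipeline().save_item_to_database(
            media_type="article",
            item={"title": "example"},
            trace_id=trace_id,
        )

    asyncio.run(main())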

+ 18 - 23
applications/tasks/crawler_tasks/crawler_toutiao.py

@@ -182,7 +182,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
             else:
                 break
 
-    async def crawler_each_account(self, account_name, account_id, media_type):
+    async def crawler_each_account(self, account_name, account_id, media_type, cookie):
         """
         get toutiao account info
         """
@@ -193,17 +193,19 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
             "crawler_date": datetime.now().strftime("%Y-%m-%d"),
             "media_type": media_type,
         }
-        await self.log_client.log(
-            contents={
-                "task": "crawler_toutiao",
-                "function": "crawler_each_account",
-                "trace_id": self.trace_id,
-                "message": "抓取账号成功",
-                "status": "success",
-                "data": new_account_item,
-            }
+        # fetch the account's recent article titles
+        response = await get_toutiao_account_info_list(
+            account_id=account_id,
+            cookie=cookie,
+            media_type="article",
         )
-        await self.save_item_to_database(media_type="account", item=new_account_item)
+        if not response:
+            return
+
+        article_raw_data = response["data"]
+        title_list = [i["title"] for i in article_raw_data]
+        new_account_item["title_list"] = json.dumps(title_list, ensure_ascii=False)
+        await self.save_item_to_database(media_type="account", item=new_account_item, trace_id=self.trace_id)
 
     async def crawler_each_article(self, method, article_raw_data, category=None):
         """
@@ -240,17 +242,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
                 }
             case _:
                 raise Exception(f"unknown method: {method}")
-        await self.log_client.log(
-            contents={
-                "task": "crawler_toutiao",
-                "function": "crawler_each_article",
-                "trace_id": self.trace_id,
-                "message": "抓取文章成功",
-                "status": "success",
-                "data": new_article_item,
-            }
-        )
-        await self.save_item_to_database(media_type="article", item=new_article_item)
+        await self.save_item_to_database(media_type="article", item=new_article_item, trace_id=self.trace_id)
 
     async def crawler_each_video(self, video_raw_data):
         pass
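
The crawler_each_account hunk above assumes the info-list response carries a "data" list of article dicts; a standalone sketch of the title_list round-trip under that assumption:

    import json

    # shape assumed from the diff: response["data"] is a list of article dicts
    response = {"data": [{"title": "标题一"}, {"title": "标题二"}]}
    title_list = [article["title"] for article in response["data"]]

    # ensure_ascii=False keeps Chinese titles readable in the stored column
    stored = json.dumps(title_list, ensure_ascii=False)
    assert json.loads(stored) == title_list
    print(stored)  # ["标题一", "标题二"]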
@@ -416,6 +408,9 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
     # crawl candidate accounts via search
     async def search_candidate_accounts(self):
         top_title_list = await get_top_article_title_list(pool=self.pool)
+        cookie = await self.get_config_value(
+            key="toutiao_blogger_cookie", output_type="string"
+        )
         for article in top_title_list:
             title = article["title"]
             try:
@@ -446,7 +441,7 @@ class CrawlerToutiao(CrawlerPipeline, CrawlerToutiaoConst):
                         )
                         if account_id:
                             await self.crawler_each_account(
-                                account_name, account_id, self.ARTICLE_TYPE
+                                account_name, account_id, self.ARTICLE_TYPE, cookie
                             )
 
                         await asyncio.sleep(1)