
Xiaoniangao accounts: change data source retrieval

zhangliang 1 month ago
parent commit 2c2d0d039d

+ 2 - 2
config/spiders_config.yaml

@@ -67,8 +67,8 @@ xiaoniangaoauthor:
      account_id: "{{uid}}" # uid from the database
   loop_times: 100
   loop_interval:
-    min: 30
-    max: 60
+    min: 5
+    max: 20
   feishu_sheetid: "golXy9"
   response_parse:
    uid: "$.uid" # uid from the database
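
Halving the polling gap is the whole point of this hunk. As a minimal sketch of how a loop_interval block like this one is typically consumed (the run loop and crawl_once step are assumptions, not code from this repo):

import asyncio
import random

async def run(spider, loop_times: int, interval: dict) -> None:
    # Sleep a random 5-20 s between loops (previously 30-60 s).
    for _ in range(loop_times):
        await spider.crawl_once()  # hypothetical per-loop crawl step
        await asyncio.sleep(random.uniform(interval["min"], interval["max"]))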

+ 1 - 1
services/async_mysql_service.py

@@ -161,7 +161,7 @@ class AsyncMysqlService:
         return result["cnt"] if result else 0
 
     async def get_xng_mid(self) -> int:
-        sql = """select DISTINCT(uid) from xng_uid ORDER BY `data_time` DESC;"""
+        sql = """select link from crawler_user_v3 where task_id=21;"""
         result = await self.fetch_all(sql)
         return result if result else 0
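
Assuming fetch_all() maps each row to a dict keyed by column name, callers of get_xng_mid now receive link values instead of uid values; note that the -> int annotation no longer describes the row list the method actually returns. An illustration with made-up values:

# Hypothetical result shape after this change (example values only):
rows = [{"link": "268712345"}, {"link": "268754321"}]
# The old query yielded [{"uid": ...}] rows; downstream code must now read
# row["link"], which is exactly what the xiaoniangao_author change at the
# bottom of this commit does.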
 

+ 15 - 2
services/pipeline.py

@@ -28,11 +28,12 @@ class PiaoQuanPipeline:
         self.mysql = AsyncMysqlService(platform=platform, mode=mode)
         self.logger = LoggerManager.get_logger(platform=platform, mode=mode)
         self.aliyun_log = LoggerManager.get_aliyun_logger(platform=platform, mode=mode)
+        self.feishu_spreadsheet_token = "KsoMsyP2ghleM9tzBfmcEEXBnXg"
 
     async def feishu_time_list(self):
         async with FeishuDataAsync() as feishu_data:
             summary = await feishu_data.get_values(
-                spreadsheet_token="KsoMsyP2ghleM9tzBfmcEEXBnXg",
+                spreadsheet_token=self.feishu_spreadsheet_token,
                 sheet_id="RuLK77"
             )
         for row in summary[1:]:
@@ -43,7 +44,7 @@ class PiaoQuanPipeline:
     async def feishu_list(self):
         async with FeishuDataAsync() as feishu_data:
             summary = await feishu_data.get_values(
-                spreadsheet_token="KsoMsyP2ghleM9tzBfmcEEXBnXg",
+                spreadsheet_token=self.feishu_spreadsheet_token,
                 sheet_id="letS93"
             )
         for row in summary[1:]:
@@ -51,6 +52,17 @@ class PiaoQuanPipeline:
                 return row[1]
         return None
 
+    async def title_restricted_words(self):
+        async with FeishuDataAsync() as feishu_data:
+            summary = await feishu_data.get_values(
+                spreadsheet_token=self.feishu_spreadsheet_token,
+                sheet_id="BS9uyu"
+            )
+        for row in summary[1:]:
+            if row[0] == self.platform:
+                return row[1]
+        return None
+
     async def publish_time_flag(self) -> bool:
         publish_ts = self.item.get("publish_time_stamp", int(time.time()))
         update_ts = self.item.get("update_time_stamp", int(time.time()))
@@ -145,6 +157,7 @@ class PiaoQuanPipeline:
         """
          Basic video download rules
           :return:
+          "rule": "[{\"period\":{\"min\":15,\"max\":3}},{\"duration\":{\"min\":50,\"max\":0}},{\"share_cnt\":{\"min\":2,\"max\":0}},{\"videos_cnt\":{\"min\":300,\"max\":0}}]",
         """
         for key in self.item:
             if self.rule_dict.get(key):
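
Two hedged sketches, not code from this commit: title_restricted_words mirrors feishu_list, returning the second cell of the row whose first cell matches the platform; and the sample "rule" value in the docstring is a JSON array of single-key objects, which the rule_dict lookups below presumably consume after flattening.

import json

async def title_blocked(pipeline) -> bool:
    # Hypothetical helper. Assumes the "BS9uyu" sheet stores restricted
    # words as a comma-separated cell and that items carry a "video_title"
    # key (both assumptions).
    cell = await pipeline.title_restricted_words()  # e.g. "word1,word2"
    restricted = [w for w in (cell.split(",") if cell else []) if w]
    title = pipeline.item.get("video_title", "")
    return any(w in title for w in restricted)

# Flattening the docstring's sample rule string into the one-key-per-rule
# shape that self.rule_dict.get(key) implies:
rule = ('[{"period":{"min":15,"max":3}},{"duration":{"min":50,"max":0}},'
        '{"share_cnt":{"min":2,"max":0}},{"videos_cnt":{"min":300,"max":0}}]')
rule_dict = {k: v for entry in json.loads(rule) for k, v in entry.items()}
# {"period": {"min": 15, "max": 3}, "duration": {"min": 50, "max": 0}, ...}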

+ 1 - 1
spiders/authorspider.py

@@ -88,7 +88,7 @@ class AuthorSpider(BaseSpider):
         # Parse the user's video list
         data_list = safe_extract(response, self.data_path)
         if not data_list:
-            self.logger.info(f"User {user_uid}: no video data on page {self.current_cursor}")
+            self.logger.info(f"User {user_uid}: no video data on page {self.current_cursor or 0}")
             return None, None
         return has_more, data_list
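
The or 0 fallback only changes the log text. Assuming current_cursor starts as None before the first page is requested (an assumption about AuthorSpider's state), the old message printed a literal None:

# Minimal before/after illustration of the log line:
current_cursor = None
print(f"User 12345, page {current_cursor}: no video data")       # page None
print(f"User 12345, page {current_cursor or 0}: no video data")  # page 0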
 

+ 3 - 0
spiders/basespider.py

@@ -181,6 +181,9 @@ class BaseSpider(ABC):
     async def push_to_etl(self, video: Dict) -> bool:
         try:
             await self.mq_producer.send_msg(video)
+            self.aliyun_log.logging(code="1009",
+                                    message="Pushed to ETL successfully",
+                                    data=video)
             self.logger.info(f"成功推送视频至ETL: {video}")
             return True
         except Exception as e:

+ 1 - 0
spiders/xiaoniangao_author.py

@@ -8,6 +8,7 @@ class XiaoniangaoAuthor(AuthorSpider):
     async def fetch_user_list(self) -> List[Dict]:
         """获取待爬取的用户列表(从数据库)"""
         datas =await self.db_service.get_xng_mid()
+        datas = [{"uid":data["link"]} for data in datas]
         return datas
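
One caveat: get_xng_mid returns 0 rather than an empty list when the query matches nothing, so the comprehension above would raise TypeError on an empty result. A hedged defensive sketch (fetch_user_list_safe is a hypothetical name):

from typing import Dict, List

async def fetch_user_list_safe(db_service) -> List[Dict]:
    # Guard against get_xng_mid() returning 0 when no rows match.
    rows = await db_service.get_xng_mid()
    if not rows:  # covers both 0 and an empty list
        return []
    return [{"uid": row["link"]} for row in rows]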