
Xiaoniangao account

zhangliang, 1 month ago
Commit: 2f4c89884d
5 files changed, 89 insertions(+), 13 deletions(-)
  1. CONFIGURATION.md (+37, -3)
  2. README.md (+7, -1)
  3. config/spiders_config.yaml (+35, -1)
  4. core/utils/helpers.py (+8, -6)
  5. spiders/basespider.py (+2, -2)

CONFIGURATION.md (+37, -3)

@@ -104,11 +104,45 @@ xiaoniangaoauthor:
   request_body:
       cursor: "{{next_cursor}}"
       account_id: "{{uid}}" # uid in the database
+  loop_times: 100
+  loop_interval:
+    min: 5
+    max: 10
+  feishu_sheetid: "K0gA9Y"
+  response_parse:
+    uid: "$.uid" # uid in the database
+    next_cursor: "$.cursor"
+    data: "$.data"
+    has_more: "$.data.has_more"
+    data_path: "$.data.data"
+    fields:
+      video_title: "$.title"
+      duration: "$.du"
+      play_cnt: "$.play_pv"
+      like_cnt: "$.favor.total"
+      comment_cnt: "$.comment_count"
+      share_cnt: "$.share"
+      width: "$.w"
+      height: "$.h"
+      avatar_url: "$.user.hurl"
+      cover_url: "$.url"
+      video_url: "$.v_url"
+      out_user_id: "$.user.mid"
+      out_video_id: "$.vid"
+      publish_time_stamp: "$.t"
+
+xiaoniangaorecommend:
+  platform: xiaoniangao
+  mode: recommend
+  path: crawler/xiao_nian_gao_plus/recommend
+  method: post
+  request_body:
+
   loop_times: 100
   loop_interval:
     min: 5
     max: 20
-  feishu_sheetid: "golXy9"
+  feishu_sheetid: "D1nVxQ"
   response_parse:
     uid: "$.uid" # 数据库的uid
     next_cursor: "$.cursor"
@@ -174,6 +208,6 @@ xiaoniangaoauthor:
 
 ## Current configuration status
 
-- Number of platform configurations: 3
+- Number of platform configurations: 4
 - Runtime environment: prod
-- Config file path: /AutoScraperX/config/spiders_config.yaml
+- Config file path: /Users/zhangliang/Documents/piaoquan/AutoScraperX/config/spiders_config.yaml
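
The `response_parse` block added above maps each output field name to a JSONPath expression that is evaluated against every item found under `data_path`. Below is a minimal sketch of how such a mapping could be applied, assuming a `jsonpath-ng` based extractor and an abbreviated field set; the project's actual parsing code is not part of this diff.

```python
# Sketch only: evaluate a response_parse "fields" mapping against one item from $.data.data.
# Assumes the jsonpath-ng package; AutoScraperX's real extractor may work differently.
from jsonpath_ng import parse

FIELDS = {                      # abbreviated copy of the mapping above
    "video_title": "$.title",
    "duration": "$.du",
    "play_cnt": "$.play_pv",
    "like_cnt": "$.favor.total",
    "out_video_id": "$.vid",
}

def extract_fields(item: dict) -> dict:
    """Return {field_name: value} for a single video item, None when a path has no match."""
    out = {}
    for name, expr in FIELDS.items():
        matches = parse(expr).find(item)
        out[name] = matches[0].value if matches else None
    return out

sample = {"title": "demo", "du": 61, "play_pv": 1200, "favor": {"total": 35}, "vid": "v001"}
print(extract_fields(sample))
# {'video_title': 'demo', 'duration': 61, 'play_cnt': 1200, 'like_cnt': 35, 'out_video_id': 'v001'}
```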

README.md (+7, -1)

@@ -303,4 +303,10 @@ sh deploy.sh
 2. **Generate the latest configuration documentation**:
    ```bash
    python -m core.utils.config_documentation
-   ```
+   ```
+### Project-related notes
+1. **Write to the safe-title sheet**:
+   ```
+   https://w42nne6hzg.feishu.cn/sheets/U5dXsSlPOhiNNCtEfgqcm1iYnpf?sheet=K0gA9Y
+   ```
+   

config/spiders_config.yaml (+35, -1)

@@ -68,7 +68,7 @@ xiaoniangaoauthor:
   loop_times: 100
   loop_interval:
     min: 5
-    max: 20
+    max: 10
   feishu_sheetid: "K0gA9Y"
   response_parse:
     uid: "$.uid" # 数据库的uid
@@ -91,6 +91,40 @@ xiaoniangaoauthor:
       out_user_id: "$.user.mid"
       out_video_id: "$.vid"
       publish_time_stamp: "$.t"
+#
+#xiaoniangaorecommend:
+#  platform: xiaoniangao
+#  mode: recommend
+#  path: crawler/xiao_nian_gao_plus/recommend
+#  method: post
+#  request_body:
+#
+#  loop_times: 100
+#  loop_interval:
+#    min: 5
+#    max: 20
+#  feishu_sheetid: "D1nVxQ"
+#  response_parse:
+#    uid: "$.uid" # uid in the database
+#    next_cursor: "$.cursor"
+#    data: "$.data"
+#    has_more: "$.data.has_more"
+#    data_path: "$.data.data"
+#    fields:
+#      video_title: "$.title"
+#      duration: 0
+#      play_cnt: "$.play_pv"
+#      like_cnt: 0
+#      comment_cnt: "$.comment_count"
+#      share_cnt: "$.share"
+#      width: "$.w"
+#      height: "$.h"
+#      avatar_url: "$.user.hurl"
+#      cover_url: "$.url"
+#      video_url: "$.v_url"
+#      out_user_id:
+#      out_video_id: "$.id"
+#      publish_time_stamp: "$.t"
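
For the `loop_times` / `loop_interval` values touched above (the author config now waits at most 10 s between rounds), here is a rough sketch of how these settings are presumably consumed. It assumes the spider name is a top-level YAML key, as it appears in this diff; the real crawl loop lives in `BaseSpider` and is not shown here.

```python
# Sketch only: illustrative reading of loop_times / loop_interval from spiders_config.yaml.
# The actual loop implementation in BaseSpider may differ.
import random
import time

import yaml  # PyYAML

with open("config/spiders_config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

spider_cfg = config["xiaoniangaoauthor"]
interval = spider_cfg["loop_interval"]            # {"min": 5, "max": 10} after this commit

for _ in range(spider_cfg["loop_times"]):         # 100 crawl rounds
    # ... fetch one page, parse it via response_parse, push results ...
    time.sleep(random.uniform(interval["min"], interval["max"]))  # randomized delay between rounds
```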
 
 
 

core/utils/helpers.py (+8, -6)

@@ -20,15 +20,16 @@ async def get_title_filter_word() -> List[str]:
         return feishu_data[1]
 
 async def generate_titles(sheet_id: str,video_obj: Dict,logger,aliyun_log):
-    title_list = await get_title_filter_word()
-    title = video_obj.get("title")
+    title_filter_word = await get_title_filter_word()
+    title = video_obj.get("video_title")
     if not title:
-        return
-    contains_keyword = any(keyword in title for keyword in title_list)
+        return video_obj
+    contains_keyword = any(keyword in title for keyword in title_filter_word)
     logger.info(f"【{title}】标题包含过滤关键词:{contains_keyword}")
     if contains_keyword:
         new_title = await GPT4oMini.get_ai_mini_title(title)
         logger.info(f"生成新的标题:{new_title}")
+        video_obj["video_title"] = new_title
         current_time = datetime.now()
         formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
         values = [
@@ -39,6 +40,7 @@ async def generate_titles(sheet_id: str,video_obj: Dict,logger,aliyun_log):
                 formatted_time,
         ]
         await insert_safe_data(sheet_id, values)
+    return video_obj
 
 async def insert_safe_data(sheet_id: str, values: List):
     spreadsheet_token = "U5dXsSlPOhiNNCtEfgqcm1iYnpf"
@@ -49,5 +51,5 @@ async def insert_safe_data(sheet_id: str, values: List):
 
 
 if __name__ == '__main__':
-     filter_word = asyncio.run(get_title_filter_word())
-     print(filter_word)
+     asyncio.run(insert_safe_data("K0gA9Y", ["1","2","3","4","5"]))
+    
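
After this change `generate_titles` returns the video dict (with `video_title` replaced when a filter keyword is hit) instead of returning `None`, and it reads the `video_title` key rather than `title`. A minimal caller sketch under those assumptions follows; the plain `logging` logger and the `None` passed for `aliyun_log` are stand-ins, and a real run still needs Feishu and GPT-4o mini access.

```python
# Sketch only: exercising the new generate_titles contract from core/utils/helpers.py.
# logger and aliyun_log below are illustrative stand-ins; real calls hit Feishu / GPT-4o mini.
import asyncio
import logging

from core.utils.helpers import generate_titles

async def main():
    video_obj = {"video_title": "original title"}   # note: the key is video_title, not title
    video_obj = await generate_titles("K0gA9Y", video_obj, logging.getLogger("demo"), None)
    print(video_obj["video_title"])                 # rewritten if a filter keyword matched

asyncio.run(main())
```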

spiders/basespider.py (+2, -2)

@@ -140,7 +140,7 @@ class BaseSpider(ABC):
                 return False
             if not await self.filter_data(video_obj):
                 return False
-            await self.integrated_video_handling(video_obj)
+            video_obj = await self.integrated_video_handling(video_obj)
             return await self.push_to_etl(video_obj)
         except Exception as e:
             self.logger.exception(f"视频处理异常: {e}")
@@ -206,7 +206,7 @@ class BaseSpider(ABC):
          Hook: automatic title generation or other business logic can be implemented here
         """
        # video title processing / generation
-        await generate_titles(self.feishu_sheetid, video,self.logger,self.aliyun_log)
+        return await generate_titles(self.feishu_sheetid, video,self.logger,self.aliyun_log)
 
     async def push_to_etl(self, video: Dict) -> bool:
         try:
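
Taken together with the helpers.py change, these two one-line edits make the hook contract explicit: `integrated_video_handling` hands back the (possibly retitled) dict and `process_video` keeps it before pushing to ETL. The sketch below illustrates the pattern; the method names mirror `spiders/basespider.py`, but the bodies are simplified stand-ins, not the real implementation.

```python
# Sketch only: why process_video reassigns the hook's return value.
# In the real BaseSpider the hook delegates to generate_titles(), which may replace video_title.
from typing import Dict

class SpiderSketch:
    async def integrated_video_handling(self, video: Dict) -> Dict:
        video["video_title"] = video["video_title"].strip()  # stand-in for title rewriting
        return video

    async def push_to_etl(self, video: Dict) -> bool:
        print("pushing:", video["video_title"])
        return True

    async def process_video(self, video_obj: Dict) -> bool:
        # Keeping the returned dict makes the contract explicit; in-place mutation alone
        # would also work today, but the return value documents that the hook may rewrite fields.
        video_obj = await self.integrated_video_handling(video_obj)
        return await self.push_to_etl(video_obj)

if __name__ == "__main__":
    import asyncio
    asyncio.run(SpiderSketch().process_video({"video_title": " demo "}))
```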