
feat: modify the keyword analysis script

zhaohaipeng 7 hours ago
parent
commit
f08e84c1cc
3 changed files with 29 additions and 25 deletions
  1. client/CrawlerClient.py (+3 -3)
  2. model/automation_provide_job.py (+1 -1)
  3. script/dou_yin_keywords_search.py (+25 -21)

+ 3 - 3
client/CrawlerClient.py

@@ -1,11 +1,11 @@
 import json
-import logging
 from typing import List, Dict, Optional, Any
 
 import requests
 
 from model.automation_provide_job import DouYinSearchConfig, ChannelSearchAndDetailDTO
 
+
 # ==================== Configuration and enum definitions ====================
 # Logging configuration
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -39,7 +39,7 @@ class CrawlerClient:
             publishTime: Optional[str] = None,
             duration: Optional[str] = None,
             cursor: Optional[str] = None,
-            account_id: Optional[int] = None,
+            account_id: Optional[str] = None,
     ) -> Dict[str, Any]:
         """
         Basic keyword search method (mirrors the overloaded keywordSearch methods in Java)
@@ -181,7 +181,7 @@ class CrawlerClient:
         # logger.info(f"invoke crawler api result. respJson: {resp_json}")
 
         # Check the response code
-        if resp_json.get("code") != "0":
+        if resp_json.get("code") != "0" and resp_json.get("code") != 0:
             raise RuntimeError(resp_json.get("msg", "API调用失败"))
 
         # Return the data field

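A note on the response-code check above: the crawler API evidently returns code as either the string "0" or the integer 0, so the success test now accepts both. A minimal sketch of an equivalent check (function name and English message are hypothetical, not part of this commit) that normalizes the code once instead of comparing twice:

    def extract_data(resp_json: dict):
        # Accept both "0" (string) and 0 (int) by normalizing before comparing
        if str(resp_json.get("code")) != "0":
            raise RuntimeError(resp_json.get("msg", "API call failed"))
        # Hand back the data field, as the client method does
        return resp_json.get("data")
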
+ 1 - 1
model/automation_provide_job.py

@@ -11,7 +11,7 @@ class DouYinSearchConfig:
     publish_time: Optional[str] = None  # Publish time (no limit / last 7 days, etc.)
     duration: Optional[str] = None  # Duration (no limit / under 1 minute, etc.)
     cursor: Optional[str] = None  # Pagination cursor
-    account_id: Optional[int] = None  # Account to use
+    account_id: Optional[str] = None  # Account to use
 
 
 @dataclass

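A note on the type change: account_id is now Optional[str] in the config and everywhere it is passed, so account IDs travel through the pipeline verbatim instead of being parsed as integers. A hypothetical construction under the new signature (values illustrative only):

    from model.automation_provide_job import DouYinSearchConfig

    config = DouYinSearchConfig(
        search_content="example keyword",
        sort_type="综合排序",
        account_id="771431186",  # now a string, matching the new field type
    )
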
+ 25 - 21
script/dou_yin_keywords_search.py

@@ -11,7 +11,7 @@ crawler_client = CrawlerClient()
 
 preFilterThreshold = 3
 
-result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析.txt'
+result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析_20260204_1.txt'
 
 
 def write_result_file(content, mode='a+'):
@@ -25,11 +25,11 @@ def log_info_print_title():
         "视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
 
 
-def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
+def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
     if 'ext' in log_json and isinstance(log_json['ext'], dict):
         log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
 
-    if 'modelValueConfig' and isinstance(log_json['modelValueConfig'], dict):
+    if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
         log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
 
     video_id = log_json["videoId"]
@@ -45,6 +45,8 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
         write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
         return
     for channel_content_id in ext_json:
+        if channel_content_id in ['mergeCate2Map']:
+            continue
         channel_ext_info = ext_json[channel_content_id]
         filter_result = channel_ext_info.get("result", False)
         rule_str = channel_ext_info.get("rule", "")
@@ -66,11 +68,11 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
                           f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
 
 
-def keywords_search(keywords: str, sort_type: str, account_id=None) -> List[ChannelSearchAndDetailDTO]:
+def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
     search_config = DouYinSearchConfig(
         search_content=keywords,
         sort_type=sort_type,
-        account_id=account_id
+        account_id=account,
     )
     return crawler_client.dou_yin_keywords_search(search_config, True, True)
 
@@ -80,7 +82,7 @@ def eval_expr(expr: str, context: dict) -> bool:
     return bool(simple_eval(expr, names=context))
 
 
-def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
+def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
     need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
     result_json = {}
     for key in need_copy_keys:
@@ -120,13 +122,14 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
             fans_portrait['channelContentId'] = channel_content_id
             ext_json['fanPortrait'] = fans_portrait
 
-        if (not content_detail) and (not fans_portrait):
-            ext_json["result"] = False
-            continue
         rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
         ext_json['ruleContext'] = rule_context
         ext_json['rule'] = rule_str
 
+        if (not content_detail) or (not fans_portrait):
+            ext_json["result"] = False
+            continue
+
         if not rule_context:
             cnt += 1
             continue
@@ -136,17 +139,17 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
         if result:
             cnt += 1
     if cnt <= preFilterThreshold:
-        log_json["result"] = False
-        log_json['reason'] = '该关键词首页满足条件的视频数不足'
+        result_json["result"] = False
+        result_json['reason'] = '该关键词首页满足条件的视频数不足'
 
-    return {}
+    return result_json
 
 
 def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
     """
     Not logged in, comprehensive sort
     """
-    account_id = 0
+    account_id = "0"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
@@ -155,7 +158,7 @@ def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], f
     """
     Logged in, comprehensive sort
     """
-    account_id = 771431186
+    account_id = "771431186"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
@@ -164,15 +167,15 @@ def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: L
     """
     Logged in, most likes
     """
-    account_id = 771431186
+    account_id = "771431186"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
 
 def handle_log_json(log_json: dict[str, Any]):
-    log_info_print(log_json)
+    # Logged in, comprehensive sort
+    # log_info_print(log_json)
 
-    # Not logged in, most likes
     keywords = log_json['keywords']
     account_filters = json.loads(log_json.get("accountFilters", "[]"))
     content_filters = json.loads(log_json.get("contentFilters", '[]'))
@@ -180,13 +183,13 @@ def handle_log_json(log_json: dict[str, Any]):
     for filter_item in account_filters + content_filters:
         search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
 
-    keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
-    keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
+    # keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
+    # keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
     keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
 
 
 def main():
-    file_path = "/Users/zhao/Downloads/keywords_filter_test_sample.json"
+    file_path = '/Users/zhao/Desktop/keywords.json'
     log_list = []
     with open(file_path, "r", encoding="utf-8") as f:
         line = f.readline()
@@ -196,7 +199,8 @@ def main():
 
     log_info_print_title()
     for log in log_list:
-        handle_log_json(log)
+        if "“揭秘开国 领导人" == log['keywords']:
+            handle_log_json(log)
 
 
 if __name__ == '__main__':
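
A note on rule evaluation: eval_expr delegates to simpleeval's simple_eval, which evaluates an expression against a provided names mapping, keeping the rule strings away from a full Python eval. A standalone sketch with a made-up rule and feature values (the real rules and context come from the filter configs and AutoProvideUtil.extract_content_rule_feature):

    from simpleeval import simple_eval

    # Hypothetical rule in the same shape the filter expressions use
    rule = "shareCnt / likeCnt > 0.1 and audienceAge50Rate > 0.3"
    context = {"shareCnt": 120, "likeCnt": 900, "audienceAge50Rate": 0.42}

    # simple_eval resolves bare names from `names`; the result is coerced
    # to bool, exactly as eval_expr does
    print(bool(simple_eval(rule, names=context)))  # True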