
feat: modify the keyword analysis script

zhaohaipeng 7 hours ago
parent
commit
f08e84c1cc
3 changed files with 29 additions and 25 deletions
  1. client/CrawlerClient.py (+3 -3)
  2. model/automation_provide_job.py (+1 -1)
  3. script/dou_yin_keywords_search.py (+25 -21)

+ 3 - 3
client/CrawlerClient.py

@@ -1,11 +1,11 @@
 import json
-import logging
 from typing import List, Dict, Optional, Any
 
 import requests
 
 from model.automation_provide_job import DouYinSearchConfig, ChannelSearchAndDetailDTO
 
+
 # ==================== Configuration and enum definitions ====================
 # Logging configuration
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -39,7 +39,7 @@ class CrawlerClient:
             publishTime: Optional[str] = None,
             duration: Optional[str] = None,
             cursor: Optional[str] = None,
-            account_id: Optional[int] = None,
+            account_id: Optional[str] = None,
     ) -> Dict[str, Any]:
         """
         Basic keyword search method (mirrors the overloaded keywordSearch methods in Java)
@@ -181,7 +181,7 @@ class CrawlerClient:
         # logger.info(f"invoke crawler api result. respJson: {resp_json}")
 
         # Check the response code
-        if resp_json.get("code") != "0":
+        if resp_json.get("code") != "0" and resp_json.get("code") != 0:
             raise RuntimeError(resp_json.get("msg", "API调用失败"))
 
         # Return the data field

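A note on the response-code check above: the crawler API evidently returns code as either the string "0" or the integer 0, so the success test now accepts both. A minimal sketch of an equivalent check (function name and English message are hypothetical, not part of this commit) that normalizes the code once instead of comparing twice:

    def extract_data(resp_json: dict):
        # Accept both "0" (string) and 0 (int) by normalizing before comparing
        if str(resp_json.get("code")) != "0":
            raise RuntimeError(resp_json.get("msg", "API call failed"))
        # Hand back the data field, as the client method does
        return resp_json.get("data")
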
+ 1 - 1
model/automation_provide_job.py

@@ -11,7 +11,7 @@ class DouYinSearchConfig:
     publish_time: Optional[str] = None  # Publish time (no limit / last 7 days, etc.)
     duration: Optional[str] = None  # Duration (no limit / under 1 minute, etc.)
     cursor: Optional[str] = None  # Pagination cursor
-    account_id: Optional[int] = None  # Account to use
+    account_id: Optional[str] = None  # Account to use
 
 
 @dataclass

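A note on the type change: account_id is now Optional[str] in the config and everywhere it is passed, so account IDs travel through the pipeline verbatim instead of being parsed as integers. A hypothetical construction under the new signature (values illustrative only):

    from model.automation_provide_job import DouYinSearchConfig

    config = DouYinSearchConfig(
        search_content="example keyword",
        sort_type="综合排序",
        account_id="771431186",  # now a string, matching the new field type
    )
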
+ 25 - 21
script/dou_yin_keywords_search.py

@@ -11,7 +11,7 @@ crawler_client = CrawlerClient()
 
 preFilterThreshold = 3
 
-result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析.txt'
+result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析_20260204_1.txt'
 
 
 def write_result_file(content, mode='a+'):
@@ -25,11 +25,11 @@ def log_info_print_title():
         "视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
 
 
-def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
+def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
     if 'ext' in log_json and isinstance(log_json['ext'], dict):
         log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
 
-    if 'modelValueConfig' and isinstance(log_json['modelValueConfig'], dict):
+    if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
         log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
 
     video_id = log_json["videoId"]
@@ -45,6 +45,8 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
         write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
         return
     for channel_content_id in ext_json:
+        if channel_content_id in ['mergeCate2Map']:
+            continue
         channel_ext_info = ext_json[channel_content_id]
         filter_result = channel_ext_info.get("result", False)
         rule_str = channel_ext_info.get("rule", "")
@@ -66,11 +68,11 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
                           f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
 
 
-def keywords_search(keywords: str, sort_type: str, account_id=None) -> List[ChannelSearchAndDetailDTO]:
+def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
     search_config = DouYinSearchConfig(
         search_content=keywords,
         sort_type=sort_type,
-        account_id=account_id
+        account_id=account,
     )
     return crawler_client.dou_yin_keywords_search(search_config, True, True)
 
@@ -80,7 +82,7 @@ def eval_expr(expr: str, context: dict) -> bool:
     return bool(simple_eval(expr, names=context))
 
 
-def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
+def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
     need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
     result_json = {}
     for key in need_copy_keys:
@@ -120,13 +122,14 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
             fans_portrait['channelContentId'] = channel_content_id
             ext_json['fanPortrait'] = fans_portrait
 
-        if (not content_detail) and (not fans_portrait):
-            ext_json["result"] = False
-            continue
         rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
         ext_json['ruleContext'] = rule_context
         ext_json['rule'] = rule_str
 
+        if (not content_detail) or (not fans_portrait):
+            ext_json["result"] = False
+            continue
+
         if not rule_context:
             cnt += 1
             continue
@@ -136,17 +139,17 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
         if result:
             cnt += 1
     if cnt <= preFilterThreshold:
-        log_json["result"] = False
-        log_json['reason'] = '该关键词首页满足条件的视频数不足'
+        result_json["result"] = False
+        result_json['reason'] = '该关键词首页满足条件的视频数不足'
 
-    return {}
+    return result_json
 
 
 def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
     """
     Not logged in, comprehensive sort
     """
-    account_id = 0
+    account_id = "0"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
@@ -155,7 +158,7 @@ def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], f
     """
     Logged in, comprehensive sort
     """
-    account_id = 771431186
+    account_id = "771431186"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
@@ -164,15 +167,15 @@ def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: L
     """
     Logged in, most likes
     """
-    account_id = 771431186
+    account_id = "771431186"
     log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
     log_info_print(log_json, account_id=account_id)
 
 
 def handle_log_json(log_json: dict[str, Any]):
-    log_info_print(log_json)
+    # Logged in, comprehensive sort
+    # log_info_print(log_json)
 
-    # Not logged in, most likes
     keywords = log_json['keywords']
     account_filters = json.loads(log_json.get("accountFilters", "[]"))
     content_filters = json.loads(log_json.get("contentFilters", '[]'))
@@ -180,13 +183,13 @@ def handle_log_json(log_json: dict[str, Any]):
     for filter_item in account_filters + content_filters:
         search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
 
-    keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
-    keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
+    # keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
+    # keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
     keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
 
 
 def main():
-    file_path = "/Users/zhao/Downloads/keywords_filter_test_sample.json"
+    file_path = '/Users/zhao/Desktop/keywords.json'
     log_list = []
     with open(file_path, "r", encoding="utf-8") as f:
         line = f.readline()
@@ -196,7 +199,8 @@ def main():
 
     log_info_print_title()
     for log in log_list:
-        handle_log_json(log)
+        if "“揭秘开国 领导人" == log['keywords']:
+            handle_log_json(log)
 
 
 if __name__ == '__main__':
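
A note on rule evaluation: eval_expr delegates to simpleeval's simple_eval, which evaluates an expression against a provided names mapping, keeping the rule strings away from a full Python eval. A standalone sketch with a made-up rule and feature values (the real rules and context come from the filter configs and AutoProvideUtil.extract_content_rule_feature):

    from simpleeval import simple_eval

    # Hypothetical rule in the same shape the filter expressions use
    rule = "shareCnt / likeCnt > 0.1 and audienceAge50Rate > 0.3"
    context = {"shareCnt": 120, "likeCnt": 900, "audienceAge50Rate": 0.42}

    # simple_eval resolves bare names from `names`; the result is coerced
    # to bool, exactly as eval_expr does
    print(bool(simple_eval(rule, names=context)))  # True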