7 stundas atpakaļ · f08e84c1cc
--- a/client/CrawlerClient.py
+++ b/client/CrawlerClient.py
@@ -1,11 +1,11 @@
 
															 import json
														
 
															-import logging
														
 
															 from typing import List, Dict, Optional, Any
														
 
															 import requests
														
 
															 from model.automation_provide_job import DouYinSearchConfig, ChannelSearchAndDetailDTO
														
 
															+
														
 
															 # ==================== 配置与枚举定义 ====================
														
 
															 # 日志配置
														
 
															 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
@@ -39,7 +39,7 @@ class CrawlerClient:
 
															             publishTime: Optional[str] = None,
														
 
															             duration: Optional[str] = None,
														
 
															             cursor: Optional[str] = None,
														
 
															-            account_id: Optional[int] = None,
														
 
															+            account_id: Optional[str] = None,
														
 
															     ) -> Dict[str, Any]:
														
 
															         """
														
 
															         基础关键词搜索方法（对应Java的keywordSearch重载方法）
														
@@ -181,7 +181,7 @@ class CrawlerClient:
 
															         # logger.info(f"invoke crawler api result. respJson: {resp_json}")
														
 
															         # 检查响应码
														
 
															-        if resp_json.get("code") != "0":
														
 
															+        if resp_json.get("code") != "0" and resp_json.get("code") != 0:
														
 
															             raise RuntimeError(resp_json.get("msg", "API调用失败"))
														
 
															         # 返回data字段
														
--- a/model/automation_provide_job.py
+++ b/model/automation_provide_job.py
@@ -11,7 +11,7 @@ class DouYinSearchConfig:
 
															     publish_time: Optional[str] = None  # 发布时间（不限/近7天等）
														
 
															     duration: Optional[str] = None  # 时长（不限/1分钟内等）
														
 
															     cursor: Optional[str] = None  # 分页游标
														
 
															-    account_id: Optional[int] = None  # 使用的账号
														
 
															+    account_id: Optional[str] = None  # 使用的账号
														
 
															 @dataclass
														
--- a/script/dou_yin_keywords_search.py
+++ b/script/dou_yin_keywords_search.py
@@ -11,7 +11,7 @@ crawler_client = CrawlerClient()
 
															 preFilterThreshold = 3
														
 
															-result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析.txt'
														
 
															+result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析_20260204_1.txt'
														
 
															 def write_result_file(content, mode='a+'):
														
@@ -25,11 +25,11 @@ def log_info_print_title():
 
															         "视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长（秒）,观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
														
 
															-def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
														
 
															+def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
														
 
															     if 'ext' in log_json and isinstance(log_json['ext'], dict):
														
 
															         log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
														
 
															-    if 'modelValueConfig' and isinstance(log_json['modelValueConfig'], dict):
														
 
															+    if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
														
 
															         log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
														
 
															     video_id = log_json["videoId"]
														
@@ -45,6 +45,8 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
 
															         write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
														
 
															         return
														
 
															     for channel_content_id in ext_json:
														
 
															+        if channel_content_id in ['mergeCate2Map']:
														
 
															+            continue
														
 
															         channel_ext_info = ext_json[channel_content_id]
														
 
															         filter_result = channel_ext_info.get("result", False)
														
 
															         rule_str = channel_ext_info.get("rule", "")
														
@@ -66,11 +68,11 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
 
															                           f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
														
 
															-def keywords_search(keywords: str, sort_type: str, account_id=None) -> List[ChannelSearchAndDetailDTO]:
														
 
															+def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
														
 
															     search_config = DouYinSearchConfig(
														
 
															         search_content=keywords,
														
 
															         sort_type=sort_type,
														
 
															-        account_id=account_id
														
 
															+        account_id=account,
														
 
															     )
														
 
															     return crawler_client.dou_yin_keywords_search(search_config, True, True)
														
@@ -80,7 +82,7 @@ def eval_expr(expr: str, context: dict) -> bool:
 
															     return bool(simple_eval(expr, names=context))
														
 
															-def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
														
 
															+def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
														
 
															     need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
														
 
															     result_json = {}
														
 
															     for key in need_copy_keys:
														
@@ -120,13 +122,14 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
 
															             fans_portrait['channelContentId'] = channel_content_id
														
 
															             ext_json['fanPortrait'] = fans_portrait
														
 
															-        if (not content_detail) and (not fans_portrait):
														
 
															-            ext_json["result"] = False
														
 
															-            continue
														
 
															         rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
														
 
															         ext_json['ruleContext'] = rule_context
														
 
															         ext_json['rule'] = rule_str
														
 
															+        if (not content_detail) or (not fans_portrait):
														
 
															+            ext_json["result"] = False
														
 
															+            continue
														
 
															+
														
 
															         if not rule_context:
														
 
															             cnt += 1
														
 
															             continue
														
@@ -136,17 +139,17 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
 
															         if result:
														
 
															             cnt += 1
														
 
															     if cnt <= preFilterThreshold:
														
 
															-        log_json["result"] = False
														
 
															-        log_json['reason'] = '该关键词首页满足条件的视频数不足'
														
 
															+        result_json["result"] = False
														
 
															+        result_json['reason'] = '该关键词首页满足条件的视频数不足'
														
 
															-    return {}
														
 
															+    return result_json
														
 
															 def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
														
 
															     """
														
 
															     未登录，综合排序
														
 
															     """
														
 
															-    account_id = 0
														
 
															+    account_id = "0"
														
 
															     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
														
 
															     log_info_print(log_json, account_id=account_id)
														
@@ -155,7 +158,7 @@ def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], f
 
															     """
														
 
															     登录，综合排序
														
 
															     """
														
 
															-    account_id = 771431186
														
 
															+    account_id = "771431186"
														
 
															     log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
														
 
															     log_info_print(log_json, account_id=account_id)
														
@@ -164,15 +167,15 @@ def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: L
 
															     """
														
 
															     登录状态，最多点赞
														
 
															     """
														
 
															-    account_id = 771431186
														
 
															+    account_id = "771431186"
														
 
															     log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
														
 
															     log_info_print(log_json, account_id=account_id)
														
 
															 def handle_log_json(log_json: dict[str, Any]):
														
 
															-    log_info_print(log_json)
														
 
															+    # 登录，综合排序
														
 
															+    # log_info_print(log_json)
														
 
															-    # 未登录，最多点赞
														
 
															     keywords = log_json['keywords']
														
 
															     account_filters = json.loads(log_json.get("accountFilters", "[]"))
														
 
															     content_filters = json.loads(log_json.get("contentFilters", '[]'))
														
@@ -180,13 +183,13 @@ def handle_log_json(log_json: dict[str, Any]):
 
															     for filter_item in account_filters + content_filters:
														
 
															         search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
														
 
															-    keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
														
 
															-    keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
														
 
															+    # keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
														
 
															+    # keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
														
 
															     keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
														
 
															 def main():
														
 
															-    file_path = "/Users/zhao/Downloads/keywords_filter_test_sample.json"
														
 
															+    file_path = '/Users/zhao/Desktop/keywords.json'
														
 
															     log_list = []
														
 
															     with open(file_path, "r", encoding="utf-8") as f:
														
 
															         line = f.readline()
														
@@ -196,7 +199,8 @@ def main():
 
															     log_info_print_title()
														
 
															     for log in log_list:
														
 
															-        handle_log_json(log)
														
 
															+        if "“揭秘开国 领导人" == log['keywords']:
														
 
															+            handle_log_json(log)
														
 
															 if __name__ == '__main__':