|
@@ -11,7 +11,7 @@ crawler_client = CrawlerClient()
|
|
|
|
|
|
|
|
preFilterThreshold = 3
|
|
preFilterThreshold = 3
|
|
|
|
|
|
|
|
-result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析.txt'
|
|
|
|
|
|
|
+result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析_20260204_1.txt'
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_result_file(content, mode='a+'):
|
|
def write_result_file(content, mode='a+'):
|
|
@@ -25,11 +25,11 @@ def log_info_print_title():
|
|
|
"视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
|
|
"视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
|
|
|
|
|
|
|
|
|
|
|
|
|
-def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
|
|
|
|
|
|
|
+def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
|
|
|
if 'ext' in log_json and isinstance(log_json['ext'], dict):
|
|
if 'ext' in log_json and isinstance(log_json['ext'], dict):
|
|
|
log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
|
|
log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
|
|
|
|
|
|
|
|
- if 'modelValueConfig' and isinstance(log_json['modelValueConfig'], dict):
|
|
|
|
|
|
|
+ if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
|
|
|
log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
|
|
log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
|
|
|
|
|
|
|
|
video_id = log_json["videoId"]
|
|
video_id = log_json["videoId"]
|
|
@@ -45,6 +45,8 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
|
|
|
write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
|
|
write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
|
|
|
return
|
|
return
|
|
|
for channel_content_id in ext_json:
|
|
for channel_content_id in ext_json:
|
|
|
|
|
+ if channel_content_id in ['mergeCate2Map']:
|
|
|
|
|
+ continue
|
|
|
channel_ext_info = ext_json[channel_content_id]
|
|
channel_ext_info = ext_json[channel_content_id]
|
|
|
filter_result = channel_ext_info.get("result", False)
|
|
filter_result = channel_ext_info.get("result", False)
|
|
|
rule_str = channel_ext_info.get("rule", "")
|
|
rule_str = channel_ext_info.get("rule", "")
|
|
@@ -66,11 +68,11 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
|
|
|
f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
|
|
f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
|
|
|
|
|
|
|
|
|
|
|
|
|
-def keywords_search(keywords: str, sort_type: str, account_id=None) -> List[ChannelSearchAndDetailDTO]:
|
|
|
|
|
|
|
+def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
|
|
|
search_config = DouYinSearchConfig(
|
|
search_config = DouYinSearchConfig(
|
|
|
search_content=keywords,
|
|
search_content=keywords,
|
|
|
sort_type=sort_type,
|
|
sort_type=sort_type,
|
|
|
- account_id=account_id
|
|
|
|
|
|
|
+ account_id=account,
|
|
|
)
|
|
)
|
|
|
return crawler_client.dou_yin_keywords_search(search_config, True, True)
|
|
return crawler_client.dou_yin_keywords_search(search_config, True, True)
|
|
|
|
|
|
|
@@ -80,7 +82,7 @@ def eval_expr(expr: str, context: dict) -> bool:
|
|
|
return bool(simple_eval(expr, names=context))
|
|
return bool(simple_eval(expr, names=context))
|
|
|
|
|
|
|
|
|
|
|
|
|
-def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
|
|
|
|
|
|
|
+def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
|
|
|
need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
|
|
need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
|
|
|
result_json = {}
|
|
result_json = {}
|
|
|
for key in need_copy_keys:
|
|
for key in need_copy_keys:
|
|
@@ -120,13 +122,14 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
|
|
|
fans_portrait['channelContentId'] = channel_content_id
|
|
fans_portrait['channelContentId'] = channel_content_id
|
|
|
ext_json['fanPortrait'] = fans_portrait
|
|
ext_json['fanPortrait'] = fans_portrait
|
|
|
|
|
|
|
|
- if (not content_detail) and (not fans_portrait):
|
|
|
|
|
- ext_json["result"] = False
|
|
|
|
|
- continue
|
|
|
|
|
rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
|
|
rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
|
|
|
ext_json['ruleContext'] = rule_context
|
|
ext_json['ruleContext'] = rule_context
|
|
|
ext_json['rule'] = rule_str
|
|
ext_json['rule'] = rule_str
|
|
|
|
|
|
|
|
|
|
+ if (not content_detail) or (not fans_portrait):
|
|
|
|
|
+ ext_json["result"] = False
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
if not rule_context:
|
|
if not rule_context:
|
|
|
cnt += 1
|
|
cnt += 1
|
|
|
continue
|
|
continue
|
|
@@ -136,17 +139,17 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, l
|
|
|
if result:
|
|
if result:
|
|
|
cnt += 1
|
|
cnt += 1
|
|
|
if cnt <= preFilterThreshold:
|
|
if cnt <= preFilterThreshold:
|
|
|
- log_json["result"] = False
|
|
|
|
|
- log_json['reason'] = '该关键词首页满足条件的视频数不足'
|
|
|
|
|
|
|
+ result_json["result"] = False
|
|
|
|
|
+ result_json['reason'] = '该关键词首页满足条件的视频数不足'
|
|
|
|
|
|
|
|
- return {}
|
|
|
|
|
|
|
+ return result_json
|
|
|
|
|
|
|
|
|
|
|
|
|
def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
|
|
def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
|
|
|
"""
|
|
"""
|
|
|
未登录,综合排序
|
|
未登录,综合排序
|
|
|
"""
|
|
"""
|
|
|
- account_id = 0
|
|
|
|
|
|
|
+ account_id = "0"
|
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
|
|
|
log_info_print(log_json, account_id=account_id)
|
|
log_info_print(log_json, account_id=account_id)
|
|
|
|
|
|
|
@@ -155,7 +158,7 @@ def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], f
|
|
|
"""
|
|
"""
|
|
|
登录,综合排序
|
|
登录,综合排序
|
|
|
"""
|
|
"""
|
|
|
- account_id = 771431186
|
|
|
|
|
|
|
+ account_id = "771431186"
|
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
|
|
|
log_info_print(log_json, account_id=account_id)
|
|
log_info_print(log_json, account_id=account_id)
|
|
|
|
|
|
|
@@ -164,15 +167,15 @@ def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: L
|
|
|
"""
|
|
"""
|
|
|
登录状态,最多点赞
|
|
登录状态,最多点赞
|
|
|
"""
|
|
"""
|
|
|
- account_id = 771431186
|
|
|
|
|
|
|
+ account_id = "771431186"
|
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
|
|
log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
|
|
|
log_info_print(log_json, account_id=account_id)
|
|
log_info_print(log_json, account_id=account_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
def handle_log_json(log_json: dict[str, Any]):
|
|
def handle_log_json(log_json: dict[str, Any]):
|
|
|
- log_info_print(log_json)
|
|
|
|
|
|
|
+ # 登录,综合排序
|
|
|
|
|
+ # log_info_print(log_json)
|
|
|
|
|
|
|
|
- # 未登录,最多点赞
|
|
|
|
|
keywords = log_json['keywords']
|
|
keywords = log_json['keywords']
|
|
|
account_filters = json.loads(log_json.get("accountFilters", "[]"))
|
|
account_filters = json.loads(log_json.get("accountFilters", "[]"))
|
|
|
content_filters = json.loads(log_json.get("contentFilters", '[]'))
|
|
content_filters = json.loads(log_json.get("contentFilters", '[]'))
|
|
@@ -180,13 +183,13 @@ def handle_log_json(log_json: dict[str, Any]):
|
|
|
for filter_item in account_filters + content_filters:
|
|
for filter_item in account_filters + content_filters:
|
|
|
search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
|
|
search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
|
|
|
|
|
|
|
|
- keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
|
|
- keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
|
|
|
|
+ # keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
|
|
+ # keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
|
|
keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
def main():
|
|
|
- file_path = "/Users/zhao/Downloads/keywords_filter_test_sample.json"
|
|
|
|
|
|
|
+ file_path = '/Users/zhao/Desktop/keywords.json'
|
|
|
log_list = []
|
|
log_list = []
|
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
|
line = f.readline()
|
|
line = f.readline()
|
|
@@ -196,7 +199,8 @@ def main():
|
|
|
|
|
|
|
|
log_info_print_title()
|
|
log_info_print_title()
|
|
|
for log in log_list:
|
|
for log in log_list:
|
|
|
- handle_log_json(log)
|
|
|
|
|
|
|
+ if "“揭秘开国 领导人" == log['keywords']:
|
|
|
|
|
+ handle_log_json(log)
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|