|
|
@@ -1,4 +1,6 @@
|
|
|
import json
|
|
|
+import os.path
|
|
|
+import time
|
|
|
from typing import List, Any, Optional
|
|
|
|
|
|
from simpleeval import simple_eval
|
|
|
@@ -9,9 +11,9 @@ from util.automation_provide_util import AutoProvideUtil
|
|
|
|
|
|
crawler_client = CrawlerClient()
|
|
|
|
|
|
-preFilterThreshold = 3
|
|
|
+preFilterThreshold = 2
|
|
|
|
|
|
-result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析_20260204_1.txt'
|
|
|
+result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词供给任务分析/关键词分析_20260226.txt'
|
|
|
|
|
|
|
|
|
def write_result_file(content, mode='a+'):
|
|
|
@@ -21,8 +23,7 @@ def write_result_file(content, mode='a+'):
|
|
|
|
|
|
|
|
|
def log_info_print_title():
|
|
|
- write_result_file(
|
|
|
- "视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
|
|
|
+ write_result_file("品类,标题,videoid,解析的关键词,抖音搜索首页结果初判是否通过,搜索视频链接,热点宝账号链接,点赞量,分享量,分享比点赞,时长,账号50以上占比,账号50以上TGI", 'w')
|
|
|
|
|
|
|
|
|
def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
|
|
|
@@ -32,17 +33,18 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
|
|
|
if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
|
|
|
log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
|
|
|
|
|
|
- video_id = log_json["videoId"]
|
|
|
- keywords = log_json['keywords']
|
|
|
+ video_id = log_json.get("videoId", "-1")
|
|
|
+ keywords = log_json.get('keywords', "")
|
|
|
+ pq_title = log_json.get("pqTitle", "")
|
|
|
crawler_plan_id = log_json.get("crawlerPlanId", "")
|
|
|
result = log_json.get("result", False)
|
|
|
reason = log_json.get("reason", "")
|
|
|
- merge_cate2 = log_json['mergeSecondLevelCate']
|
|
|
+ merge_cate2 = log_json.get('mergeSecondLevelCate')
|
|
|
sort_type = json.loads(log_json.get("modelValueConfig", "{}")).get("sortType")
|
|
|
ext_json = json.loads(log_json.get("ext", "{}"))
|
|
|
account_id = account_id if account_id else 0
|
|
|
if not ext_json:
|
|
|
- write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
|
|
|
+        write_result_file(f"{merge_cate2},{pq_title},{video_id},{keywords},False")
|
|
|
return
|
|
|
for channel_content_id in ext_json:
|
|
|
if channel_content_id in ['mergeCate2Map']:
|
|
|
@@ -64,8 +66,10 @@ def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
|
|
|
elif "fanPortrait" in channel_ext_info:
|
|
|
channel_account_id = channel_ext_info["fanPortrait"].get("channelAccountId")
|
|
|
|
|
|
- write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type},'{channel_content_id},{channel_account_id},{filter_result},"
|
|
|
- f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
|
|
|
+ write_result_file(
|
|
|
+ f"{merge_cate2},{pq_title},{video_id},{keywords},{filter_result},{channel_content_id},{channel_account_id},"
|
|
|
+ f"{like_cnt},{share_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi}"
|
|
|
+ )
|
|
|
|
|
|
|
|
|
def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
|
|
|
@@ -83,7 +87,7 @@ def eval_expr(expr: str, context: dict) -> bool:
|
|
|
|
|
|
|
|
|
def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
|
|
|
- need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
|
|
|
+ need_copy_keys = ["keywords"]
|
|
|
result_json = {}
|
|
|
for key in need_copy_keys:
|
|
|
result_json[key] = log_json.get(key)
|
|
|
@@ -95,7 +99,14 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, l
|
|
|
|
|
|
rule_str = AutoProvideUtil.parse_filter_config_to_rule_str(filters)
|
|
|
|
|
|
- channel_search_and_detail_dtos = keywords_search(keywords, sort_type, account_id)
|
|
|
+ channel_search_and_detail_dtos = []
|
|
|
+
|
|
|
+ for i in range(5):
|
|
|
+ channel_search_and_detail_dtos = keywords_search(keywords, sort_type, account_id)
|
|
|
+ if channel_search_and_detail_dtos:
|
|
|
+ break
|
|
|
+ time.sleep(5)
|
|
|
+
|
|
|
if not channel_search_and_detail_dtos:
|
|
|
result_json["result"] = False
|
|
|
result_json['reason'] = '关键词搜索结果为空'
|
|
|
@@ -134,11 +145,16 @@ def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, l
|
|
|
cnt += 1
|
|
|
continue
|
|
|
|
|
|
- result = eval_expr(expr=rule_str, context=rule_context)
|
|
|
+ result = False
|
|
|
+ try:
|
|
|
+ result = eval_expr(expr=rule_str, context=rule_context)
|
|
|
+ except Exception as e:
|
|
|
+        print(rule_str, rule_context, e)
|
|
|
+
|
|
|
ext_json['result'] = result
|
|
|
if result:
|
|
|
cnt += 1
|
|
|
- if cnt <= preFilterThreshold:
|
|
|
+ if cnt < preFilterThreshold:
|
|
|
result_json["result"] = False
|
|
|
result_json['reason'] = '该关键词首页满足条件的视频数不足'
|
|
|
|
|
|
@@ -174,7 +190,7 @@ def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: L
|
|
|
|
|
|
def handle_log_json(log_json: dict[str, Any]):
|
|
|
# 登录,综合排序
|
|
|
- # log_info_print(log_json)
|
|
|
+ log_info_print(log_json)
|
|
|
|
|
|
keywords = log_json['keywords']
|
|
|
account_filters = json.loads(log_json.get("accountFilters", "[]"))
|
|
|
@@ -185,11 +201,14 @@ def handle_log_json(log_json: dict[str, Any]):
|
|
|
|
|
|
# keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
# keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
|
|
|
- keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
|
|
|
+ # keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
|
|
|
|
|
|
|
|
|
def main():
|
|
|
- file_path = '/Users/zhao/Desktop/keywords.json'
|
|
|
+ if os.path.exists(result_txt_file):
|
|
|
+ os.remove(result_txt_file)
|
|
|
+
|
|
|
+ file_path = '/Users/zhao/Desktop/20260226_keywords_job.json'
|
|
|
log_list = []
|
|
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
|
line = f.readline()
|
|
|
@@ -199,9 +218,39 @@ def main():
|
|
|
|
|
|
log_info_print_title()
|
|
|
for log in log_list:
|
|
|
- if "“揭秘开国 领导人" == log['keywords']:
|
|
|
+ if "历史名人" == log.get("mergeSecondLevelCate", ""):
|
|
|
handle_log_json(log)
|
|
|
|
|
|
+ search_filter_config_tems = []
|
|
|
+ account_filters = json.loads(log_list[0].get("accountFilters", "[]"))
|
|
|
+ content_filters = json.loads(log_list[0].get("contentFilters", '[]'))
|
|
|
+ for filter_item in account_filters + content_filters:
|
|
|
+ search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
|
|
|
+
|
|
|
+ keywords_list = [
|
|
|
+ "大字报式",
|
|
|
+ "情感化",
|
|
|
+ "事迹",
|
|
|
+ "伟人",
|
|
|
+ "毛主席",
|
|
|
+ "历史人物",
|
|
|
+ "民族自豪感",
|
|
|
+ "轶事",
|
|
|
+ "爱国主义",
|
|
|
+ "800万",
|
|
|
+ "物证",
|
|
|
+ "刘永坦院士",
|
|
|
+ "生活细节",
|
|
|
+ "历史人物评价",
|
|
|
+ "先驱先烈",
|
|
|
+ "科研国士",
|
|
|
+ "开国将帅"
|
|
|
+ ]
|
|
|
+ account_id = '771431206'
|
|
|
+ for keyword in keywords_list:
|
|
|
+ log_json = keywords_search_and_filter(keywords=keyword, sort_type="综合排序", account_id=account_id, log_json={"keywords": keyword}, filters=search_filter_config_tems)
|
|
|
+ log_info_print(log_json, account_id=account_id)
|
|
|
+
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|