dou_yin_keywords_search.py 8.4 KB


  1. import json
  2. from typing import List, Any, Optional
  3. from simpleeval import simple_eval
  4. from client.CrawlerClient import CrawlerClient
  5. from model.automation_provide_job import DouYinSearchConfig, ChannelSearchAndDetailDTO, SearchFilterConfigItem
  6. from util.automation_provide_util import AutoProvideUtil
# Shared crawler client instance used by keywords_search for every search request.
crawler_client = CrawlerClient()
# Threshold for the per-keyword count of passing videos; keywords_search_and_filter
# marks a keyword as failed when the count is <= this value.
preFilterThreshold = 3
# Absolute path of the text file that write_result_file writes the CSV-style rows to.
result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词分析.txt'
  10. def write_result_file(content, mode='a+'):
  11. with open(result_txt_file, mode) as f:
  12. f.write(content)
  13. f.write("\n")
  14. def log_info_print_title():
  15. write_result_file(
  16. "视频ID,品类,关键词,爬取计划,结果,原因,搜索使用的账号ID,排序方式,站外视频ID,站外账号ID,过滤结果,分享量,点赞量,分享量/点赞量,视频时长(秒),观众年龄50+占比,观众年龄50+TGI,过滤规则表达式", 'w')
  17. def log_info_print(log_json: dict[str, Any], account_id: Optional[int] = None):
  18. video_id = log_json["videoId"]
  19. keywords = log_json['keywords']
  20. crawler_plan_id = log_json.get("crawlerPlanId", "")
  21. result = log_json.get("result", False)
  22. reason = log_json.get("reason", "")
  23. merge_cate2 = log_json['mergeSecondLevelCate']
  24. sort_type = json.loads(log_json.get("modelValueConfig", "{}")).get("sortType")
  25. ext_json = json.loads(log_json.get("ext", "{}"))
  26. account_id = account_id if account_id else 0
  27. if not ext_json:
  28. write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type}")
  29. return
  30. for channel_content_id in ext_json:
  31. channel_ext_info = ext_json[channel_content_id]
  32. filter_result = channel_ext_info.get("result", False)
  33. rule_str = channel_ext_info.get("rule", "")
  34. rule_context = channel_ext_info.get('ruleContext', {})
  35. share_cnt = rule_context.get('shareCnt', 0)
  36. video_duration_s = rule_context.get('videoDuration_s', 0)
  37. like_cnt = rule_context.get('likeCnt', 0)
  38. audience_age_50_rate = rule_context.get('audienceAge50Rate', 0)
  39. audience_age_50_tgi = rule_context.get('audienceAge50TGI', 0)
  40. share_div_link = rule_context.get('shareDivLink', 0)
  41. channel_account_id = ""
  42. if "contentDetail" in channel_ext_info:
  43. channel_account_id = channel_ext_info["contentDetail"].get("channelAccountId")
  44. elif "fanPortrait" in channel_ext_info:
  45. channel_account_id = channel_ext_info["fanPortrait"].get("channelAccountId")
  46. write_result_file(f"{video_id},{merge_cate2},{keywords},'{crawler_plan_id},'{result},{reason},{account_id},{sort_type},'{channel_content_id},{channel_account_id},{filter_result},"
  47. f"{share_cnt},{like_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi},{rule_str}")
  48. def keywords_search(keywords: str, sort_type: str, account_id=None) -> List[ChannelSearchAndDetailDTO]:
  49. search_config = DouYinSearchConfig(
  50. search_content=keywords,
  51. sort_type=sort_type,
  52. account_id=account_id
  53. )
  54. return crawler_client.dou_yin_keywords_search(search_config, True, True)
  55. def eval_expr(expr: str, context: dict) -> bool:
  56. expr = expr.replace("&&", " and ").replace("||", " or ")
  57. return bool(simple_eval(expr, names=context))
  58. def keywords_search_and_filter(keywords: str, sort_type: str, account_id: int, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
  59. need_copy_keys = ["videoId", "accountFilters", "contentFilters", "mergeSecondLevelCate", "keywords"]
  60. result_json = {}
  61. for key in need_copy_keys:
  62. result_json[key] = log_json.get(key)
  63. log_ext_info = {}
  64. result_json['ext'] = log_ext_info
  65. result_json['result'] = True
  66. result_json['modelValueConfig'] = {"sortType": sort_type}
  67. rule_str = AutoProvideUtil.parse_filter_config_to_rule_str(filters)
  68. channel_search_and_detail_dtos = keywords_search(keywords, sort_type, account_id)
  69. if not channel_search_and_detail_dtos:
  70. result_json["result"] = False
  71. result_json['reason'] = '关键词搜索结果为空'
  72. return result_json
  73. cnt = 0
  74. for channel_search_and_detail_dto in channel_search_and_detail_dtos:
  75. channel_content_id = channel_search_and_detail_dto.channel_content_id
  76. channel_account_id = channel_search_and_detail_dto.channel_account_id
  77. content_detail = channel_search_and_detail_dto.content_detail
  78. fans_portrait = channel_search_and_detail_dto.fans_portrait
  79. ext_json = {}
  80. log_ext_info[channel_content_id] = ext_json
  81. if content_detail:
  82. content_detail['channelAccountId'] = channel_account_id
  83. content_detail['channelContentId'] = channel_content_id
  84. ext_json['contentDetail'] = content_detail
  85. if fans_portrait:
  86. fans_portrait['channelAccountId'] = channel_account_id
  87. fans_portrait['channelContentId'] = channel_content_id
  88. ext_json['fanPortrait'] = fans_portrait
  89. if (not content_detail) and (not fans_portrait):
  90. ext_json["result"] = False
  91. continue
  92. rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
  93. ext_json['ruleContext'] = rule_context
  94. ext_json['rule'] = rule_str
  95. if not rule_context:
  96. cnt += 1
  97. continue
  98. result = eval_expr(expr=rule_str, context=rule_context)
  99. ext_json['result'] = result
  100. if result:
  101. cnt += 1
  102. if cnt <= preFilterThreshold:
  103. log_json["result"] = False
  104. log_json['reason'] = '该关键词首页满足条件的视频数不足'
  105. return {}
  106. def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  107. """
  108. 未登录,综合排序
  109. """
  110. account_id = 0
  111. log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
  112. log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
  113. log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
  114. log_info_print(log_json, account_id=account_id)
  115. def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  116. """
  117. 登录,综合排序
  118. """
  119. account_id = 771431186
  120. log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
  121. log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
  122. log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
  123. log_info_print(log_json, account_id=account_id)
  124. def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  125. """
  126. 登录状态,最多点赞
  127. """
  128. account_id = 771431186
  129. log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
  130. log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
  131. log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
  132. log_info_print(log_json, account_id=account_id)
  133. def handle_log_json(log_json: dict[str, Any]):
  134. log_info_print(log_json)
  135. # 未登录,最多点赞
  136. keywords = log_json['keywords']
  137. account_filters = json.loads(log_json.get("accountFilters", "[]"))
  138. content_filters = json.loads(log_json.get("contentFilters", '[]'))
  139. search_filter_config_tems = []
  140. for filter_item in account_filters + content_filters:
  141. search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
  142. keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
  143. keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
  144. keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
  145. def main():
  146. file_path = "/Users/zhao/Downloads/keywords_filter_test_sample.json"
  147. log_list = []
  148. with open(file_path, "r", encoding="utf-8") as f:
  149. line = f.readline()
  150. while line:
  151. log_list.append(json.loads(line))
  152. line = f.readline()
  153. log_info_print_title()
  154. for log in log_list:
  155. handle_log_json(log)
# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()