# dou_yin_keywords_search.py
  1. import json
  2. import os.path
  3. import time
  4. from typing import List, Any, Optional
  5. from simpleeval import simple_eval
  6. from client.CrawlerClient import CrawlerClient
  7. from model.automation_provide_job import DouYinSearchConfig, ChannelSearchAndDetailDTO, SearchFilterConfigItem
  8. from util.automation_provide_util import AutoProvideUtil
# Shared crawler client instance used by every search helper in this script.
crawler_client = CrawlerClient()
# Minimum number of first-page videos that must pass the filter rules for a
# keyword to be judged usable (see keywords_search_and_filter).
preFilterThreshold = 2
# Destination CSV-style analysis file: one header row plus one row per searched video.
result_txt_file = '/Users/zhao/Desktop/tzld/文档/分析文档/关键词供给任务分析/关键词分析_20260226.txt'
  12. def write_result_file(content, mode='a+'):
  13. with open(result_txt_file, mode) as f:
  14. f.write(content)
  15. f.write("\n")
  16. def log_info_print_title():
  17. write_result_file("品类,标题,videoid,解析的关键词,抖音搜索首页结果初判是否通过,搜索视频链接,热点宝账号链接,点赞量,分享量,分享比点赞,时长,账号50以上占比,账号50以上TGI", 'w')
  18. def log_info_print(log_json: dict[str, Any], account_id: Optional[str] = None):
  19. if 'ext' in log_json and isinstance(log_json['ext'], dict):
  20. log_json['ext'] = json.dumps(log_json['ext'], ensure_ascii=False)
  21. if 'modelValueConfig' in log_json and isinstance(log_json['modelValueConfig'], dict):
  22. log_json['modelValueConfig'] = json.dumps(log_json['modelValueConfig'], ensure_ascii=False)
  23. video_id = log_json.get("videoId", "-1")
  24. keywords = log_json.get('keywords', "")
  25. pq_title = log_json.get("pqTitle", "")
  26. crawler_plan_id = log_json.get("crawlerPlanId", "")
  27. result = log_json.get("result", False)
  28. reason = log_json.get("reason", "")
  29. merge_cate2 = log_json.get('mergeSecondLevelCate')
  30. sort_type = json.loads(log_json.get("modelValueConfig", "{}")).get("sortType")
  31. ext_json = json.loads(log_json.get("ext", "{}"))
  32. account_id = account_id if account_id else 0
  33. if not ext_json:
  34. write_result_file(f"{merge_cate2},{pq_title},{video_id}.{keywords},False")
  35. return
  36. for channel_content_id in ext_json:
  37. if channel_content_id in ['mergeCate2Map']:
  38. continue
  39. channel_ext_info = ext_json[channel_content_id]
  40. filter_result = channel_ext_info.get("result", False)
  41. rule_str = channel_ext_info.get("rule", "")
  42. rule_context = channel_ext_info.get('ruleContext', {})
  43. share_cnt = rule_context.get('shareCnt', 0)
  44. video_duration_s = rule_context.get('videoDuration_s', 0)
  45. like_cnt = rule_context.get('likeCnt', 0)
  46. audience_age_50_rate = rule_context.get('audienceAge50Rate', 0)
  47. audience_age_50_tgi = rule_context.get('audienceAge50TGI', 0)
  48. share_div_link = rule_context.get('shareDivLink', 0)
  49. channel_account_id = ""
  50. if "contentDetail" in channel_ext_info:
  51. channel_account_id = channel_ext_info["contentDetail"].get("channelAccountId")
  52. elif "fanPortrait" in channel_ext_info:
  53. channel_account_id = channel_ext_info["fanPortrait"].get("channelAccountId")
  54. write_result_file(
  55. f"{merge_cate2},{pq_title},{video_id},{keywords},{filter_result},{channel_content_id},{channel_account_id},"
  56. f"{like_cnt},{share_cnt},{share_div_link},{video_duration_s},{audience_age_50_rate},{audience_age_50_tgi}"
  57. )
  58. def keywords_search(keywords: str, sort_type: str, account: Optional[str] = None) -> List[ChannelSearchAndDetailDTO]:
  59. search_config = DouYinSearchConfig(
  60. search_content=keywords,
  61. sort_type=sort_type,
  62. account_id=account,
  63. )
  64. return crawler_client.dou_yin_keywords_search(search_config, True, True)
  65. def eval_expr(expr: str, context: dict) -> bool:
  66. expr = expr.replace("&&", " and ").replace("||", " or ")
  67. return bool(simple_eval(expr, names=context))
def keywords_search_and_filter(keywords: str, sort_type: str, account_id: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]) -> dict[str, Any]:
    """Search Douyin for *keywords*, evaluate the filter rules against each
    first-page hit, and return a result/log dict.

    The returned dict mirrors the job-log shape consumed by log_info_print:
    'result' is True when at least ``preFilterThreshold`` videos pass the
    rules, 'ext' maps channel_content_id -> per-video evaluation details, and
    'reason' explains a False result.
    """
    # Copy only the keys the log output needs from the incoming record.
    need_copy_keys = ["keywords"]
    result_json = {}
    for key in need_copy_keys:
        result_json[key] = log_json.get(key)
    log_ext_info = {}
    result_json['ext'] = log_ext_info
    result_json['result'] = True
    result_json['modelValueConfig'] = {"sortType": sort_type}
    # Compile the filter config items into a single boolean rule expression.
    rule_str = AutoProvideUtil.parse_filter_config_to_rule_str(filters)
    channel_search_and_detail_dtos = []
    # Retry the search up to 5 times, waiting 5s between empty results
    # (crawler responses are flaky; NOTE: it also sleeps after the final
    # failed attempt).
    for i in range(5):
        channel_search_and_detail_dtos = keywords_search(keywords, sort_type, account_id)
        if channel_search_and_detail_dtos:
            break
        time.sleep(5)
    if not channel_search_and_detail_dtos:
        result_json["result"] = False
        result_json['reason'] = '关键词搜索结果为空'
        return result_json
    # cnt = number of first-page videos that satisfy the rule.
    cnt = 0
    for channel_search_and_detail_dto in channel_search_and_detail_dtos:
        channel_content_id = channel_search_and_detail_dto.channel_content_id
        channel_account_id = channel_search_and_detail_dto.channel_account_id
        content_detail = channel_search_and_detail_dto.content_detail
        fans_portrait = channel_search_and_detail_dto.fans_portrait
        # Per-video evaluation record, keyed by content id in the log output.
        ext_json = {}
        log_ext_info[channel_content_id] = ext_json
        if content_detail:
            content_detail['channelAccountId'] = channel_account_id
            content_detail['channelContentId'] = channel_content_id
            ext_json['contentDetail'] = content_detail
        if fans_portrait:
            fans_portrait['channelAccountId'] = channel_account_id
            fans_portrait['channelContentId'] = channel_content_id
            ext_json['fanPortrait'] = fans_portrait
        # Extract the numeric features (likeCnt, shareCnt, duration, audience
        # age stats, ...) that the rule expression references.
        rule_context = AutoProvideUtil.extract_content_rule_feature(content_detail=content_detail, fans_portrait=fans_portrait)
        ext_json['ruleContext'] = rule_context
        ext_json['rule'] = rule_str
        # Missing either payload -> cannot evaluate; mark failed and move on.
        if (not content_detail) or (not fans_portrait):
            ext_json["result"] = False
            continue
        # An empty rule context is counted as a pass (no features to reject on).
        if not rule_context:
            cnt += 1
            continue
        result = False
        try:
            result = eval_expr(expr=rule_str, context=rule_context)
        except Exception as e:
            # Best-effort: a malformed rule/context is logged and treated as
            # a failed evaluation rather than aborting the whole keyword.
            print(rule_str, rule_context)
        ext_json['result'] = result
        if result:
            cnt += 1
    if cnt < preFilterThreshold:
        result_json["result"] = False
        result_json['reason'] = '该关键词首页满足条件的视频数不足'
    # Returned unconditionally so callers always receive the log dict.
    return result_json
  125. def keywords_not_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  126. """
  127. 未登录,综合排序
  128. """
  129. account_id = "0"
  130. log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
  131. log_info_print(log_json, account_id=account_id)
  132. def keywords_login_comprehensive_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  133. """
  134. 登录,综合排序
  135. """
  136. account_id = "771431186"
  137. log_json = keywords_search_and_filter(keywords=keywords, sort_type="综合排序", account_id=account_id, log_json=log_json, filters=filters)
  138. log_info_print(log_json, account_id=account_id)
  139. def keywords_login_like_sort(keywords: str, log_json: dict[str, Any], filters: List[SearchFilterConfigItem]):
  140. """
  141. 登录状态,最多点赞
  142. """
  143. account_id = "771431186"
  144. log_json = keywords_search_and_filter(keywords=keywords, sort_type="最多点赞", account_id=account_id, log_json=log_json, filters=filters)
  145. log_info_print(log_json, account_id=account_id)
  146. def handle_log_json(log_json: dict[str, Any]):
  147. # 登录,综合排序
  148. log_info_print(log_json)
  149. keywords = log_json['keywords']
  150. account_filters = json.loads(log_json.get("accountFilters", "[]"))
  151. content_filters = json.loads(log_json.get("contentFilters", '[]'))
  152. search_filter_config_tems = []
  153. for filter_item in account_filters + content_filters:
  154. search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
  155. # keywords_not_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
  156. # keywords_login_comprehensive_sort(keywords, log_json, search_filter_config_tems)
  157. # keywords_login_like_sort(keywords, log_json, search_filter_config_tems)
def main():
    """Load the exported keyword-job log, then search-and-filter a hand-picked
    keyword list, writing all results to the analysis CSV file."""
    # Start from a clean output file.
    if os.path.exists(result_txt_file):
        os.remove(result_txt_file)
    # Exported job log: one JSON record per line (JSONL).
    file_path = '/Users/zhao/Desktop/20260226_keywords_job.json'
    log_list = []
    with open(file_path, "r", encoding="utf-8") as f:
        line = f.readline()
        while line:
            log_list.append(json.loads(line))
            line = f.readline()
    log_info_print_title()
    # Pass 1: dump records of the "历史名人" category to the result file.
    for log in log_list:
        if "历史名人" == log.get("mergeSecondLevelCate", ""):
            handle_log_json(log)
    # Pass 2: run live searches for a manual keyword list, reusing the filter
    # config carried by the first log record.
    search_filter_config_tems = []
    account_filters = json.loads(log_list[0].get("accountFilters", "[]"))
    content_filters = json.loads(log_list[0].get("contentFilters", '[]'))
    for filter_item in account_filters + content_filters:
        search_filter_config_tems.append(SearchFilterConfigItem(**filter_item))
    keywords_list = [
        "大字报式",
        "情感化",
        "事迹",
        "伟人",
        "毛主席",
        "历史人物",
        "民族自豪感",
        "轶事",
        "爱国主义",
        "800万",
        "物证",
        "刘永坦院士",
        "生活细节",
        "历史人物评价",
        "先驱先烈",
        "科研国士",
        "开国将帅"
    ]
    account_id = '771431206'
    for keyword in keywords_list:
        log_json = keywords_search_and_filter(keywords=keyword, sort_type="综合排序", account_id=account_id, log_json={"keywords": keyword}, filters=search_filter_config_tems)
        log_info_print(log_json, account_id=account_id)
if __name__ == '__main__':
    main()