__init__.py

"""
@author: luojunhui
"""
import json
from datetime import datetime

from applications.feishu import bot
from applications.const import server_const
from applications.functions.video_item import VideoProducer
from applications.log import logging
from applications.match_algorithm import title_similarity_with_nlp

from .spiderAB import SearchABTest
from .spiderSchedule import SearchMethod


async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
                              similarity_score):
    """
    Asynchronously process a WeChat video_obj.
    Official accounts map one-to-one to in-site accounts.
    :param similarity_score: NLP title-similarity score of the video
    :param crawler_video_table: crawler video table name
    :param db_client: MySQL client
    :param content_id:
    :param platform:
    :param user:
    :param trace_id:
    :param video_obj:
    :return:
    """
    video_producer = VideoProducer()
    if platform == "xg_search":
        mq_obj = video_producer.xg_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "baidu_search":
        mq_obj = video_producer.baidu_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "wx_search":
        mq_obj = video_producer.wx_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "dy_search":
        mq_obj = video_producer.dy_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    else:
        # Unrecognized platform: no video fields are available downstream
        mq_obj = {}
    mq_obj['trace_id'] = trace_id
    mq_obj['content_id'] = content_id
    out_video_id = mq_obj['video_id']
    # Check whether this content_id already has this out_video_id recorded
    select_sql = f"""
        SELECT id
        FROM {crawler_video_table}
        WHERE out_video_id = '{out_video_id}' and content_id = '{content_id}';
    """
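    # Hardening sketch (not part of the original code): the duplicate check above
    # interpolates values directly into the SQL text. If db_client.async_select
    # accepts bound parameters (an assumption about this client's interface, not a
    # confirmed API), the same check could be written as:
    #
    #   dedup_sql = f"""
    #       SELECT id
    #       FROM {crawler_video_table}
    #       WHERE out_video_id = %s AND content_id = %s;
    #   """
    #   result = await db_client.async_select(dedup_sql, params=(out_video_id, content_id))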
    result = await db_client.async_select(select_sql)
    if result:
        # This video is already recorded for this content_id; skip the insert
        return
    insert_sql = f"""
        INSERT INTO {crawler_video_table}
        (content_id, out_video_id, platform, video_title, play_count, like_count, publish_time, crawler_time, duration, video_url, cover_url, user_id, trace_id, score, score_version)
        values
        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    await db_client.async_insert(
        sql=insert_sql,
        params=(
            content_id,
            mq_obj['video_id'],
            platform,
            mq_obj['video_title'],
            mq_obj['play_cnt'],
            mq_obj['like_cnt'],
            datetime.fromtimestamp(mq_obj['publish_timestamp']).strftime('%Y-%m-%d %H:%M:%S'),
            str(datetime.now()),
            mq_obj['duration'],
            mq_obj['video_url'],
            mq_obj['cover_url'],
            mq_obj['user_id'],
            trace_id,
            similarity_score,
            server_const.NLP_VERSION
        )
    )


async def search_videos_from_web(info, gh_id_map, db_client):
    """
    Search videos for an article and hand the matches on to ETL.
    :param db_client: MySQL client
    :param gh_id_map: mapping from gh_id to in-site account id
    :param info: article info (trace_id, gh_id, content_id, ori_title, crawler_video_table)
    :return: number of videos that were matched and saved
    """
    default_account_id = 69637498
    search_AB = SearchABTest(info=info, searchMethod=SearchMethod())
    # Run three searches, one for each of key1, key2 and key3; each search must return results
    trace_id = info['trace_id']
    gh_id = info['gh_id']
    content_id = info['content_id']
    recall_list = await search_AB.ab_6()
    logging(
        code="1006",
        info="found {} videos".format(len(recall_list)),
        data=recall_list,
        trace_id=info['trace_id']
    )
    # Rank the recalled videos by title similarity
    ranked_result = await title_similarity_with_nlp(
        content_title=info['ori_title'].split("@@")[-1],
        recall_list=recall_list
    )
    ranked_list = ranked_result['result']
    if recall_list and not ranked_list:
        # Videos were recalled but ranking returned nothing: the NLP service call failed
        bot(
            title="NLP service request failed",
            detail={
                "trace_id": info['trace_id']
            },
            mention=False
        )
    success_match_video_count = 0
    for recall_obj in ranked_list:
        if recall_obj:
            platform = recall_obj['platform']
            recall_video = recall_obj['result']
            score = recall_obj['score']
            # Filter out candidates whose NLP similarity score is below 0.55
            if score < server_const.NLP_SIMILARITY_THRESHOLD:
                continue
            if recall_video:
                await save_video_to_mysql(
                    video_obj=recall_video,
                    user=gh_id_map.get(gh_id, default_account_id),
                    trace_id=trace_id,
                    platform=platform,
                    content_id=content_id,
                    crawler_video_table=info['crawler_video_table'],
                    db_client=db_client,
                    similarity_score=score
                )
                success_match_video_count += 1
    return success_match_video_count
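

# Usage sketch (illustration only, not part of the original module). It shows how
# search_videos_from_web might be driven once a db_client and a gh_id -> account-id
# map are available; the info keys mirror the ones read above, while the concrete
# values, the table name and the db_client object are hypothetical.
#
#   import asyncio
#
#   async def run_once(db_client, gh_id_map):
#       info = {
#           "trace_id": "trace-demo-001",
#           "gh_id": "gh_demo",
#           "content_id": "content-demo-001",
#           "ori_title": "prefix@@Example article title",
#           "crawler_video_table": "crawler_video",
#       }
#       matched = await search_videos_from_web(info, gh_id_map, db_client)
#       print(f"matched {matched} videos for {info['trace_id']}")
#
#   asyncio.run(run_once(db_client=my_db_client, gh_id_map={}))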