__init__.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. from datetime import datetime
  6. from applications.feishu import bot
  7. from applications.const import server_const
  8. from applications.functions.video_item import VideoProducer
  9. from applications.log import logging
  10. from applications.match_algorithm import title_similarity_with_nlp
  11. from .spiderAB import SearchABTest
  12. from .spiderSchedule import SearchMethod
  13. async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
  14. similarity_score):
  15. """
  16. 异步处理微信 video_obj
  17. 公众号和站内账号一一对应
  18. :param similarity_score:
  19. :param crawler_video_table: 爬虫表
  20. :param db_client: mysql
  21. :param content_id:
  22. :param platform:
  23. :param user:
  24. :param trace_id:
  25. :param video_obj:
  26. :return:
  27. """
  28. Video = VideoProducer()
  29. if platform == "xg_search":
  30. mq_obj = Video.xg_video_produce(
  31. video_obj=video_obj,
  32. user=user,
  33. trace_id=trace_id,
  34. )
  35. elif platform == "baidu_search":
  36. mq_obj = Video.baidu_video_produce(
  37. video_obj=video_obj,
  38. user=user,
  39. trace_id=trace_id,
  40. )
  41. elif platform == "wx_search":
  42. mq_obj = Video.wx_video_produce(
  43. video_obj=video_obj,
  44. user=user,
  45. trace_id=trace_id,
  46. )
  47. elif platform == "dy_search":
  48. mq_obj = Video.dy_video_produce(
  49. video_obj=video_obj,
  50. user=user,
  51. trace_id=trace_id,
  52. )
  53. else:
  54. mq_obj = {}
  55. mq_obj['trace_id'] = trace_id
  56. mq_obj['content_id'] = content_id
  57. insert_sql = f"""
  58. INSERT INTO {crawler_video_table}
  59. (content_id, out_video_id, platform, video_title, play_count, like_count, publish_time, crawler_time, duration, video_url, cover_url, user_id, trace_id, score, score_version)
  60. values
  61. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  62. """
  63. await db_client.async_insert(
  64. sql=insert_sql,
  65. params=(
  66. content_id,
  67. mq_obj['video_id'],
  68. platform,
  69. mq_obj['video_title'],
  70. mq_obj['play_cnt'],
  71. mq_obj['like_cnt'],
  72. datetime.fromtimestamp(mq_obj['publish_timestamp']).strftime('%Y-%m-%d %H:%M:%S'),
  73. datetime.now().__str__(),
  74. mq_obj['duration'],
  75. mq_obj['video_url'],
  76. mq_obj['cover_url'],
  77. mq_obj['user_id'],
  78. trace_id,
  79. similarity_score,
  80. server_const.NLP_VERSION
  81. )
  82. )
  83. async def search_videos_from_web(info, gh_id_map, db_client):
  84. """
  85. search and send msg to ETL
  86. :param db_client:
  87. :param gh_id_map:
  88. :param info:
  89. :return:
  90. """
  91. default_account_id = 69637498
  92. search_AB = SearchABTest(info=info, searchMethod=SearchMethod())
  93. # 启三个搜索,每个搜索都保证要搜索到, 分别用key1, key2, key3去搜索
  94. trace_id = info['trace_id']
  95. gh_id = info['gh_id']
  96. content_id = info['content_id']
  97. recall_list = await search_AB.ab_6()
  98. logging(
  99. code="1006",
  100. info="搜索到{}条视频".format(len(recall_list)),
  101. data=recall_list,
  102. trace_id=info['trace_id']
  103. )
  104. # 按照标题相似度排序
  105. ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1],
  106. recall_list=recall_list)
  107. ranked_list = ranked_result['result']
  108. if recall_list and not ranked_list:
  109. bot(
  110. title="NLP服务请求失败",
  111. detail={
  112. "trace_id": info['trace_id']
  113. },
  114. mention=False
  115. )
  116. for recall_obj in ranked_list:
  117. if recall_obj:
  118. platform = recall_obj['platform']
  119. recall_video = recall_obj['result']
  120. score = recall_obj['score']
  121. # 过滤掉nlp分低于0.45的
  122. if score < server_const.NLP_SIMILARITY_THRESHOLD:
  123. continue
  124. if recall_video:
  125. await save_video_to_mysql(
  126. video_obj=recall_video,
  127. user=gh_id_map.get(gh_id, default_account_id),
  128. trace_id=trace_id,
  129. platform=platform,
  130. content_id=content_id,
  131. crawler_video_table=info['crawler_video_table'],
  132. db_client=db_client,
  133. similarity_score=score
  134. )
  135. return len(ranked_list)