__init__.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. """
  2. @author: luojunhui
  3. """
  4. from datetime import datetime
  5. from applications.feishu import bot
  6. from applications.const import server_const
  7. from applications.functions.video_item import VideoProducer
  8. from applications.log import logging
  9. from applications.match_algorithm import title_similarity_with_nlp
  10. from .spiderAB import SearchABTest
  11. from .spiderSchedule import SearchMethod
  12. async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client,
  13. similarity_score):
  14. """
  15. 异步处理微信 video_obj
  16. 公众号和站内账号一一对应
  17. :param similarity_score:
  18. :param crawler_video_table: 爬虫表
  19. :param db_client: mysql
  20. :param content_id:
  21. :param platform:
  22. :param user:
  23. :param trace_id:
  24. :param video_obj:
  25. :return:
  26. """
  27. Video = VideoProducer()
  28. if platform == "xg_search":
  29. mq_obj = Video.xg_video_produce(
  30. video_obj=video_obj,
  31. user=user,
  32. trace_id=trace_id,
  33. )
  34. elif platform == "baidu_search":
  35. mq_obj = Video.baidu_video_produce(
  36. video_obj=video_obj,
  37. user=user,
  38. trace_id=trace_id,
  39. )
  40. elif platform == "wx_search":
  41. mq_obj = Video.wx_video_produce(
  42. video_obj=video_obj,
  43. user=user,
  44. trace_id=trace_id,
  45. )
  46. elif platform == "dy_search":
  47. mq_obj = Video.dy_video_produce(
  48. video_obj=video_obj,
  49. user=user,
  50. trace_id=trace_id,
  51. )
  52. else:
  53. mq_obj = {}
  54. mq_obj['trace_id'] = trace_id
  55. mq_obj['content_id'] = content_id
  56. insert_sql = f"""
  57. INSERT INTO {crawler_video_table}
  58. (content_id, out_video_id, platform, video_title, play_count, like_count, publish_time, crawler_time, duration, video_url, cover_url, user_id, trace_id, score)
  59. values
  60. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  61. """
  62. await db_client.async_insert(
  63. sql=insert_sql,
  64. params=(
  65. content_id,
  66. mq_obj['video_id'],
  67. platform,
  68. mq_obj['video_title'],
  69. mq_obj['play_cnt'],
  70. mq_obj['like_cnt'],
  71. datetime.fromtimestamp(mq_obj['publish_timestamp']).strftime('%Y-%m-%d %H:%M:%S'),
  72. datetime.now().__str__(),
  73. mq_obj['duration'],
  74. mq_obj['video_url'],
  75. mq_obj['cover_url'],
  76. mq_obj['user_id'],
  77. trace_id,
  78. similarity_score
  79. )
  80. )
  81. async def search_videos_from_web(info, gh_id_map, db_client):
  82. """
  83. search and send msg to ETL
  84. :param db_client:
  85. :param gh_id_map:
  86. :param info:
  87. :return:
  88. """
  89. default_account_id = 69637498
  90. search_AB = SearchABTest(info=info, searchMethod=SearchMethod())
  91. # 启三个搜索,每个搜索都保证要搜索到, 分别用key1, key2, key3去搜索
  92. trace_id = info['trace_id']
  93. gh_id = info['gh_id']
  94. content_id = info['content_id']
  95. recall_list = await search_AB.ab_6()
  96. logging(
  97. code="1006",
  98. info="搜索到{}条视频".format(len(recall_list)),
  99. data=recall_list,
  100. trace_id=info['trace_id']
  101. )
  102. # 按照标题相似度排序
  103. ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1],
  104. recall_list=recall_list)
  105. ranked_list = ranked_result['result']
  106. if recall_list and not ranked_list:
  107. bot(
  108. title="NLP服务请求失败",
  109. detail={
  110. "trace_id": info['trace_id']
  111. },
  112. mention=False
  113. )
  114. for recall_obj in ranked_list:
  115. if recall_obj:
  116. platform = recall_obj['platform']
  117. recall_video = recall_obj['result']
  118. score = recall_obj['score']
  119. # 过滤掉nlp分低于0.45的
  120. if score < server_const.NLP_SIMILARITY_THRESHOLD:
  121. continue
  122. if recall_video:
  123. await save_video_to_mysql(
  124. video_obj=recall_video,
  125. user=gh_id_map.get(gh_id, default_account_id),
  126. trace_id=trace_id,
  127. platform=platform,
  128. content_id=content_id,
  129. crawler_video_table=info['crawler_video_table'],
  130. db_client=db_client,
  131. similarity_score=score
  132. )
  133. return len(ranked_list)