__init__.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. """
  2. @author: luojunhui
  3. """
  4. from datetime import datetime
  5. from applications.const import server_const
  6. from applications.functions.video_item import VideoProducer
  7. from applications.log import logging
  8. from applications.match_algorithm import title_similarity_with_nlp
  9. from .spiderAB import SearchABTest
  10. from .spiderSchedule import SearchMethod
  11. async def save_video_to_mysql(video_obj, user, trace_id, platform, content_id, crawler_video_table, db_client, similarity_score):
  12. """
  13. 异步处理微信 video_obj
  14. 公众号和站内账号一一对应
  15. :param similarity_score:
  16. :param crawler_video_table: 爬虫表
  17. :param db_client: mysql
  18. :param content_id:
  19. :param platform:
  20. :param user:
  21. :param trace_id:
  22. :param video_obj:
  23. :return:
  24. """
  25. Video = VideoProducer()
  26. if platform == "xg_search":
  27. mq_obj = Video.xg_video_produce(
  28. video_obj=video_obj,
  29. user=user,
  30. trace_id=trace_id,
  31. )
  32. elif platform == "baidu_search":
  33. mq_obj = Video.baidu_video_produce(
  34. video_obj=video_obj,
  35. user=user,
  36. trace_id=trace_id,
  37. )
  38. elif platform == "wx_search":
  39. mq_obj = Video.wx_video_produce(
  40. video_obj=video_obj,
  41. user=user,
  42. trace_id=trace_id,
  43. )
  44. elif platform == "dy_search":
  45. mq_obj = Video.dy_video_produce(
  46. video_obj=video_obj,
  47. user=user,
  48. trace_id=trace_id,
  49. )
  50. else:
  51. mq_obj = {}
  52. mq_obj['trace_id'] = trace_id
  53. mq_obj['content_id'] = content_id
  54. insert_sql = f"""
  55. INSERT INTO {crawler_video_table}
  56. (content_id, out_video_id, platform, video_title, play_count, like_count, publish_time, crawler_time, duration, video_url, cover_url, user_id, trace_id, score)
  57. values
  58. (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  59. """
  60. await db_client.async_insert(
  61. sql=insert_sql,
  62. params=(
  63. content_id,
  64. mq_obj['video_id'],
  65. platform,
  66. mq_obj['video_title'],
  67. mq_obj['play_cnt'],
  68. mq_obj['like_cnt'],
  69. datetime.fromtimestamp(mq_obj['publish_timestamp']).strftime('%Y-%m-%d %H:%M:%S'),
  70. datetime.now().__str__(),
  71. mq_obj['duration'],
  72. mq_obj['video_url'],
  73. mq_obj['cover_url'],
  74. mq_obj['user_id'],
  75. trace_id,
  76. similarity_score
  77. )
  78. )
  79. async def search_videos_from_web(info, gh_id_map, db_client):
  80. """
  81. search and send msg to ETL
  82. :param db_client:
  83. :param gh_id_map:
  84. :param info:
  85. :return:
  86. """
  87. default_account_id = 69637498
  88. search_AB = SearchABTest(info=info, searchMethod=SearchMethod())
  89. # 启三个搜索,每个搜索都保证要搜索到, 分别用key1, key2, key3去搜索
  90. trace_id = info['trace_id']
  91. gh_id = info['gh_id']
  92. content_id = info['content_id']
  93. recall_list = await search_AB.ab_6()
  94. logging(
  95. code="1006",
  96. info="搜索到{}条视频".format(len(recall_list)),
  97. data=recall_list,
  98. trace_id=info['trace_id']
  99. )
  100. # 按照标题相似度排序
  101. ranked_result = await title_similarity_with_nlp(content_title=info['ori_title'].split("@@")[-1], recall_list=recall_list)
  102. rank_alg = ranked_result['alg']
  103. ranked_list = ranked_result['result']
  104. for recall_obj in ranked_list:
  105. if recall_obj:
  106. platform = recall_obj['platform']
  107. recall_video = recall_obj['result']
  108. score = recall_obj['score']
  109. # 过滤掉jcd分数为0的
  110. if rank_alg == 'jcd' and score == server_const.JCD_SIMILARITY_THRESHOLD:
  111. continue
  112. # 过滤掉nlp分低于0.3的
  113. if rank_alg == 'nlp' and score < server_const.NLP_SIMILARITY_THRESHOLD:
  114. continue
  115. if recall_video:
  116. await save_video_to_mysql(
  117. video_obj=recall_video,
  118. user=gh_id_map.get(gh_id, default_account_id),
  119. trace_id=trace_id,
  120. platform=platform,
  121. content_id=content_id,
  122. crawler_video_table=info['crawler_video_table'],
  123. db_client=db_client,
  124. similarity_score=score
  125. )
  126. return len(ranked_list)