__init__.py

  1. """
  2. @author: luojunhui
  3. """
  4. from datetime import datetime
  5. from applications.functions.video_item import VideoProducer
  6. from applications.log import logging
  7. from applications.match_algorithm.rank import title_similarity_rank
  8. from .spiderAB import SearchABTest
  9. from .spiderSchedule import SearchMethod


async def save_video_to_mysql(
    video_obj,
    user,
    trace_id,
    platform,
    content_id,
    crawler_video_table,
    db_client,
    similarity_score,
):
    """
    Asynchronously process a recalled video_obj and persist it to MySQL.
    Each official account (gh_id) maps one-to-one to an in-platform account.
    :param similarity_score: title similarity score of the recalled video
    :param crawler_video_table: name of the crawler video table
    :param db_client: async MySQL client
    :param content_id: id of the content being matched
    :param platform: source platform (xg_search / baidu_search / wx_search / dy_search)
    :param user: in-platform account id mapped from the official account
    :param trace_id: trace id of the current request
    :param video_obj: raw video object returned by the search spider
    :return:
    """
    video_producer = VideoProducer()
    # Wrap the raw crawl result into a platform-specific MQ payload.
    if platform == "xg_search":
        mq_obj = video_producer.xg_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "baidu_search":
        mq_obj = video_producer.baidu_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "wx_search":
        mq_obj = video_producer.wx_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    elif platform == "dy_search":
        mq_obj = video_producer.dy_video_produce(
            video_obj=video_obj,
            user=user,
            trace_id=trace_id,
        )
    else:
        # Unknown platform: fall back to an empty payload.
        mq_obj = {}
    mq_obj['trace_id'] = trace_id
    mq_obj['content_id'] = content_id
    insert_sql = f"""
        INSERT INTO {crawler_video_table}
        (content_id, out_video_id, platform, video_title, play_count, like_count, publish_time, crawler_time, duration, video_url, cover_url, user_id, trace_id, score)
        VALUES
        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    await db_client.async_insert(
        sql=insert_sql,
        params=(
            content_id,
            mq_obj['video_id'],
            platform,
            mq_obj['video_title'],
            mq_obj['play_cnt'],
            mq_obj['like_cnt'],
            datetime.fromtimestamp(mq_obj['publish_timestamp']).strftime('%Y-%m-%d %H:%M:%S'),
            str(datetime.now()),
            mq_obj['duration'],
            mq_obj['video_url'],
            mq_obj['cover_url'],
            mq_obj['user_id'],
            trace_id,
            similarity_score,
        ),
    )
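
# NOTE: the params tuple above assumes each *_video_produce() payload exposes at
# least these keys; this is inferred from the insert parameters, not from the
# VideoProducer implementation itself. A minimal, purely illustrative sketch of
# such a payload (all values are placeholders):
#
#     mq_obj = {
#         "video_id": "out-video-id",
#         "video_title": "title",
#         "play_cnt": 0,
#         "like_cnt": 0,
#         "publish_timestamp": 1700000000,
#         "duration": 60,
#         "video_url": "https://example.com/video.mp4",
#         "cover_url": "https://example.com/cover.jpg",
#         "user_id": 123,
#     }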


async def search_videos_from_web(info, gh_id_map, db_client):
    """
    Search for candidate videos and hand the matches off to the ETL pipeline.
    :param db_client: async MySQL client
    :param gh_id_map: mapping from official-account gh_id to in-platform account id
    :param info: task payload (trace_id, gh_id, content_id, ori_title, crawler_video_table, ...)
    :return: number of recalled videos after ranking
    """
    default_account_id = 69637498
    search_AB = SearchABTest(info=info, searchMethod=SearchMethod())
    # Launch three searches; each one must find results, using key1, key2 and key3 respectively.
    trace_id = info['trace_id']
    gh_id = info['gh_id']
    content_id = info['content_id']
    recall_list = await search_AB.ab_5()
    logging(
        code="1006",
        info="search recalled {} videos".format(len(recall_list)),
        data=recall_list,
        trace_id=trace_id
    )
    # Rank the recalled videos by title similarity.
    ranked_list = title_similarity_rank(
        content_title=info['ori_title'].split("@@")[-1],
        recall_list=recall_list
    )
    for recall_obj in ranked_list:
        if not recall_obj:
            continue
        platform = recall_obj['platform']
        recall_video = recall_obj['result']
        if recall_video:
            # Only read the similarity score once we know a video was actually recalled.
            score = recall_video['score']
            await save_video_to_mysql(
                video_obj=recall_video,
                user=gh_id_map.get(gh_id, default_account_id),
                trace_id=trace_id,
                platform=platform,
                content_id=content_id,
                crawler_video_table=info['crawler_video_table'],
                db_client=db_client,
                similarity_score=score
            )
    return len(ranked_list)
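

# Usage sketch (hypothetical): `task_info`, `account_map` and `mysql_client` are
# placeholder names assumed for illustration; the real caller, account mapping and
# async MySQL client (which must expose `async_insert(sql, params)`) live elsewhere
# in the project. The dict keys mirror the ones read from `info` above.
#
#     task_info = {
#         "trace_id": "search-trace-id",
#         "gh_id": "gh_xxxxxxxx",
#         "content_id": "content-id",
#         "ori_title": "prefix@@original title",
#         "crawler_video_table": "crawler_video",
#     }
#     matched_count = await search_videos_from_web(
#         info=task_info,
#         gh_id_map=account_map,
#         db_client=mysql_client,
#     )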