video_item.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. from applications.functions.common import clean_title
  6. class VideoItem(object):
  7. """
  8. function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
  9. __init__: 初始化空json 对象,用来存储视频信息
  10. add_video_info: 把视频信息存储到 item 对象中
  11. check_item: 检查 item 对象中的各个元素以及处理
  12. """
  13. def __init__(self):
  14. self.item = {}
  15. def add_video_info(self, key, value):
  16. """
  17. insert or update video info
  18. :param key:
  19. :param value:
  20. """
  21. self.item[key] = value
  22. def check_item(self):
  23. """
  24. 判断item 里面的字段,是否符合要求
  25. 字段分为 3 类:
  26. 1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
  27. 2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
  28. 3. 需要后出理的字段: video_title, publish_time
  29. """
  30. if self.item.get("video_title"):
  31. self.item["video_title"] = clean_title(self.item["video_title"])
  32. else:
  33. return False
  34. if self.item.get("publish_time_stamp"):
  35. publish_time_str = time.strftime(
  36. "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
  37. )
  38. self.add_video_info("publish_time_str", publish_time_str)
  39. else:
  40. publish_time_stamp = int(time.time())
  41. publish_time_str = time.strftime(
  42. "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
  43. )
  44. self.add_video_info("publish_time_stamp", publish_time_stamp)
  45. self.add_video_info("publish_time_str", publish_time_str)
  46. self.add_video_info("publish_time", publish_time_str)
  47. if not self.item.get("update_time_stamp"):
  48. self.add_video_info("update_time_stamp", int(time.time()))
  49. # 如果不存在,默认值为 0
  50. config_keys = [
  51. "duration",
  52. "play_cnt",
  53. "like_cnt",
  54. "comment_cnt",
  55. "share_cnt",
  56. "width",
  57. "height",
  58. ]
  59. for config_key in config_keys:
  60. if self.item.get(config_key):
  61. continue
  62. else:
  63. self.add_video_info(config_key, 0)
  64. # 必须存在的元素,若不存在则会报错
  65. must_keys = [
  66. "video_id",
  67. "user_id",
  68. "user_name",
  69. "out_video_id",
  70. "session",
  71. "video_url",
  72. "cover_url",
  73. "platform",
  74. "strategy",
  75. ]
  76. """
  77. video_id, out_video_id 均为站外视频 id
  78. usr_id: 站内用户 id
  79. out_user_id: 站外用户 id
  80. user_name: 站外用户名称
  81. """
  82. for m_key in must_keys:
  83. if self.item.get(m_key):
  84. continue
  85. else:
  86. # print(m_key)
  87. return False
  88. return True
  89. def produce_item(self):
  90. """
  91. item producer
  92. :return:
  93. """
  94. flag = self.check_item()
  95. if flag:
  96. return self.item
  97. else:
  98. return False
  99. class VideoProducer(object):
  100. """
  101. 处理视频
  102. """
  103. @classmethod
  104. def wx_video_producer(cls, video_obj, user, trace_id):
  105. """
  106. 异步处理微信 video_obj
  107. 公众号和站内账号一一对应
  108. :param trace_id:
  109. :param user:
  110. :param video_obj:
  111. :return:
  112. """
  113. platform = "weixin_search"
  114. publish_time_stamp = int(video_obj['pubTime'])
  115. item = VideoItem()
  116. item.add_video_info("user_id", user["uid"])
  117. item.add_video_info("user_name", user["nick_name"])
  118. item.add_video_info("video_id", video_obj['hashDocID'])
  119. item.add_video_info("video_title", trace_id)
  120. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  121. item.add_video_info("video_url", video_obj["videoUrl"])
  122. item.add_video_info("cover_url", video_obj["image"])
  123. item.add_video_info("out_video_id", video_obj['hashDocID'])
  124. item.add_video_info("out_user_id", trace_id)
  125. item.add_video_info("platform", platform)
  126. item.add_video_info("strategy", "search")
  127. item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
  128. mq_obj = item.produce_item()
  129. return mq_obj
  130. @classmethod
  131. def baidu_video_producer(cls, video_obj, user, trace_id):
  132. """
  133. 处理好看视频的 video_info
  134. :param video_obj:
  135. :param user:
  136. :param trace_id:
  137. :return:
  138. """
  139. platform = "baidu_search"
  140. publish_time_stamp = int(video_obj['publish_time'])
  141. item = VideoItem()
  142. item.add_video_info("user_id", user["uid"])
  143. item.add_video_info("user_name", user["nick_name"])
  144. item.add_video_info("video_id", video_obj['id'])
  145. item.add_video_info("video_title", trace_id)
  146. item.add_video_info("publish_time_stamp", publish_time_stamp)
  147. item.add_video_info("video_url", video_obj["playurl"])
  148. item.add_video_info("cover_url", video_obj["poster"])
  149. item.add_video_info("out_video_id", video_obj['id'])
  150. item.add_video_info("out_user_id", trace_id)
  151. item.add_video_info("like_cnt", video_obj['like'] if video_obj.get('like') else 0)
  152. item.add_video_info("play_cnt", video_obj['playcnt'])
  153. item.add_video_info("duration", video_obj['duration'])
  154. item.add_video_info("platform", platform)
  155. item.add_video_info("strategy", "search")
  156. item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
  157. mq_obj = item.produce_item()
  158. return mq_obj
  159. @classmethod
  160. def xg_video_producer(cls, video_obj, user, trace_id):
  161. """
  162. 西瓜搜索
  163. :param video_obj:
  164. :param user:
  165. :param trace_id:
  166. :return:
  167. """
  168. platform = "xg_search"
  169. publish_time_stamp = int(video_obj['publish_time'])
  170. item = VideoItem()
  171. item.add_video_info("user_id", user["uid"])
  172. item.add_video_info("user_name", user["nick_name"])
  173. item.add_video_info("video_id", video_obj['video_id'])
  174. item.add_video_info("video_title", trace_id)
  175. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  176. item.add_video_info("video_url", video_obj["video_url"])
  177. item.add_video_info("cover_url", video_obj["cover_url"])
  178. item.add_video_info("out_video_id", video_obj['video_id'])
  179. item.add_video_info("play_cnt", video_obj['play_cnt'])
  180. item.add_video_info("duration", video_obj['duration'])
  181. item.add_video_info("like_cnt", video_obj['like_cnt'])
  182. item.add_video_info("out_user_id", trace_id)
  183. item.add_video_info("platform", platform)
  184. item.add_video_info("strategy", "search")
  185. item.add_video_info("session", "{}-{}".format(platform, int(time.time())))
  186. mq_obj = item.produce_item()
  187. return mq_obj