item.py 3.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import time
  2. from application.functions import clean_title
  3. class VideoItem(object):
  4. """
  5. function: 当扫描进一条视频的时候,对该视频的基本信息进行处理,保证发送给 pipeline和 etl 的 video_dict 是正确的
  6. __init__: 初始化空json 对象,用来存储视频信息
  7. add_video_info: 把视频信息存储到 item 对象中
  8. check_item: 检查 item 对象中的各个元素以及处理
  9. """
  10. def __init__(self):
  11. self.item = {}
  12. def add_video_info(self, key, value):
  13. self.item[key] = value
  14. def check_item(self):
  15. """
  16. 判断item 里面的字段,是否符合要求
  17. 字段分为 3 类:
  18. 1. 必须存在数据的字段: ["video_id", "user_id", "user_name", "out_user_id", "out_video_id", "session", "video_url", "cover_url", "platform", "strategy"]
  19. 2. 不存在默认为 0 的字段 :["duration", "play_cnt", "like_cnt", "comment_cnt", "share_cnt", "width", "height"]
  20. 3. 需要后出理的字段: video_title, publish_time
  21. """
  22. if self.item.get("video_title"):
  23. self.item["video_title"] = clean_title(self.item["video_title"])
  24. else:
  25. return False
  26. if self.item.get("publish_time_stamp"):
  27. publish_time_str = time.strftime(
  28. "%Y-%m-%d %H:%M:%S", time.localtime(self.item["publish_time_stamp"])
  29. )
  30. self.add_video_info("publish_time_str", publish_time_str)
  31. else:
  32. publish_time_stamp = int(time.time())
  33. publish_time_str = time.strftime(
  34. "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
  35. )
  36. self.add_video_info("publish_time_stamp", publish_time_stamp)
  37. self.add_video_info("publish_time_str", publish_time_str)
  38. self.add_video_info("publish_time", publish_time_str)
  39. if not self.item.get("update_time_stamp"):
  40. self.add_video_info("update_time_stamp", int(time.time()))
  41. # 如果不存在,默认值为 0
  42. config_keys = [
  43. "duration",
  44. "play_cnt",
  45. "like_cnt",
  46. "comment_cnt",
  47. "share_cnt",
  48. "width",
  49. "height",
  50. ]
  51. for config_key in config_keys:
  52. if self.item.get(config_key):
  53. continue
  54. else:
  55. self.add_video_info(config_key, 0)
  56. # 必须存在的元素,若不存在则会报错
  57. must_keys = [
  58. "video_id",
  59. "user_id",
  60. "user_name",
  61. "out_video_id",
  62. "session",
  63. "video_url",
  64. "cover_url",
  65. "platform",
  66. "strategy",
  67. ]
  68. """
  69. video_id, out_video_id 均为站外视频 id
  70. usr_id: 站内用户 id
  71. out_user_id: 站外用户 id
  72. user_name: 站外用户名称
  73. """
  74. for m_key in must_keys:
  75. if self.item.get(m_key):
  76. continue
  77. else:
  78. # print(m_key)
  79. return False
  80. return True
  81. def produce_item(self):
  82. flag = self.check_item()
  83. if flag:
  84. return self.item
  85. else:
  86. return False