|
@@ -1,45 +1,96 @@
|
|
|
+import time
|
|
|
+
|
|
|
from common.public import clean_title
|
|
|
from .aliyun_log import AliyunLogger
|
|
|
|
|
|
|
|
|
class VideoItem(object):
    """Collect and validate the basic information of one scanned video.

    When a video is scanned, its fields are stored and post-processed here so
    that the ``video_dict`` sent to the pipeline and to ETL is well formed.

    __init__: initialise the empty dict used to store the video information
    add_video_info: store one piece of video information in the item
    check_item: check and post-process every field of the item
    produce_item: return the item when it is valid, otherwise ``False``
    """

    # Format used for the human-readable publish time strings.
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    # Fields that are set to 0 when they are missing (or otherwise falsy).
    _DEFAULT_ZERO_KEYS = (
        "duration",
        "play_cnt",
        "like_cnt",
        "comment_cnt",
        "share_cnt",
        "width",
        "height",
    )

    # Fields that must hold a truthy value, otherwise the item is rejected.
    #   video_id, out_video_id: both are the off-site (external) video id
    #   user_id: on-site user id
    #   user_name: off-site user name
    # NOTE(review): the original documentation also describes "out_user_id"
    # (off-site user id) as required, but it was never enforced by the code;
    # confirm with the callers before adding it here.
    _REQUIRED_KEYS = (
        "video_id",
        "user_id",
        "user_name",
        "out_video_id",
        "session",
        "video_url",
        "cover_url",
        "platform",
        "strategy",
    )

    def __init__(self):
        # Plain dict holding every field of the video.
        self.item = {}

    def add_video_info(self, key, value):
        """Store ``value`` under ``key`` in the item, overwriting any old value."""
        self.item[key] = value

    def check_item(self):
        """Validate the item and fill in derived / default fields.

        The fields fall into three groups:

        1. Fields that must hold data (see ``_REQUIRED_KEYS``); if any of them
           is missing or falsy the item is invalid.
        2. Fields that default to 0 when absent (see ``_DEFAULT_ZERO_KEYS``).
        3. Fields that need post-processing: ``video_title`` is passed through
           ``clean_title``; ``publish_time_stamp`` defaults to the current time
           and is rendered (in local time) into ``publish_time_str`` and
           ``publish_time``; ``update_time_stamp`` defaults to the current time.

        Note that the item is modified in place even when the final result is
        ``False`` (only a missing title returns before any modification).

        Returns:
            bool: ``True`` when the item is valid, ``False`` otherwise.
        """
        # A video without a title is rejected before anything is modified.
        title = self.item.get("video_title")
        if not title:
            return False
        self.item["video_title"] = clean_title(title)

        # Fall back to "now" when the crawler supplied no publish timestamp.
        publish_time_stamp = self.item.get("publish_time_stamp")
        if not publish_time_stamp:
            publish_time_stamp = int(time.time())
            self.add_video_info("publish_time_stamp", publish_time_stamp)
        publish_time_str = time.strftime(
            self._TIME_FORMAT, time.localtime(publish_time_stamp)
        )
        self.add_video_info("publish_time_str", publish_time_str)
        self.add_video_info("publish_time", publish_time_str)

        if not self.item.get("update_time_stamp"):
            self.add_video_info("update_time_stamp", int(time.time()))

        # Missing counters / dimensions default to 0.
        for key in self._DEFAULT_ZERO_KEYS:
            if not self.item.get(key):
                self.add_video_info(key, 0)

        # Every required field must be present and truthy.
        return all(self.item.get(key) for key in self._REQUIRED_KEYS)

    def produce_item(self):
        """Return the validated item dict, or ``False`` when validation fails."""
        return self.item if self.check_item() else False
|