import time

from application.functions import clean_title


class VideoItem(object):
    """Collect and validate the metadata of one scanned video.

    Fields are accumulated in ``self.item`` and normalised before the
    dictionary is handed on (per the original notes, to the pipeline and ETL
    consumers).

    Methods:
        add_video_info: store a single field in the item.
        check_item: normalise the item in place and report whether it is valid.
        produce_item: return the validated item, or ``False`` if it is invalid.
    """

    def __init__(self):
        # Backing dictionary for every field added through add_video_info().
        self.item = {}

    def add_video_info(self, key, value) -> None:
        """Store ``value`` under ``key``, replacing any existing value."""
        self.item[key] = value

    def check_item(self) -> bool:
        """Normalise ``self.item`` in place and check that it is complete.

        Fields fall into three groups:

        1. Required (must hold a truthy value): ``video_id``, ``user_id``,
           ``user_name``, ``out_video_id``, ``session``, ``video_url``,
           ``cover_url``, ``platform``, ``strategy``.
        2. Defaulted to ``0`` when missing or falsy: ``duration``,
           ``play_cnt``, ``like_cnt``, ``comment_cnt``, ``share_cnt``,
           ``width``, ``height``.
        3. Post-processed: ``video_title`` (passed through ``clean_title``)
           and the publish/update timestamps.

        NOTE(review): the original documentation also listed ``out_user_id``
        as required, but the code has never checked it -- confirm which of
        the two is intended.

        Returns:
            ``True`` when the title and every required field are present,
            ``False`` otherwise. The item may already have been partially
            normalised when ``False`` is returned because of a missing
            required field.
        """
        # A missing/empty title rejects the item before anything is modified.
        raw_title = self.item.get("video_title")
        if not raw_title:
            return False
        self.item["video_title"] = clean_title(raw_title)

        # Fall back to "now" when no publish timestamp was supplied, then
        # derive the formatted local-time string from whichever value is used.
        time_format = "%Y-%m-%d %H:%M:%S"
        publish_stamp = self.item.get("publish_time_stamp")
        if not publish_stamp:
            publish_stamp = int(time.time())
            self.add_video_info("publish_time_stamp", publish_stamp)
        publish_text = time.strftime(time_format, time.localtime(publish_stamp))
        self.add_video_info("publish_time_str", publish_text)
        self.add_video_info("publish_time", publish_text)

        if not self.item.get("update_time_stamp"):
            self.add_video_info("update_time_stamp", int(time.time()))

        # Numeric fields default to 0 when absent (or otherwise falsy).
        zero_default_keys = (
            "duration",
            "play_cnt",
            "like_cnt",
            "comment_cnt",
            "share_cnt",
            "width",
            "height",
        )
        for field in zero_default_keys:
            if not self.item.get(field):
                self.add_video_info(field, 0)

        # Required fields; the item is rejected if any is missing or falsy.
        # Per the original author's notes:
        #   video_id, out_video_id: id of the video on the external platform
        #   user_id: internal (on-site) user id
        #   out_user_id: external (off-site) user id
        #   user_name: user name on the external platform
        required_keys = (
            "video_id",
            "user_id",
            "user_name",
            "out_video_id",
            "session",
            "video_url",
            "cover_url",
            "platform",
            "strategy",
        )
        return all(self.item.get(field) for field in required_keys)

    def produce_item(self):
        """Return the normalised item dict if it validates, else ``False``."""
        return self.item if self.check_item() else False