# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/15
import datetime
import json
import os
import random
import shutil
import sys
import time
import requests
import urllib3
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.publish import Publish
from common.db import MysqlHelper

proxies = {"http": None, "https": None}


class XiaoniangaoHour:
    platform = "小年糕"

    # # WeChat session config
    # time.sleep(1)
    # wechat_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "dzcWHw")
    # hour_x_b3_traceid = wechat_sheet[2][1]
    # hour_x_token_id = wechat_sheet[3][1]
    # hour_referer = wechat_sheet[4][1]
    # hour_uid = wechat_sheet[5][1]
    # hour_token = wechat_sheet[6][1]

    # Generate a random uid / token pair
    @classmethod
    def get_uid_token(cls):
        words = "abcdefghijklmnopqrstuvwxyz0123456789"
        uid = f"""{"".join(random.sample(words, 8))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 4))}-{"".join(random.sample(words, 12))}"""
        token = "".join(random.sample(words, 32))
        uid_token_dict = {
            "uid": uid,
            "token": token
        }
        return uid_token_dict

    # Fetch the sensitive-word list
    @classmethod
    def filter_words(cls, log_type, crawler):
        try:
            while True:
                # Sensitive-word list
                word_list = []
                # Read all sensitive words from the Feishu sheet and collect them
                filter_sheet = Feishu.get_values_batch(log_type, "xiaoniangao", "DRAnZh")
                if filter_sheet is None:
                    Common.logger(log_type, crawler).info(f"filter_sheet:{filter_sheet}")
                    continue
                for i in filter_sheet:
                    for j in i:
                        # Skip empty cells
                        if j is None:
                            pass
                        else:
                            word_list.append(j)
                return word_list
        except Exception as e:
            Common.logger(log_type, crawler).error(f"filter_words:{e}\n")

    # Baseline download rules
    @staticmethod
    def download_rule(video_dict):
        """
        Baseline rules a video must satisfy before downloading.
        :param video_dict: video info, dict
        :return: True if the video satisfies the rules, otherwise False
        """
        # Duration must be at least 40 seconds
        if int(float(video_dict["duration"])) < 40:
            return False
        # Width or height must be non-negative
        if int(video_dict["video_width"]) < 0 and int(video_dict["video_height"]) < 0:
            return False
        # Play count must be at least 4000
        if int(video_dict["play_cnt"]) < 4000:
            return False
        # Like count must be non-negative
        if int(video_dict["like_cnt"]) < 0:
            return False
        # Share count must be non-negative
        if int(video_dict["share_cnt"]) < 0:
            return False
        # Published within the last 10 days
        if int(time.time()) - int(video_dict["publish_time_stamp"]) > 3600 * 24 * 10:
            return False
        return True

    # Check whether today's date column already exists in the hourly ranking sheet
    @classmethod
    def check_data(cls, log_type, crawler, date):
        while True:
            hour_sheet = Feishu.get_values_batch(log_type, crawler, "ba0da4")
            if hour_sheet is None:
                Common.logger(log_type, crawler).warning(f'小时级数据_feeds:{hour_sheet}\n')
                continue
            # Check whether cell L1 already holds today's date
            if Feishu.get_range_value(log_type, crawler, "ba0da4", "L1:L1")[0] != date:
                # Insert 3 columns at L1:N1 and write the date plus the three time slots
                values = [[date], ["10:00", "15:00", "20:00"]]
                time.sleep(1)
                Feishu.insert_columns(log_type, crawler, "ba0da4", "COLUMNS", 11, 14)
                time.sleep(1)
                Feishu.update_values(log_type, crawler, "ba0da4", "L1:N2", values)
                time.sleep(1)
                Feishu.merge_cells(log_type, crawler, "ba0da4", "L1:N1")
                Common.logger(log_type, crawler).info("插入今天日期成功\n")
                return
            else:
                Common.logger(log_type, crawler).info("今日上升榜日期已存在\n")
                return

    # Fetch emoji expressions and decoration characters from the Feishu sheet
    @classmethod
    def get_expression(cls):
        while True:
            expression_list = []
            char_list = []
            char_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "BhlbST")
            if char_sheet is None:
                continue
            for i in range(len(char_sheet)):
                if char_sheet[i][0] is not None:
                    expression_list.append(char_sheet[i][0])
                if char_sheet[i][1] is not None:
                    char_list.append(char_sheet[i][1])
            return expression_list, char_list
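    # Illustrative note (not part of the original crawler): download_rule() expects a dict
    # shaped like the video_dict built in get_videoList() below. The field values here are
    # made-up sample data, shown only to document the rule thresholds.
    #
    #   sample_video = {
    #       "duration": 65, "video_width": 720, "video_height": 1280,
    #       "play_cnt": 12000, "like_cnt": 30, "share_cnt": 5,
    #       "publish_time_stamp": int(time.time()) - 3600 * 24,
    #   }
    #   XiaoniangaoHour.download_rule(sample_video)  # -> True (>=40s, >=4000 plays, <=10 days old)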
out_video_id="{video_id}"; """ repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine) return len(repeat_video) # 获取列表 @classmethod def get_videoList(cls, log_type, crawler, env, machine): # try: uid_token_dict = cls.get_uid_token() url = "https://kapi.xiaoniangao.cn/trends/get_recommend_trends" headers = { # "x-b3-traceid": cls.hour_x_b3_traceid, "x-b3-traceid": '1c403a4aa72e3c', # "X-Token-Id": cls.hour_x_token_id, "X-Token-Id": 'ab619e96d801f1567388629260aa68ec-1202200806', # "uid": cls.hour_uid, "uid": uid_token_dict['uid'], "content-type": "application/json", "Accept-Encoding": "gzip,compress,br,deflate", "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)' ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 ' 'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN', # "Referer": cls.hour_referer "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html' } data = { "log_params": { "page": "discover_rec", "common": { "brand": "iPhone", "device": "iPhone 11", "os": "iOS 14.7.1", "weixinver": "8.0.20", "srcver": "2.24.2", "net": "wifi", "scene": 1089 } }, "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!750x500r/crop/750x500/interlace/1/format/jpg", "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail/!80x80r/crop/80x80/interlace/1/format/jpg", "share_width": 625, "share_height": 500, "ext": { "fmid": 0, "items": {} }, "app": "xng", "rec_scene": "discover_rec", "log_common_params": { "e": [{ "data": { "page": "discoverIndexPage", "topic": "recommend" }, "ab": {} }], "ext": { "brand": "iPhone", "device": "iPhone 11", "os": "iOS 14.7.1", "weixinver": "8.0.20", "srcver": "2.24.3", "net": "wifi", "scene": "1089" }, "pj": "1", "pf": "2", "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29" }, "refresh": False, "token": uid_token_dict["token"], "uid": uid_token_dict["uid"], "proj": "ma", "wx_ver": "8.0.20", "code_ver": "3.62.0" } urllib3.disable_warnings() r = requests.post(url=url, headers=headers, json=data, proxies=proxies, verify=False) if 'data' not in r.text or r.status_code != 200: Common.logger(log_type, crawler).warning(f"get_videoList:{r.text}\n") elif "data" not in r.json(): Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()}\n") elif "list" not in r.json()["data"]: Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']}\n") elif len(r.json()['data']['list']) == 0: Common.logger(log_type, crawler).warning(f"get_videoList:{r.json()['data']['list']}\n") else: # 视频列表数据 feeds = r.json()["data"]["list"] for i in range(len(feeds)): # 标题,表情随机加在片头、片尾,或替代句子中间的标点符号 if "title" in feeds[i]: befor_video_title = feeds[i]["title"].strip().replace("\n", "") \ .replace("/", "").replace("\r", "").replace("#", "") \ .replace(".", "。").replace("\\", "").replace("&NBSP", "") \ .replace(":", "").replace("*", "").replace("?", "") \ .replace("?", "").replace('"', "").replace("<", "") \ .replace(">", "").replace("|", "").replace(" ", "").replace("#表情", "").replace("#符号", "") expression = cls.get_expression() expression_list = expression[0] char_list = expression[1] # 随机取一个表情 expression = random.choice(expression_list) # 生成标题list[表情+title, title+表情] expression_title_list = [expression + befor_video_title, befor_video_title + expression] # 从标题list中随机取一个标题 title_list1 = random.choice(expression_title_list) # 生成标题:原标题+符号 title_list2 = befor_video_title + random.choice(char_list) # 表情和标题组合,与标题和符号组合,汇总成待使用的标题列表 title_list4 = [title_list2, title_list1] # 最终标题 video_title = random.choice(title_list4) 
                else:
                    video_title = 0

                # Video ID
                video_id = feeds[i].get("vid", 0)
                # Play count
                video_play_cnt = feeds[i].get("play_pv", 0)
                # Like count
                video_like_cnt = feeds[i]["favor"]["total"] if "favor" in feeds[i] else 0
                # Comment count
                video_comment_cnt = feeds[i].get("comment_count", 0)
                # Share count
                video_share_cnt = feeds[i].get("share", 0)
                # Duration (seconds)
                video_duration = int(feeds[i]["du"] / 1000) if "du" in feeds[i] else 0
                # Width and height
                if "w" in feeds[i] and "h" in feeds[i]:
                    video_width = feeds[i]["w"]
                    video_height = feeds[i]["h"]
                else:
                    video_width = 0
                    video_height = 0
                # Publish time
                video_send_time = feeds[i].get("t", 0)
                publish_time_stamp = int(int(video_send_time) / 1000)
                publish_time_str = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time_stamp))
                # User name / avatar
                if "user" in feeds[i]:
                    user_name = feeds[i]["user"]["nick"].strip().replace("\n", "") \
                        .replace("/", "").replace("快手", "").replace(" ", "") \
                        .replace(" ", "").replace("&NBSP", "").replace("\r", "")
                    head_url = feeds[i]["user"]["hurl"]
                else:
                    user_name = 0
                    head_url = 0
                # Profile ID
                profile_id = feeds[i]["id"]
                # Profile mid
                profile_mid = feeds[i]["user"]["mid"]
                # Cover URL
                cover_url = feeds[i].get("url", 0)
                # Video URL
                video_url = feeds[i].get("v_url", 0)

                video_dict = {
                    "video_title": video_title,
                    "video_id": video_id,
                    "duration": video_duration,
                    "play_cnt": video_play_cnt,
                    "like_cnt": video_like_cnt,
                    "comment_cnt": video_comment_cnt,
                    "share_cnt": video_share_cnt,
                    "user_name": user_name,
                    "publish_time_stamp": publish_time_stamp,
                    "publish_time_str": publish_time_str,
                    "video_width": video_width,
                    "video_height": video_height,
                    "avatar_url": head_url,
                    "profile_id": profile_id,
                    "profile_mid": profile_mid,
                    "cover_url": cover_url,
                    "video_url": video_url,
                    "session": f"xiaoniangao-hour-{int(time.time())}"
                }
                for k, v in video_dict.items():
                    Common.logger(log_type, crawler).info(f"{k}:{v}")

                # Drop invalid videos
                if video_title == 0 or video_id == 0 or video_duration == 0 \
                        or video_send_time == 0 or user_name == 0 or head_url == 0 \
                        or cover_url == 0 or video_url == 0:
                    Common.logger(log_type, crawler).warning("无效视频\n")
                # Baseline download rules
                elif cls.download_rule(video_dict) is False:
                    Common.logger(log_type, crawler).info("不满足基础门槛规则\n")
                # Already downloaded
                elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
                    Common.logger(log_type, crawler).info('视频已下载\n')
                # Sensitive-word filter
                elif any(str(word) if str(word) in video_title else False
                         for word in cls.filter_words(log_type, crawler)):
                    Common.logger(log_type, crawler).info("视频已中过滤词\n")
                    time.sleep(1)
                else:
                    # Write the video into the hourly feeds sheet (new row 3)
                    Feishu.insert_columns(log_type, crawler, "ba0da4", "ROWS", 2, 3)
                    get_feeds_time = int(time.time())
                    values = [[profile_id,
                               profile_mid,
                               video_id,
                               video_title,
                               user_name,
                               video_duration,
                               cover_url,
                               video_url,
                               publish_time_str,
                               str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(get_feeds_time))),
                               video_play_cnt]]
                    time.sleep(0.5)
                    Feishu.update_values(log_type, crawler, "ba0da4", "A3:K3", values)
                    Common.logger(log_type, crawler).info("视频添加至小时级数据_feeds成功\n")
        # except Exception as e:
        #     Common.logger(log_type, crawler).error(f"get_videoList:{e}\n")
"https://kapi.xiaoniangao.cn/profile/get_profile_by_id" headers = { # "x-b3-traceid": cls.hour_x_b3_traceid, "x-b3-traceid": '1c403a4aa72e3c', # "X-Token-Id": cls.hour_x_token_id, "X-Token-Id": 'ab619e96d801f1567388629260aa68ec-1202200806', "uid": uid_token_dict['uid'], "content-type": "application/json", "Accept-Encoding": "gzip,compress,br,deflate", "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)' ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 ' 'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN', # "Referer": cls.hour_referer "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html' } data = { "play_src": "1", "profile_id": int(p_id), "profile_mid": int(p_mid), "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/" "!400x400r/crop/400x400/interlace/1/format/jpg", "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail" "/!80x80r/crop/80x80/interlace/1/format/jpg", "share_width": 625, "share_height": 500, "no_comments": True, "no_follow": True, "vid": v_id, "hot_l1_comment": True, # "token": cls.hour_token, "token": uid_token_dict['token'], # "uid": cls.hour_uid, "uid": uid_token_dict['uid'], "proj": "ma", "wx_ver": "8.0.20", "code_ver": "3.62.0", "log_common_params": { "e": [{ "data": { "page": "dynamicSharePage" } }], "ext": { "brand": "iPhone", "device": "iPhone 11", "os": "iOS 14.7.1", "weixinver": "8.0.20", "srcver": "2.24.3", "net": "wifi", "scene": "1089" }, "pj": "1", "pf": "2", "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29" } } urllib3.disable_warnings() r = requests.post(headers=headers, url=url, json=data, proxies=proxies, verify=False) if r.status_code != 200 or 'data' not in r.text: Common.logger(log_type, crawler).warning(f"get_videoInfo:{r.text}\n") else: hour_play_cnt = r.json()["data"]["play_pv"] hour_cover_url = r.json()["data"]["url"] hour_video_url = r.json()["data"]["v_url"] hour_video_duration = r.json()["data"]["du"] hour_video_comment_cnt = r.json()["data"]["comment_count"] hour_video_like_cnt = r.json()["data"]["favor"]["total"] hour_video_share_cnt = r.json()["data"]["share"] hour_video_width = r.json()["data"]["w"] hour_video_height = r.json()["data"]["h"] hour_video_send_time = r.json()["data"]["t"] publish_time_stamp = int(int(hour_video_send_time)/1000) publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)) hour_user_name = r.json()["data"]["user"]["nick"] hour_head_url = r.json()["data"]["user"]["hurl"] video_info_dict = { "video_id": v_id, "video_title": v_title, "duration": hour_video_duration, "play_cnt": hour_play_cnt, "like_cnt": hour_video_like_cnt, "comment_cnt": hour_video_comment_cnt, "share_cnt": hour_video_share_cnt, "user_name": hour_user_name, "publish_time_stamp": publish_time_stamp, "publish_time_str": publish_time_str, "video_width": hour_video_width, "video_height": hour_video_height, "avatar_url": hour_head_url, "profile_id": p_id, "profile_mid": p_mid, "cover_url": hour_cover_url, "video_url": hour_video_url, "session": f"xiaoniangao-hour-{int(time.time())}" } # 下载封面 Common.download_method(log_type=log_type, crawler=crawler, text="cover", title=video_info_dict["video_title"], url=video_info_dict["cover_url"]) # 下载视频 Common.download_method(log_type=log_type, crawler=crawler, text="video", title=video_info_dict["video_title"], url=video_info_dict["video_url"]) # 保存视频信息至 "./videos/{download_video_title}/info.txt" Common.save_video_info(log_type=log_type, crawler=crawler, video_dict=video_info_dict) # 上传视频 Common.logger(log_type, 
crawler).info("开始上传视频...") our_video_id = Publish.upload_and_publish(log_type=log_type, crawler=crawler, strategy=strategy, our_uid="hour", env=env, oss_endpoint=oss_endpoint) if env == "dev": our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info" else: our_video_link = f"https://admin.piaoquantv.com/cms/post-detail/{our_video_id}/info" Common.logger(log_type, crawler).info("视频上传完成") if our_video_id is None: # 删除视频文件夹 shutil.rmtree(f"./{crawler}/videos/{video_info_dict['video_title']}") return # 视频信息保存数据库 rule_dict = { "duration": {"min": 40}, "play_cnt": {"min": 4000}, "publish_day": {"min": 10} } insert_sql = f""" insert into crawler_video(video_id, out_user_id, platform, strategy, out_video_id, video_title, cover_url, video_url, duration, publish_time, play_cnt, crawler_rule, width, height) values({our_video_id}, "{video_info_dict['profile_id']}", "{cls.platform}", "小时榜爬虫策略", "{video_info_dict['video_id']}", "{video_info_dict['video_title']}", "{video_info_dict['cover_url']}", "{video_info_dict['video_url']}", {int(video_info_dict['duration'])}, "{video_info_dict['publish_time_str']}", {int(video_info_dict['play_cnt'])}, '{json.dumps(rule_dict)}', {int(video_info_dict['video_width'])}, {int(video_info_dict['video_height'])}) """ Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}") MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine) Common.logger(log_type, crawler).info('视频信息插入数据库成功!') # 视频写入飞书 Feishu.insert_columns(log_type, crawler, "yatRv2", "ROWS", 1, 2) # 视频ID工作表,首行写入数据 upload_time = int(time.time()) values = [[time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(upload_time)), "小时级上升榜", str(video_info_dict['video_id']), str(video_info_dict['video_title']), our_video_link, video_info_dict['play_cnt'], video_info_dict['comment_cnt'], video_info_dict['like_cnt'], video_info_dict['share_cnt'], video_info_dict['duration'], f"{video_info_dict['video_width']}*{video_info_dict['video_height']}", str(video_info_dict['publish_time_str'].replace("-", "/")), str(video_info_dict['user_name']), str(video_info_dict['profile_id']), str(video_info_dict['profile_mid']), str(video_info_dict['avatar_url']), str(video_info_dict['cover_url']), str(video_info_dict['video_url'])]] time.sleep(1) Feishu.update_values(log_type, crawler, "yatRv2", "F2:Z2", values) Common.logger(log_type, crawler).info('视频信息写入飞书成功\n') except Exception as e: Common.logger(log_type, crawler).error(f"download_video:{e}\n") # 更新小时榜数据 @classmethod def update_videoList(cls, log_type, crawler, today, yesterday, before_yesterday): """ 更新小时榜数据 """ try: update_hour_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "ba0da4") if len(update_hour_sheet) == 2: Common.logger(log_type, crawler).info("当前工作表无数据") else: for i in range(2, len(update_hour_sheet) + 1): Common.logger(log_type, crawler).info(f"更新第:{i+1}行视频信息") # 略过空行 if update_hour_sheet[i][0] is None \ or update_hour_sheet[i][1] is None or update_hour_sheet[i][2] is None: Common.logger(log_type, crawler).info("空行,略过") else: # 视频标题 v_title = update_hour_sheet[i][3] Common.logger(log_type, crawler).info("video_title:{}", v_title) # 视频 ID v_id = update_hour_sheet[i][2] Common.logger(log_type, crawler).info("video_id:{}", v_id) # profile_id,用户 ID p_id = update_hour_sheet[i][0] Common.logger(log_type, crawler).info("profile_id:{}", p_id) # profile_mid p_mid = update_hour_sheet[i][1] Common.logger(log_type, crawler).info("profile_mid:{}", p_mid) # 抓取时的播放量 v_play_cnt = update_hour_sheet[i][10] Common.logger(log_type, 
crawler).info("video_play_cnt:{}", v_play_cnt) # 抓取时间 v_upload_time = update_hour_sheet[i][9] Common.logger(log_type, crawler).info("video_send_time:{}", v_upload_time) # 抓取时间的时间戳格式(秒为单位) v_time = int(time.mktime(time.strptime(v_upload_time, "%Y/%m/%d %H:%M:%S"))) # 抓取时间:日期 upload_data = v_upload_time.split(" ")[0] # 抓取时间:小时 upload_hour = v_upload_time.split(" ")[-1].split(":")[0] uid_token_dict = cls.get_uid_token() url = "https://kapi.xiaoniangao.cn/profile/get_profile_by_id" headers = { # "x-b3-traceid": cls.hour_x_b3_traceid, "x-b3-traceid": '1c403a4aa72e3c', # "X-Token-Id": cls.hour_x_token_id, "X-Token-Id": 'ab619e96d801f1567388629260aa68ec-1202200806', # "uid": cls.hour_uid, "uid": uid_token_dict['uid'], "content-type": "application/json", "Accept-Encoding": "gzip,compress,br,deflate", "User-Agent": 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X)' ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 ' 'MicroMessenger/8.0.20(0x18001432) NetType/WIFI Language/zh_CN', # "Referer": cls.hour_referer "Referer": 'https://servicewechat.com/wxd7911e4c177690e4/624/page-frame.html' } data = { "play_src": "1", "profile_id": int(p_id), "profile_mid": int(p_mid), "qs": "imageMogr2/gravity/center/rotate/$/thumbnail/" "!400x400r/crop/400x400/interlace/1/format/jpg", "h_qs": "imageMogr2/gravity/center/rotate/$/thumbnail" "/!80x80r/crop/80x80/interlace/1/format/jpg", "share_width": 625, "share_height": 500, "no_comments": True, "no_follow": True, "vid": v_id, "hot_l1_comment": True, # "token": cls.hour_token, # "uid": cls.hour_uid, "token": uid_token_dict['token'], "uid": uid_token_dict['uid'], "proj": "ma", "wx_ver": "8.0.20", "code_ver": "3.62.0", "log_common_params": { "e": [{ "data": { "page": "dynamicSharePage" } }], "ext": { "brand": "iPhone", "device": "iPhone 11", "os": "iOS 14.7.1", "weixinver": "8.0.20", "srcver": "2.24.3", "net": "wifi", "scene": "1089" }, "pj": "1", "pf": "2", "session_id": "7bcce313-b57d-4305-8d14-6ebd9a1bad29" } } try: urllib3.disable_warnings() r = requests.post(headers=headers, url=url, json=data, proxies=proxies, verify=False) hour_play_cnt = r.json()["data"]["play_pv"] Common.logger(log_type, crawler).info("视频详情,当前播放量:{}", hour_play_cnt) # 固定时间获取符合规则的视频,写入云文档:https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih?sheet=ba0da4 update_hour = datetime.datetime.now() if int(time.time()) - v_time >= 172800: Common.logger(log_type, crawler).info("抓取时间超过 2 天\n") return elif upload_data == today and update_hour.hour == 10 and int(upload_hour) <= 10: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:10点 and 抓取时间<=10点") # 当天 10:00 视频播放量 ten_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 10:00 视频播放量:{}", ten_hour_play_cnt) # 10:00 的上升榜写入数据 values = int(ten_hour_play_cnt) - int(v_play_cnt) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "L" + str(i + 1) + ":" + "L" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info(f"10:00数据更新成功:{values}\n") elif upload_data == today and update_hour.hour == 15 and int(upload_hour) <= 10: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:15点 and 抓取时间<=10点") # 当天 15:00 视频播放量 fifteen_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info(f"当天 15:00 视频播放量:{fifteen_hour_play_cnt}") # 当天 10:00 上升的数据 if update_hour_sheet[i][11] is None: ten_up_cnt = 0 else: ten_up_cnt = update_hour_sheet[i][11] # 15:00 的上升榜写入数据 values = int(fifteen_hour_play_cnt) - (int(v_play_cnt) + int(ten_up_cnt)) time.sleep(1) Feishu.update_values( log_type, 
"xiaoniangao", "ba0da4", "M" + str(i + 1) + ":" + "M" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("15:00数据更新成功:{}\n", values) elif upload_data == today and update_hour.hour == 15 and 10 < int(upload_hour) <= 15: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:15点 and 10<抓取时间<=15点") # 当天 15:00 视频播放量 fifteen_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 15:00 视频播放量:{}", fifteen_hour_play_cnt) # 15:00 的上升榜写入数据 values = int(fifteen_hour_play_cnt) - int(v_play_cnt) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "M" + str(i + 1) + ":" + "M" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("15:00数据更新成功:{}\n", values) elif upload_data == today and update_hour.hour == 20 and int(upload_hour) <= 10: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:20点 and 抓取时间<=10点") # 当天 20:00 视频播放量 twenty_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 20:00 视频播放量:{}", twenty_hour_play_cnt) # 当天 10:00 上升的数据 if update_hour_sheet[i][11] is None: ten_up_cnt = 0 else: ten_up_cnt = update_hour_sheet[i][11] # 当天 15:00 上升的数据 if update_hour_sheet[i][12] is None: fifteen_up_cnt = 0 else: fifteen_up_cnt = update_hour_sheet[i][12] # 20:00 的上升榜写入数据 values = int(twenty_hour_play_cnt) - ( int(v_play_cnt) + int(ten_up_cnt) + int(fifteen_up_cnt)) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "N" + str(i + 1) + ":" + "N" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("20:00数据更新成功:{}\n", values) elif upload_data == today and update_hour.hour == 20 and 10 < int(upload_hour) <= 15: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:20点 and 10<抓取时间<=15点") # 当天 20:00 视频播放量 twenty_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 20:00 视频播放量:{}", twenty_hour_play_cnt) # 当天 15:00 上升的数据 if update_hour_sheet[i][12] is None: fifteen_up_cnt = 0 else: fifteen_up_cnt = update_hour_sheet[i][12] # 20:00 的上升榜写入数据 values = int(twenty_hour_play_cnt) - (int(v_play_cnt) + int(fifteen_up_cnt)) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "N" + str(i + 1) + ":" + "N" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("20:00数据更新成功:{}\n", values) elif upload_data == today and update_hour.hour == 20 and 15 < int(upload_hour) <= 20: Common.logger(log_type, crawler).info("满足条件: 抓取日期为今天 and 当前时间:20点 and 15<抓取时间<=20点") # 当天 20:00 视频播放量 twenty_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 20:00 视频播放量:{}", twenty_hour_play_cnt) # 20:00 的上升榜写入数据 values = int(twenty_hour_play_cnt) - int(v_play_cnt) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "N" + str(i + 1) + ":" + "N" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("20:00数据更新成功:{}\n", values) elif (upload_data == yesterday or upload_data == before_yesterday) \ and update_hour.hour == 10: Common.logger(log_type, crawler).info("满足条件: 抓取时间小于今天 and 当前时间:10点") # 当天 10:00 视频播放量 ten_hour_play_cnt = hour_play_cnt Common.logger(log_type, crawler).info("当天 10:00 视频播放量:{}", ten_hour_play_cnt) # 10:00 的上升榜写入数据 values = int(ten_hour_play_cnt) - int(v_play_cnt) time.sleep(1) Feishu.update_values( log_type, "xiaoniangao", "ba0da4", "L" + str(i + 1) + ":" + "L" + str(i + 1), [[values]]) Common.logger(log_type, crawler).info("10:00数据更新成功:{}\n", values) elif (upload_data == yesterday or upload_data == before_yesterday) \ and update_hour.hour == 15: Common.logger(log_type, crawler).info("满足条件: 抓取时间小于今天 and 当前时间:15点") 
                                # Play count at 15:00 today
                                fifteen_hour_play_cnt = hour_play_cnt
                                Common.logger(log_type, crawler).info("当天 15:00 视频播放量:{}", fifteen_hour_play_cnt)
                                # Delta already recorded at 10:00 today
                                if update_hour_sheet[i][11] is None:
                                    ten_up_cnt = 0
                                else:
                                    ten_up_cnt = update_hour_sheet[i][11]
                                # Delta for the 15:00 column
                                values = int(fifteen_hour_play_cnt) - (int(v_play_cnt) + int(ten_up_cnt))
                                time.sleep(1)
                                Feishu.update_values(
                                    log_type, "xiaoniangao", "ba0da4",
                                    "M" + str(i + 1) + ":" + "M" + str(i + 1), [[values]])
                                Common.logger(log_type, crawler).info("15:00数据更新成功:{}\n", values)
                            elif (upload_data == yesterday or upload_data == before_yesterday) \
                                    and update_hour.hour == 20:
                                Common.logger(log_type, crawler).info("满足条件: 抓取时间小于今天 and 当前时间:20点")
                                # Play count at 20:00 today
                                twenty_hour_play_cnt = hour_play_cnt
                                Common.logger(log_type, crawler).info("当天 20:00 视频播放量:{}", twenty_hour_play_cnt)
                                # Delta already recorded at 10:00 today
                                if update_hour_sheet[i][11] is None:
                                    ten_up_cnt = 0
                                else:
                                    ten_up_cnt = update_hour_sheet[i][11]
                                # Delta already recorded at 15:00 today
                                if update_hour_sheet[i][12] is None:
                                    fifteen_up_cnt = 0
                                else:
                                    fifteen_up_cnt = update_hour_sheet[i][12]
                                # Delta for the 20:00 column
                                values = int(twenty_hour_play_cnt) - (
                                        int(v_play_cnt) + int(ten_up_cnt) + int(fifteen_up_cnt))
                                time.sleep(1)
                                Feishu.update_values(
                                    log_type, "xiaoniangao", "ba0da4",
                                    "N" + str(i + 1) + ":" + "N" + str(i + 1), [[values]])
                                Common.logger(log_type, crawler).info("20:00数据更新成功:{}\n", values)
                        except Exception as e:
                            Common.logger(log_type, crawler).error("视频详情:{},异常:{}\n", v_title, e)
        except Exception as e:
            Common.logger(log_type, crawler).error("获取小时榜数据异常:{}\n", e)

    # Download / upload
    @classmethod
    def download_publish(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
        """
        2. Download videos that meet the ranking rules from the Feishu sheet:
           https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=ba0da4
        2.1 Play-count rise at 10:00 or 15:00 or 20:00 today > 5000
        2.2 Play-count rise at both 10:00 and 15:00 today > 2000
        2.3 Play-count rise at both 15:00 and 20:00 today > 2000
        2.4 Play-count rise at 20:00 yesterday and 10:00 today > 2000
        """
        while True:
            try:
                hour_sheet = Feishu.get_values_batch("hour", "xiaoniangao", "ba0da4")
                if hour_sheet is None:
                    Common.logger(log_type, crawler).warning(f"小时级数据_feeds:{hour_sheet}\n")
                    continue
                if len(hour_sheet) == 2:
                    Common.logger(log_type, crawler).info("小时级数据_feeds,没有数据\n")
                    return
                for i in range(2, len(hour_sheet)):
                    Common.logger(log_type, crawler).info(f"分析第:{i+1}行视频信息是否符合下载规则")
                    # Skip empty rows
                    if hour_sheet[i][0] is None or hour_sheet[i][1] is None or hour_sheet[i][2] is None:
                        Common.logger(log_type, crawler).info("空行,略过")
                        continue
                    # Video title
                    v_title = hour_sheet[i][3]
                    # Video ID
                    v_id = hour_sheet[i][2]
                    # profile_id (user ID)
                    p_id = hour_sheet[i][0]
                    # profile_mid
                    p_mid = hour_sheet[i][1]
                    # Crawl time
                    v_upload_time = hour_sheet[i][9]
                    v_send_time = int(time.mktime(time.strptime(v_upload_time, "%Y/%m/%d %H:%M:%S")))
                    # Play count at crawl time
                    v_play_cnt = hour_sheet[i][10]
                    # Rise at 10:00 today
                    if hour_sheet[i][11] is None:
                        ten_cnt = 0
                    else:
                        ten_cnt = hour_sheet[i][11]
                    # Rise at 15:00 today
                    if hour_sheet[i][12] is None:
                        fifteen_cnt = 0
                    else:
                        fifteen_cnt = hour_sheet[i][12]
                    # Rise at 20:00 today
                    if hour_sheet[i][13] is None:
                        twenty_cnt = 0
                    else:
                        twenty_cnt = hour_sheet[i][13]
                    # Rise at 20:00 yesterday
                    if hour_sheet[i][16] is None:
                        yesterday_twenty_cnt = 0
                    else:
                        yesterday_twenty_cnt = hour_sheet[i][16]
                    Common.logger(log_type, crawler).info(f"视频标题:{v_title}")
                    Common.logger(log_type, crawler).info(
                        f"10:00 / 15:00 / 20:00 上升量: {ten_cnt} / {fifteen_cnt} / {twenty_cnt}")

                    if int(time.time()) - int(v_send_time) >= 3600 * 24 * 3:
                        Common.logger(log_type, crawler).info("抓取时间超过 3 天")
                        return
                    # Already downloaded
                    elif cls.repeat_video(log_type, crawler, v_id, env, machine) != 0:
                        Common.logger(log_type, crawler).info('视频已下载\n')
                    # Play count >= 50000: download directly
                    elif int(v_play_cnt) >= 50000:
                        Common.logger(log_type, crawler).info(f"播放量:{v_play_cnt} >= 50000,满足下载规则,开始下载视频")
                        cls.download_video(log_type=log_type,
                                           crawler=crawler,
                                           p_id=p_id,
                                           p_mid=p_mid,
                                           v_title=v_title,
                                           v_id=v_id,
                                           strategy=strategy,
                                           oss_endpoint=oss_endpoint,
                                           env=env,
                                           machine=machine)
                    # Ranking rules: any single slot rises >= 5000, or two consecutive slots each rise >= 2000
                    elif int(ten_cnt) >= 5000 or int(fifteen_cnt) >= 5000 or int(twenty_cnt) >= 5000:
                        Common.logger(log_type, crawler).info(
                            f"10:00 or 15:00 or 20:00 数据上升量:{ten_cnt} or {fifteen_cnt} or {twenty_cnt} >= 5000")
                        Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
                        cls.download_video(log_type=log_type,
                                           crawler=crawler,
                                           p_id=p_id,
                                           p_mid=p_mid,
                                           v_title=v_title,
                                           v_id=v_id,
                                           strategy=strategy,
                                           oss_endpoint=oss_endpoint,
                                           env=env,
                                           machine=machine)
                    elif int(ten_cnt) >= 2000 and int(fifteen_cnt) >= 2000:
                        Common.logger(log_type, crawler).info(
                            f"10:00 and 15:00 数据上升量:{ten_cnt} and {fifteen_cnt} >= 2000")
                        Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
                        cls.download_video(log_type=log_type,
                                           crawler=crawler,
                                           p_id=p_id,
                                           p_mid=p_mid,
                                           v_title=v_title,
                                           v_id=v_id,
                                           strategy=strategy,
                                           oss_endpoint=oss_endpoint,
                                           env=env,
                                           machine=machine)
                    elif int(fifteen_cnt) >= 2000 and int(twenty_cnt) >= 2000:
                        Common.logger(log_type, crawler).info(
                            f"15:00 and 20:00 数据上升量:{fifteen_cnt} and {twenty_cnt} >= 2000")
                        Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
                        cls.download_video(log_type=log_type,
                                           crawler=crawler,
                                           p_id=p_id,
                                           p_mid=p_mid,
                                           v_title=v_title,
                                           v_id=v_id,
                                           strategy=strategy,
                                           oss_endpoint=oss_endpoint,
                                           env=env,
                                           machine=machine)
                    elif int(yesterday_twenty_cnt) >= 2000 and int(ten_cnt) >= 2000:
                        Common.logger(log_type, crawler).info(
                            f"昨日20:00 and 今日10:00 数据上升量:{yesterday_twenty_cnt} and {ten_cnt} >= 2000")
                        Common.logger(log_type, crawler).info("满足下载规则,开始下载视频")
                        cls.download_video(log_type=log_type,
                                           crawler=crawler,
                                           p_id=p_id,
                                           p_mid=p_mid,
                                           v_title=v_title,
                                           v_id=v_id,
                                           strategy=strategy,
                                           oss_endpoint=oss_endpoint,
                                           env=env,
                                           machine=machine)
                    else:
                        Common.logger(log_type, crawler).info("上升量不满足下载规则")
            except Exception as e:
                Common.logger(log_type, crawler).error(f"download_publish:{e}\n")


if __name__ == "__main__":
    # print(XiaoniangaoHour.filter_words("hour", "xiaoniangao"))
    # print(XiaoniangaoHour.get_uid_token())
    pass
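    # Usage sketch (assumption, not from the original file): the three stages are typically
    # driven by an external scheduler, roughly in this order. The concrete argument values
    # ("dev", "local", "out", the date strings) are illustrative only.
    #
    #   XiaoniangaoHour.get_videoList("hour", "xiaoniangao", env="dev", machine="local")
    #   XiaoniangaoHour.update_videoList("hour", "xiaoniangao", today, yesterday, before_yesterday)
    #   XiaoniangaoHour.download_publish("hour", "xiaoniangao", strategy="小时榜爬虫策略",
    #                                    oss_endpoint="out", env="dev", machine="local")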