|
@@ -1,54 +1,46 @@
|
|
|
-import asyncio
|
|
|
import os
|
|
|
-import random
|
|
|
import sys
|
|
|
-import time
|
|
|
-import uuid
|
|
|
+import asyncio
|
|
|
import json
|
|
|
+import random
|
|
|
+import uuid
|
|
|
+import time
|
|
|
+import traceback
|
|
|
from datetime import datetime
|
|
|
-
|
|
|
import aiohttp
|
|
|
-import requests
|
|
|
|
|
|
+sys.path.append(os.getcwd())
|
|
|
from application.common.feishu import FsData
|
|
|
from application.common.feishu.feishu_utils import FeishuUtils
|
|
|
from application.common.gpt import GPT4oMini
|
|
|
-from application.common.redis.redis_helper import SyncRedisHelper
|
|
|
-
|
|
|
-sys.path.append(os.getcwd())
|
|
|
-
|
|
|
-from application.items import VideoItem
|
|
|
-from application.pipeline import PiaoQuanPipeline
|
|
|
from application.common.messageQueue import MQ
|
|
|
from application.common.log import AliyunLogger
|
|
|
-from application.common.mysql import MysqlHelper
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+from application.functions.zqkd_db_redis import DatabaseOperations, RedisOperations
|
|
|
+from application.items import VideoItem
|
|
|
+from application.pipeline import PiaoQuanPipeline
|
|
|
+from application.common.log import Local
|
|
|
|
|
|
|
|
|
class ZhongQingKanDian:
|
|
|
- # / recommend(列表11个id)
|
|
|
- # ↓ 并发请求每个id的 / related(得到列表N个元素)
|
|
|
- # ↓ 对每个元素并发请求 / detail
|
|
|
- # ↓ 若为视频,写入Redis(键:detail_id,值:视频数据)
|
|
|
API_BASE_URL = "http://8.217.192.46:8889"
|
|
|
COMMON_HEADERS = {
|
|
|
"Content-Type": "application/json"
|
|
|
}
|
|
|
+ # 最大重试次数
|
|
|
MAX_RETRIES = 3
|
|
|
- TIMEOUT = 30 # 设置超时时间
|
|
|
- max_recommend_count = 100 # 推荐抓取每日最大量
|
|
|
- max_related_recommend_count = 200 # 相关推荐抓取每日最大量
|
|
|
- max_author_video = 300 # 账号每日抓取视频最大量
|
|
|
-
|
|
|
- """
|
|
|
- 中青看点推荐流
|
|
|
- Topic:zqkd_recommend_prod
|
|
|
- """
|
|
|
+ # 最大等待时长
|
|
|
+ TIMEOUT = 30
|
|
|
|
|
|
def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
|
|
|
- self.limit_flag = False
|
|
|
+ """
|
|
|
+ 初始化
|
|
|
+ :param platform: 平台名称 zhongqingkandian
|
|
|
+ :param mode: 运行模式 recommend
|
|
|
+ :param rule_dict: 规则字典,包含视频数量限制、时长限制等规则 [{"videos_cnt":{"min":100,"max":0}},{"duration":{"min":30,"max":1200}}]
|
|
|
+ :param user_list: 用户列表
|
|
|
+ :param env: 运行环境,默认为 "prod"
|
|
|
+ """
|
|
|
+ self.limit_flag = True
|
|
|
self.platform = platform
|
|
|
self.mode = mode
|
|
|
self.rule_dict = rule_dict
|
|
@@ -58,287 +50,303 @@ class ZhongQingKanDian:
|
|
|
self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
|
|
|
self.expire_flag = False
|
|
|
self.aliyun_log = AliyunLogger(mode=self.mode, platform=self.platform)
|
|
|
- self.mysql = MysqlHelper(mode=self.mode, platform=self)
|
|
|
+ self.db_ops = DatabaseOperations(mode=mode, platform=platform)
|
|
|
+ self.redis_ops = RedisOperations()
|
|
|
data_rule = FsData()
|
|
|
self.title_rule = data_rule.get_title_rule()
|
|
|
+ self.LocalLog = Local.logger(self.platform, self.mode)
|
|
|
|
|
|
async def send_request(self, path, data):
|
|
|
+ """
|
|
|
+ 异步发送 POST 请求到指定路径,带有重试机制。
|
|
|
+ :param path: 请求的 API 路径
|
|
|
+ :param data: 请求的数据
|
|
|
+ :return: 响应的 JSON 数据,如果请求失败则返回 None
|
|
|
+ """
|
|
|
full_url = f"{self.API_BASE_URL}{path}"
|
|
|
async with aiohttp.ClientSession(headers=self.COMMON_HEADERS) as session:
|
|
|
for retry in range(self.MAX_RETRIES):
|
|
|
try:
|
|
|
async with session.post(full_url, data=data, timeout=self.TIMEOUT) as response:
|
|
|
response.raise_for_status()
|
|
|
+ self.LocalLog.info(f"{path}响应数据:{await response.json()}")
|
|
|
return await response.json()
|
|
|
- except aiohttp.ClientError as e:
|
|
|
- if retry < self.MAX_RETRIES - 1:
|
|
|
- await asyncio.sleep(2)
|
|
|
- except json.JSONDecodeError as e:
|
|
|
- if retry < self.MAX_RETRIES - 1:
|
|
|
- await asyncio.sleep(2)
|
|
|
+ except (aiohttp.ClientError, json.JSONDecodeError) as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
+ self.LocalLog.info(f"{path}请求失败:{e} \n{tb_info}")
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3000",
|
|
|
+ message=f"请求 {path} 失败,错误信息: {str(e)}",
|
|
|
+ data={"path": path}
|
|
|
+ )
|
|
|
+ await asyncio.sleep(random.randint(5, 10))
|
|
|
return None
|
|
|
|
|
|
- def is_response_valid(self, resp):
|
|
|
- if resp['code'] != 0:
|
|
|
+ def is_response_valid(self, resp, url):
|
|
|
+ """
|
|
|
+ 检查响应是否有效(状态码为 0 表示有效)。
|
|
|
+ :param resp: 响应数据
|
|
|
+ :param url: 请求的 URL
|
|
|
+ :return: 如果响应有效则返回响应数据,否则返回 None
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ if resp and resp.get('code') != 0:
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3000",
|
|
|
+ message=f"抓取{url}失败,请求失败,响应:{resp}"
|
|
|
+ )
|
|
|
+ self.LocalLog.info(f"{url}请求失败,响应:{resp}")
|
|
|
+ return None
|
|
|
+ return resp
|
|
|
+ except Exception as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
self.aliyun_log.logging(
|
|
|
code="3000",
|
|
|
- message="抓取单条视频失败,请求失败"
|
|
|
- ),
|
|
|
- return
|
|
|
- return resp
|
|
|
+ message=f"检查响应有效性时出错,错误信息: {str(e)}",
|
|
|
+ data={"url": url, "resp": resp}
|
|
|
+ )
|
|
|
+ self.LocalLog.info(f"检查 {url} 响应有效性时出错:{e} \n{tb_info}")
|
|
|
+ return None
|
|
|
|
|
|
async def req_recommend_list(self):
|
|
|
- print("开始请求推荐")
|
|
|
- '''
|
|
|
- 推荐请求
|
|
|
- '''
|
|
|
- url = '/crawler/zhong_qing_kan_dian/recommend'
|
|
|
- body = json.dumps({"cursor": ""})
|
|
|
- resp = await self.send_request(url, body)
|
|
|
- return self.is_response_valid(resp)
|
|
|
-
|
|
|
- async def req_related_recommend_list(self, content_id):
|
|
|
- print("请求相关推荐")
|
|
|
- '''
|
|
|
- 相关推荐请求
|
|
|
- '''
|
|
|
- url = '/crawler/zhong_qing_kan_dian/related'
|
|
|
- body = json.dumps({
|
|
|
- "content_id": str(content_id),
|
|
|
- "cursor": ""
|
|
|
- })
|
|
|
- resp = await self.send_request(url, body)
|
|
|
- return self.is_response_valid(resp)
|
|
|
-
|
|
|
+ """
|
|
|
+ 异步请求推荐视频列表。
|
|
|
+ :return: 推荐视频列表的有效响应数据,如果请求失败则返回 None
|
|
|
+ """
|
|
|
+ try:
|
|
|
|
|
|
- async def req_detail(self, content_link, label,**kwargs):
|
|
|
- print("请求详情")
|
|
|
- '''
|
|
|
- 请求详情
|
|
|
- '''
|
|
|
- url = '/crawler/zhong_qing_kan_dian/detail'
|
|
|
- body = json.dumps({
|
|
|
- "content_link": content_link
|
|
|
- })
|
|
|
- resp = await self.send_request(url, body)
|
|
|
- if not self.is_response_valid(resp):
|
|
|
- return
|
|
|
- data = resp.get("data", {}).get("data", {})
|
|
|
- if data.get("content_type") != "video":
|
|
|
+ url = '/crawler/zhong_qing_kan_dian/recommend'
|
|
|
+ body = json.dumps({"cursor": ""})
|
|
|
+ self.LocalLog.info(f"开始请求推荐{body}")
|
|
|
+ resp = await self.send_request(url, body)
|
|
|
+ return self.is_response_valid(resp, url)
|
|
|
+ except Exception as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
self.aliyun_log.logging(
|
|
|
- code="3003",
|
|
|
- message=f"跳过非视频内容(label={label})",
|
|
|
- data={"content_link": content_link}
|
|
|
+ code="1003",
|
|
|
+ message=f"请求推荐视频列表时发生异常,错误信息: {str(e)}\n{tb_info}",
|
|
|
+ data={"url": url}
|
|
|
)
|
|
|
- return
|
|
|
- print("是视频")
|
|
|
- # 将 kwargs 中的键值对更新到 data 字典中
|
|
|
- data.update(kwargs)
|
|
|
- self.process_video_obj(data, label)
|
|
|
- await asyncio.sleep(10)
|
|
|
-
|
|
|
- async def control_request(self):
|
|
|
- print("开始处理")
|
|
|
- """核心控制逻辑:顺序处理三个接口"""
|
|
|
- recommend_resp = await self.req_recommend_list()
|
|
|
- if not self.is_response_valid(recommend_resp):
|
|
|
- return
|
|
|
-
|
|
|
- recommend_list = recommend_resp.get("data", {}).get("data", [])
|
|
|
+ self.LocalLog.info(f"请求推荐视频列表 {url} 时发生异常:{str(e)} \n{tb_info}")
|
|
|
+ return None
|
|
|
|
|
|
- for video_obj in recommend_list:
|
|
|
- content_link = video_obj.get("share_url")
|
|
|
- content_id = video_obj.get("id")
|
|
|
|
|
|
- if not (content_link and content_id):
|
|
|
- continue
|
|
|
- # 处理推荐视频详情
|
|
|
- await self.req_detail(content_link, "recommend",**video_obj)
|
|
|
|
|
|
- # # 处理相关推荐列表(间隔后执行)
|
|
|
- # await asyncio.sleep(5)
|
|
|
- # related_resp = await self.req_related_recommend_list(content_id)
|
|
|
- # if not self.is_response_valid(related_resp):
|
|
|
- # continue
|
|
|
- #
|
|
|
- # related_list = related_resp.get("data", {}).get("data", [])
|
|
|
- # for related_obj in related_list:
|
|
|
- # related_content_link = related_obj.get("share_url")
|
|
|
- # if related_content_link:
|
|
|
- # await self.req_detail(related_content_link, "related",**related_obj)
|
|
|
- def process_video_obj(self, video_obj, label):
|
|
|
+ async def req_detail(self, content_link, **kwargs):
|
|
|
"""
|
|
|
- 处理视频
|
|
|
- :param video_obj:
|
|
|
+ 异步请求视频详情。
|
|
|
+ :param content_link: 视频内容链接
|
|
|
+ :param label: 视频标签(如 "recommend" 或 "related")
|
|
|
+ :param kwargs: 额外的视频信息
|
|
|
+ :return: 无返回值,处理视频详情信息
|
|
|
"""
|
|
|
+ try:
|
|
|
+ self.LocalLog.info(f"开始请求视频详情,链接: {content_link}")
|
|
|
+ url = '/crawler/zhong_qing_kan_dian/detail'
|
|
|
+ body = json.dumps({
|
|
|
+ "content_link": content_link
|
|
|
+ })
|
|
|
+ resp = await self.send_request(url, body)
|
|
|
+ if not self.is_response_valid(resp, url):
|
|
|
+ return
|
|
|
+ data = resp.get("data", {}).get("data", {})
|
|
|
+ if data.get("content_type") != "video":
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3003",
|
|
|
+ message=f"跳过非视频内容)",
|
|
|
+ data={"content_link": content_link}
|
|
|
+ )
|
|
|
+ self.LocalLog.info(f"跳过非视频内容,链接: {content_link}")
|
|
|
+ return
|
|
|
+ self.LocalLog.info(f"{content_link} 是视频")
|
|
|
+ data.update(kwargs)
|
|
|
+ await self.process_video_obj(data)
|
|
|
+ await asyncio.sleep(10)
|
|
|
+ except Exception as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="1005",
|
|
|
+ message=f"请求视频详情时发生异常,错误信息: {str(e)}",
|
|
|
+ data={"content_link": content_link}
|
|
|
+ )
|
|
|
+ self.LocalLog.error(f"请求视频详情,链接 {content_link} 时发生异常:{e} \n{tb_info}")
|
|
|
|
|
|
- if not self.save_video_id():
|
|
|
-
|
|
|
- our_user = random.choice(self.user_list)
|
|
|
- trace_id = self.platform + str(uuid.uuid1())
|
|
|
- item = VideoItem()
|
|
|
+ async def control_request_recommend(self):
|
|
|
+ """
|
|
|
+ 控制推荐视频列表的请求和处理流程。
|
|
|
+ :return: 无返回值,根据下载数量限制控制流程
|
|
|
+ """
|
|
|
+ while self.limit_flag:
|
|
|
+ try:
|
|
|
+ self.LocalLog.info(f"开始推荐视频列表的请求和处理流程,今日已爬推荐 {self.download_cnt} 个视频")
|
|
|
+ recommend_resp = await self.req_recommend_list()
|
|
|
+ if not recommend_resp:
|
|
|
+ continue
|
|
|
+ recommend_list = recommend_resp.get("data", {}).get("data", [])
|
|
|
+ self.LocalLog.info(f"获取的推荐列表长度:{len(recommend_list)}")
|
|
|
+ for video_obj in recommend_list:
|
|
|
+ content_link = video_obj.get("share_url")
|
|
|
+ content_id = video_obj.get("id")
|
|
|
+ self.LocalLog.info(f"content_link == {content_link} \n content_id == {content_id}")
|
|
|
+ if not (content_link and content_id):
|
|
|
+ continue
|
|
|
+ # 当前内容id保存到redis
|
|
|
+ self.redis_ops.save_recommend_video(content_id)
|
|
|
+ await self.req_detail(content_link, **video_obj)
|
|
|
+ except Exception as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3008",
|
|
|
+ message=f"控制推荐视频请求和处理时发生异常,错误信息: {str(e)}",
|
|
|
+ data={}
|
|
|
+ )
|
|
|
+ self.LocalLog.info(f"控制推荐视频请求和处理时发生异常:\n{tb_info}")
|
|
|
+ self.LocalLog.info(f"循环结束,当前 limit_flag 值为: {self.limit_flag}")
|
|
|
+
|
|
|
+ async def process_video_obj(self, video_obj):
|
|
|
+ """
|
|
|
+ 处理视频对象,包括检查视频时长、用户信息、保存数据等操作。
|
|
|
+ :param video_obj: 视频对象,包含视频的各种信息
|
|
|
+ :return: 无返回值,完成视频对象的处理
|
|
|
+ """
|
|
|
try:
|
|
|
+
|
|
|
+ video_duration = video_obj["video_url_list"][0]['video_duration']
|
|
|
video_id = video_obj['channel_content_id']
|
|
|
+ # 检查视频ID是否存在
|
|
|
+ if self.redis_ops.check_video_id_exists(video_id):
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3004",
|
|
|
+ message=f"重复视频ID:{video_id}"
|
|
|
+ )
|
|
|
+ self.LocalLog.info(f"重复视频ID: {video_id}")
|
|
|
+ return
|
|
|
+ our_user = random.choice(self.user_list)
|
|
|
+ trace_id = self.platform + str(uuid.uuid1())
|
|
|
+ item = VideoItem()
|
|
|
+
|
|
|
account_id = video_obj["channel_account_id"]
|
|
|
account_name = video_obj["channel_account_name"]
|
|
|
account_avatar = video_obj["avatar"]
|
|
|
- is_repeat_user = self.select_id(account_id)
|
|
|
- # 判断用户是否重复
|
|
|
+ # 检查用户ID是否存在
|
|
|
+ is_repeat_user = self.db_ops.check_user_id(account_id)
|
|
|
if is_repeat_user:
|
|
|
- self.update_name_url(account_id, account_name, account_avatar)
|
|
|
+ # 更新用户信息,使用异步方法并等待结果
|
|
|
+ self.LocalLog.info(f"用户{account_id}已经存在数据库中")
|
|
|
+ self.db_ops.update_user(account_id, account_name, account_avatar)
|
|
|
else:
|
|
|
- # 写表
|
|
|
- self.insert_name_url(account_id, account_name, account_avatar)
|
|
|
- # 写redis
|
|
|
- self.write_redis_user_data(json.dumps({"uid": account_id}))
|
|
|
- print("写入成功")
|
|
|
- except Exception as e:
|
|
|
- print(f"写入异常{e}")
|
|
|
- pass
|
|
|
- url = video_obj["video_url_list"][0]['video_url']
|
|
|
- duration = video_obj["video_url_list"][0]['video_duration']
|
|
|
- item.add_video_info("video_id", video_obj['channel_content_id'])
|
|
|
- item.add_video_info("video_title", video_obj["title"])
|
|
|
- item.add_video_info("play_cnt", int(video_obj["read_num"]))
|
|
|
- item.add_video_info("publish_time_stamp", int(int(video_obj["publish_timestamp"])/1000))
|
|
|
- item.add_video_info("out_user_id", video_obj["channel_account_id"])
|
|
|
- item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])
|
|
|
- item.add_video_info("like_cnt", 0)
|
|
|
- item.add_video_info("collection_cnt", int(video_obj['collect_num']))
|
|
|
- item.add_video_info("share_cnt", int(video_obj["share_num"]))
|
|
|
- item.add_video_info("comment_cnt", int(video_obj["cmt_num"]))
|
|
|
- item.add_video_info("video_url", video_obj["video_url_list"][0]['video_url'])
|
|
|
- item.add_video_info("out_video_id", int(video_obj["channel_content_id"]))
|
|
|
- item.add_video_info("duration", video_obj["video_url_list"][0]['video_duration'])
|
|
|
- item.add_video_info("platform", self.platform)
|
|
|
- item.add_video_info("strategy", self.mode)
|
|
|
- item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
|
|
|
- item.add_video_info("user_id", our_user["uid"])
|
|
|
- item.add_video_info("user_name", our_user["nick_name"])
|
|
|
-
|
|
|
- mq_obj = item.produce_item()
|
|
|
- pipeline = PiaoQuanPipeline(
|
|
|
- platform=self.platform,
|
|
|
- mode=self.mode,
|
|
|
- rule_dict=self.rule_dict,
|
|
|
- env=self.env,
|
|
|
- item=mq_obj,
|
|
|
- trace_id=trace_id,
|
|
|
- )
|
|
|
- if pipeline.process_item():
|
|
|
- title_list = self.title_rule.split(",")
|
|
|
- title = video_obj["title"]
|
|
|
- contains_keyword = any(keyword in title for keyword in title_list)
|
|
|
- if contains_keyword:
|
|
|
- new_title = GPT4oMini.get_ai_mini_title(title)
|
|
|
- if new_title:
|
|
|
- item.add_video_info("video_title", new_title)
|
|
|
- current_time = datetime.now()
|
|
|
- formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
- values = [
|
|
|
- [
|
|
|
- video_obj["video_url_list"][0]['video_url'],
|
|
|
- video_obj["image_url_list"][0]['image_url'],
|
|
|
- title,
|
|
|
- new_title,
|
|
|
- formatted_time,
|
|
|
+ self.LocalLog.info(f"用户{account_id}没在数据库中")
|
|
|
+ # 插入用户信息,使用异步方法并等待结果
|
|
|
+ self.db_ops.insert_user(account_id, account_name, account_avatar)
|
|
|
+ self.aliyun_log.logging(code="1007", message=f"用户数据写入成功,用户ID:{account_id}")
|
|
|
+ self.LocalLog.info(f"用户数据写入成功,用户ID: {account_id}")
|
|
|
+ if video_duration > self.rule_dict.get("duration", {}).get("max",
|
|
|
+ 1200) or video_duration < self.rule_dict.get(
|
|
|
+ "duration", {}).get("min", 30):
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="3005",
|
|
|
+ message=f"视频时长不满足条件[>=30s&<=1200s]视频ID:{video_obj['channel_content_id']},视频时长:{video_duration}"
|
|
|
+ )
|
|
|
+ self.LocalLog.info(
|
|
|
+ f"视频时长不满足条件,视频ID: {video_obj['channel_content_id']}, 视频时长: {video_duration}")
|
|
|
+ return
|
|
|
+
|
|
|
+ item.add_video_info("video_id", video_obj['channel_content_id'])
|
|
|
+ item.add_video_info("video_title", video_obj["title"])
|
|
|
+ item.add_video_info("play_cnt", int(video_obj["read_num"]))
|
|
|
+ item.add_video_info("publish_time_stamp", int(int(video_obj["publish_timestamp"]) / 1000))
|
|
|
+ item.add_video_info("out_user_id", video_obj["channel_account_id"])
|
|
|
+ item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])
|
|
|
+ item.add_video_info("like_cnt", 0)
|
|
|
+ item.add_video_info("collection_cnt", int(video_obj['collect_num']))
|
|
|
+ item.add_video_info("share_cnt", int(video_obj["share_num"]))
|
|
|
+ item.add_video_info("comment_cnt", int(video_obj["cmt_num"]))
|
|
|
+ item.add_video_info("video_url", video_obj["video_url_list"][0]['video_url'])
|
|
|
+ item.add_video_info("out_video_id", int(video_obj["channel_content_id"]))
|
|
|
+ item.add_video_info("duration", video_obj["video_url_list"][0]['video_duration'])
|
|
|
+ item.add_video_info("platform", self.platform)
|
|
|
+ item.add_video_info("strategy", self.mode)
|
|
|
+ item.add_video_info("session", f"{self.platform}-{int(time.time())}")
|
|
|
+ item.add_video_info("user_id", our_user["uid"])
|
|
|
+ item.add_video_info("user_name", our_user["nick_name"])
|
|
|
+
|
|
|
+ mq_obj = item.produce_item()
|
|
|
+ pipeline = PiaoQuanPipeline(
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ rule_dict=self.rule_dict,
|
|
|
+ env=self.env,
|
|
|
+ item=mq_obj,
|
|
|
+ trace_id=trace_id
|
|
|
+ )
|
|
|
+ if pipeline.process_item():
|
|
|
+ title_list = self.title_rule.split(",")
|
|
|
+ title = video_obj["title"]
|
|
|
+ contains_keyword = any(keyword in title for keyword in title_list)
|
|
|
+ if contains_keyword:
|
|
|
+ new_title = GPT4oMini.get_ai_mini_title(title)
|
|
|
+ if new_title:
|
|
|
+ item.add_video_info("video_title", new_title)
|
|
|
+ current_time = datetime.now()
|
|
|
+ formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
+ values = [
|
|
|
+ [
|
|
|
+ video_obj["video_url_list"][0]['video_url'],
|
|
|
+ video_obj["image_url_list"][0]['image_url'],
|
|
|
+ title,
|
|
|
+ new_title,
|
|
|
+ formatted_time,
|
|
|
+ ]
|
|
|
]
|
|
|
- ]
|
|
|
- FeishuUtils.insert_columns("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "ROWS", 1, 2)
|
|
|
- time.sleep(0.5)
|
|
|
- FeishuUtils.update_values("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "A2:Z2", values)
|
|
|
- self.download_cnt += 1
|
|
|
- self.mq.send_msg(mq_obj)
|
|
|
- self.aliyun_log.logging(code="1002", message="成功发送至 ETL", data=mq_obj)
|
|
|
- if self.download_cnt >= int(
|
|
|
- self.rule_dict.get("videos_cnt", {}).get("min", 200)
|
|
|
- ):
|
|
|
- self.limit_flag = True
|
|
|
- if label == "recommend":
|
|
|
- key = f"crawler:zqkd:{video_id}"
|
|
|
- self.save_video_id(key)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- """
|
|
|
- 查询用户id是否存在
|
|
|
- """
|
|
|
-
|
|
|
- def select_id(self, uid):
|
|
|
- sql = f""" select uid from zqkd_uid where uid = "{uid}"; """
|
|
|
- db = MysqlHelper()
|
|
|
- repeat_user = db.select(sql=sql)
|
|
|
- if repeat_user:
|
|
|
- return True
|
|
|
- return False
|
|
|
- def update_name_url(self, uid,user_name,avatar_url):
|
|
|
- sql = f""" update zqkd_uid set avatar_url = "{avatar_url}", user_name="{user_name}" where uid = "{uid}"; """
|
|
|
- db = MysqlHelper()
|
|
|
- repeat_video = db.update(sql=sql)
|
|
|
- if repeat_video:
|
|
|
- return True
|
|
|
- return False
|
|
|
-
|
|
|
- def insert_name_url(self, uid, user_name, avatar_url):
|
|
|
- current_time = datetime.now()
|
|
|
- formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
|
|
|
- insert_sql = f"""INSERT INTO zqkd_uid (uid, avatar_url, user_name, data_time) values ('{uid}' ,'{avatar_url}','{user_name}', '{formatted_time}')"""
|
|
|
- db = MysqlHelper()
|
|
|
- repeat_video = db.update(sql=insert_sql)
|
|
|
- if repeat_video:
|
|
|
- return True
|
|
|
- return False
|
|
|
-
|
|
|
- def get_redis_video_data(self):
|
|
|
- """获取一条id"""
|
|
|
- task = f"task:zqkd_video_id"
|
|
|
- helper = SyncRedisHelper()
|
|
|
- client = helper.get_client()
|
|
|
-
|
|
|
- # 获取列表的长度
|
|
|
- list_length = client.llen(task)
|
|
|
- # 循环获取列表中的元素
|
|
|
- for i in range(list_length):
|
|
|
- # 使用 lrange 获取单个元素
|
|
|
- element = client.lrange(task, i, i)
|
|
|
- if element:
|
|
|
- print(f"Element at index {i}: {element[0].decode('utf-8')}")
|
|
|
- return element
|
|
|
-
|
|
|
- def write_redis_user_data(self,key,ret):
|
|
|
- """写入"""
|
|
|
- task = f"task:zqkd_user_id"
|
|
|
- helper = SyncRedisHelper()
|
|
|
- client = helper.get_client()
|
|
|
- client.rpush(task, ret)
|
|
|
+ FeishuUtils.insert_columns("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "ROWS", 1, 2)
|
|
|
+ time.sleep(0.5)
|
|
|
+ FeishuUtils.update_values("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "A2:Z2", values)
|
|
|
+
|
|
|
+ self.mq.send_msg(mq_obj)
|
|
|
+ self.download_cnt += 1
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="2009",
|
|
|
+ message=f"成功发送视频到etl",
|
|
|
+ data={"video_obj": video_obj}
|
|
|
+ )
|
|
|
+ # 保存视频ID
|
|
|
+ self.redis_ops.save_video_id(video_obj['channel_content_id'])
|
|
|
+ if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 100):
|
|
|
+ self.limit_flag = False
|
|
|
+ except Exception as e:
|
|
|
+ tb_info = traceback.format_exc()
|
|
|
+ self.aliyun_log.logging(
|
|
|
+ code="1005",
|
|
|
+ message=f"处理视频对象时发生异常,错误信息: {str(e)}",
|
|
|
+ data={"video_obj": video_obj}
|
|
|
+ )
|
|
|
+ self.LocalLog.error(f"处理视频对象时发生异常: {e}\n{tb_info}")
|
|
|
|
|
|
async def run(self):
|
|
|
- while True:
|
|
|
- await self.control_request()
|
|
|
- def save_video_id(self,key):
|
|
|
- helper = SyncRedisHelper()
|
|
|
- client = helper.get_client()
|
|
|
- # 将视频ID存储到Redis中,并设置过期时间为7天
|
|
|
- # 检查键是否存在
|
|
|
-
|
|
|
- if client.exists(key):
|
|
|
- return False
|
|
|
- else:
|
|
|
- expiration_time = int(timedelta(days=7).total_seconds())
|
|
|
- client.setex(key, expiration_time, "1")
|
|
|
+ """
|
|
|
+ 运行主流程,异步执行推荐视频和相关推荐视频的请求,直到达到下载数量限制。
|
|
|
|
|
|
+ :return: 无返回值,程序运行的主逻辑
|
|
|
+ """
|
|
|
+ self.LocalLog.info("开始执行中青看点推荐抓取...")
|
|
|
+ await asyncio.gather(
|
|
|
+ self.control_request_recommend()
|
|
|
+ )
|
|
|
|
|
|
-from datetime import datetime, timedelta
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
+ asyncio.run(ZhongQingKanDian(
|
|
|
+ platform="zhongqingkandian",
|
|
|
+ mode="recommend",
|
|
|
+ rule_dict={"videos_cnt": {"min": 2, "max": 0}},
|
|
|
+ user_list=[{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
|
|
|
+ ).run())
|
|
|
+ # content_link = "https://vol.youth.cn/4X32ftEV6SsA9Mq9?signature=6y30XlmbkL9oxwAjJd1PXOBX0idx0ZD1gMQE2nZKW8RNpvPrqz"
|
|
|
# asyncio.run(ZhongQingKanDian(
|
|
|
# platform="zhongqingkandian",
|
|
|
# mode="recommend",
|
|
|
- # rule_dict={},
|
|
|
- # user_list=[{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"},
|
|
|
- # ]
|
|
|
- #
|
|
|
- # ).run())
|
|
|
- save_video_id("1234")
|
|
|
-
|
|
|
-
|
|
|
+ # rule_dict={
|
|
|
+ # {"videos_cnt":{"min":100,"max":0}},{"duration":{"min":30,"max":1200}}
|
|
|
+ # },
|
|
|
+ # user_list=[{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
|
|
|
+ # ).req_detail(content_link,"测试"))
|