|
@@ -1,13 +1,14 @@
|
|
import os
|
|
import os
|
|
import sys
|
|
import sys
|
|
-import asyncio
|
|
|
|
import json
|
|
import json
|
|
import random
|
|
import random
|
|
import uuid
|
|
import uuid
|
|
import time
|
|
import time
|
|
import traceback
|
|
import traceback
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
-import aiohttp
|
|
|
|
|
|
+import requests
|
|
|
|
+from requests.adapters import HTTPAdapter
|
|
|
|
+from urllib3.util.retry import Retry
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
sys.path.append(os.getcwd())
|
|
from application.common.feishu import FsData
|
|
from application.common.feishu import FsData
|
|
@@ -51,35 +52,36 @@ class ZhongQingKanDian:
|
|
self.expire_flag = False
|
|
self.expire_flag = False
|
|
self.aliyun_log = AliyunLogger(mode=self.mode, platform=self.platform)
|
|
self.aliyun_log = AliyunLogger(mode=self.mode, platform=self.platform)
|
|
self.db_ops = DatabaseOperations(mode=mode, platform=platform)
|
|
self.db_ops = DatabaseOperations(mode=mode, platform=platform)
|
|
- self.redis_ops = RedisOperations()
|
|
|
|
|
|
+ self.redis_ops = RedisOperations(mode=mode, platform=platform)
|
|
data_rule = FsData()
|
|
data_rule = FsData()
|
|
self.title_rule = data_rule.get_title_rule()
|
|
self.title_rule = data_rule.get_title_rule()
|
|
self.LocalLog = Local.logger(self.platform, self.mode)
|
|
self.LocalLog = Local.logger(self.platform, self.mode)
|
|
|
|
+ self.session = requests.session()
|
|
|
|
|
|
def send_request(self, path, data):
    """
    Synchronously POST ``data`` to ``API_BASE_URL + path`` with retries.

    At most ``self.MAX_RETRIES`` attempts are made through ``self.session``.
    Any exception raised while sending the request, checking the status
    (``raise_for_status()``) or decoding the JSON body is logged to the
    local logger and to Aliyun (code "3000"); the request is then retried
    after a 5-second pause.

    :param path: API path appended to ``self.API_BASE_URL``
    :param data: request body, passed to ``session.post`` as ``data``
    :return: the decoded JSON response, or None if every attempt failed
    """
    full_url = f"{self.API_BASE_URL}{path}"
    last_attempt = self.MAX_RETRIES - 1

    for attempt in range(self.MAX_RETRIES):
        try:
            response = self.session.post(
                full_url,
                data=data,
                timeout=self.TIMEOUT,
                headers=self.COMMON_HEADERS,
            )
            response.raise_for_status()
            # Decode the body once and reuse it for the log line and the
            # return value instead of parsing the JSON twice.
            payload = response.json()
            self.LocalLog.info(f"{path}响应数据:{payload}")
            return payload
        except Exception as e:
            # Deliberately broad: this is the retry boundary, so every
            # failure is logged and retried rather than propagated.
            tb_info = traceback.format_exc()
            self.LocalLog.info(f"{path}请求失败:{e} \n{tb_info}")
            self.aliyun_log.logging(
                code="3000",
                message=f"请求 {path} 失败,错误信息: {str(e)}",
                data={"path": path}
            )
            # Back off only when another attempt follows; sleeping after
            # the final failure would just delay returning None.
            if attempt < last_attempt:
                time.sleep(5)
    return None
|
|
|
|
|
|
def is_response_valid(self, resp, url):
|
|
def is_response_valid(self, resp, url):
|
|
@@ -108,17 +110,16 @@ class ZhongQingKanDian:
|
|
self.LocalLog.info(f"检查 {url} 响应有效性时出错:{e} \n{tb_info}")
|
|
self.LocalLog.info(f"检查 {url} 响应有效性时出错:{e} \n{tb_info}")
|
|
return None
|
|
return None
|
|
|
|
|
|
- async def req_recommend_list(self):
|
|
|
|
|
|
+ def req_recommend_list(self):
|
|
"""
|
|
"""
|
|
- 异步请求推荐视频列表。
|
|
|
|
|
|
+ 同步请求推荐视频列表。
|
|
:return: 推荐视频列表的有效响应数据,如果请求失败则返回 None
|
|
:return: 推荐视频列表的有效响应数据,如果请求失败则返回 None
|
|
"""
|
|
"""
|
|
try:
|
|
try:
|
|
-
|
|
|
|
url = '/crawler/zhong_qing_kan_dian/recommend'
|
|
url = '/crawler/zhong_qing_kan_dian/recommend'
|
|
body = json.dumps({"cursor": ""})
|
|
body = json.dumps({"cursor": ""})
|
|
self.LocalLog.info(f"开始请求推荐{body}")
|
|
self.LocalLog.info(f"开始请求推荐{body}")
|
|
- resp = await self.send_request(url, body)
|
|
|
|
|
|
+ resp = self.send_request(url, body)
|
|
return self.is_response_valid(resp, url)
|
|
return self.is_response_valid(resp, url)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
tb_info = traceback.format_exc()
|
|
tb_info = traceback.format_exc()
|
|
@@ -130,11 +131,9 @@ class ZhongQingKanDian:
|
|
self.LocalLog.info(f"请求推荐视频列表 {url} 时发生异常:{str(e)} \n{tb_info}")
|
|
self.LocalLog.info(f"请求推荐视频列表 {url} 时发生异常:{str(e)} \n{tb_info}")
|
|
return None
|
|
return None
|
|
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- async def req_detail(self, content_link, **kwargs):
|
|
|
|
|
|
+ def req_detail(self, content_link, **kwargs):
|
|
"""
|
|
"""
|
|
- 异步请求视频详情。
|
|
|
|
|
|
+ 同步请求视频详情。
|
|
:param content_link: 视频内容链接
|
|
:param content_link: 视频内容链接
|
|
:param label: 视频标签(如 "recommend" 或 "related")
|
|
:param label: 视频标签(如 "recommend" 或 "related")
|
|
:param kwargs: 额外的视频信息
|
|
:param kwargs: 额外的视频信息
|
|
@@ -146,22 +145,21 @@ class ZhongQingKanDian:
|
|
body = json.dumps({
|
|
body = json.dumps({
|
|
"content_link": content_link
|
|
"content_link": content_link
|
|
})
|
|
})
|
|
- resp = await self.send_request(url, body)
|
|
|
|
|
|
+ resp = self.send_request(url, body)
|
|
if not self.is_response_valid(resp, url):
|
|
if not self.is_response_valid(resp, url):
|
|
return
|
|
return
|
|
data = resp.get("data", {}).get("data", {})
|
|
data = resp.get("data", {}).get("data", {})
|
|
if data.get("content_type") != "video":
|
|
if data.get("content_type") != "video":
|
|
self.aliyun_log.logging(
|
|
self.aliyun_log.logging(
|
|
code="3003",
|
|
code="3003",
|
|
- message=f"跳过非视频内容)",
|
|
|
|
|
|
+ message=f"跳过非视频内容",
|
|
data={"content_link": content_link}
|
|
data={"content_link": content_link}
|
|
)
|
|
)
|
|
self.LocalLog.info(f"跳过非视频内容,链接: {content_link}")
|
|
self.LocalLog.info(f"跳过非视频内容,链接: {content_link}")
|
|
return
|
|
return
|
|
self.LocalLog.info(f"{content_link} 是视频")
|
|
self.LocalLog.info(f"{content_link} 是视频")
|
|
data.update(kwargs)
|
|
data.update(kwargs)
|
|
- await self.process_video_obj(data)
|
|
|
|
- await asyncio.sleep(10)
|
|
|
|
|
|
+ self.process_video_obj(data)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
tb_info = traceback.format_exc()
|
|
tb_info = traceback.format_exc()
|
|
self.aliyun_log.logging(
|
|
self.aliyun_log.logging(
|
|
@@ -171,7 +169,7 @@ class ZhongQingKanDian:
|
|
)
|
|
)
|
|
self.LocalLog.error(f"请求视频详情,链接 {content_link} 时发生异常:{e} \n{tb_info}")
|
|
self.LocalLog.error(f"请求视频详情,链接 {content_link} 时发生异常:{e} \n{tb_info}")
|
|
|
|
|
|
- async def control_request_recommend(self):
|
|
|
|
|
|
+ def control_request_recommend(self):
|
|
"""
|
|
"""
|
|
控制推荐视频列表的请求和处理流程。
|
|
控制推荐视频列表的请求和处理流程。
|
|
:return: 无返回值,根据下载数量限制控制流程
|
|
:return: 无返回值,根据下载数量限制控制流程
|
|
@@ -179,12 +177,17 @@ class ZhongQingKanDian:
|
|
while self.limit_flag:
|
|
while self.limit_flag:
|
|
try:
|
|
try:
|
|
self.LocalLog.info(f"开始推荐视频列表的请求和处理流程,今日已爬推荐 {self.download_cnt} 个视频")
|
|
self.LocalLog.info(f"开始推荐视频列表的请求和处理流程,今日已爬推荐 {self.download_cnt} 个视频")
|
|
- recommend_resp = await self.req_recommend_list()
|
|
|
|
|
|
+
|
|
|
|
+ recommend_resp = self.req_recommend_list()
|
|
if not recommend_resp:
|
|
if not recommend_resp:
|
|
|
|
+ time.sleep(random.randint(5, 10))
|
|
continue
|
|
continue
|
|
recommend_list = recommend_resp.get("data", {}).get("data", [])
|
|
recommend_list = recommend_resp.get("data", {}).get("data", [])
|
|
self.LocalLog.info(f"获取的推荐列表长度:{len(recommend_list)}")
|
|
self.LocalLog.info(f"获取的推荐列表长度:{len(recommend_list)}")
|
|
for video_obj in recommend_list:
|
|
for video_obj in recommend_list:
|
|
|
|
+ # if not self.limit_flag:
|
|
|
|
+ # self.LocalLog.info(f"今日视频数量已达最大量{self.download_cnt}")
|
|
|
|
+ # return
|
|
content_link = video_obj.get("share_url")
|
|
content_link = video_obj.get("share_url")
|
|
content_id = video_obj.get("id")
|
|
content_id = video_obj.get("id")
|
|
self.LocalLog.info(f"content_link == {content_link} \n content_id == {content_id}")
|
|
self.LocalLog.info(f"content_link == {content_link} \n content_id == {content_id}")
|
|
@@ -192,7 +195,8 @@ class ZhongQingKanDian:
|
|
continue
|
|
continue
|
|
# 当前内容id保存到redis
|
|
# 当前内容id保存到redis
|
|
self.redis_ops.save_recommend_video(content_id)
|
|
self.redis_ops.save_recommend_video(content_id)
|
|
- await self.req_detail(content_link, **video_obj)
|
|
|
|
|
|
+ time.sleep(random.randint(5, 10))
|
|
|
|
+ self.req_detail(content_link, **video_obj)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
tb_info = traceback.format_exc()
|
|
tb_info = traceback.format_exc()
|
|
self.aliyun_log.logging(
|
|
self.aliyun_log.logging(
|
|
@@ -203,14 +207,13 @@ class ZhongQingKanDian:
|
|
self.LocalLog.info(f"控制推荐视频请求和处理时发生异常:\n{tb_info}")
|
|
self.LocalLog.info(f"控制推荐视频请求和处理时发生异常:\n{tb_info}")
|
|
self.LocalLog.info(f"循环结束,当前 limit_flag 值为: {self.limit_flag}")
|
|
self.LocalLog.info(f"循环结束,当前 limit_flag 值为: {self.limit_flag}")
|
|
|
|
|
|
- async def process_video_obj(self, video_obj):
|
|
|
|
|
|
+ def process_video_obj(self, video_obj):
|
|
"""
|
|
"""
|
|
处理视频对象,包括检查视频时长、用户信息、保存数据等操作。
|
|
处理视频对象,包括检查视频时长、用户信息、保存数据等操作。
|
|
:param video_obj: 视频对象,包含视频的各种信息
|
|
:param video_obj: 视频对象,包含视频的各种信息
|
|
:return: 无返回值,完成视频对象的处理
|
|
:return: 无返回值,完成视频对象的处理
|
|
"""
|
|
"""
|
|
try:
|
|
try:
|
|
-
|
|
|
|
video_duration = video_obj["video_url_list"][0]['video_duration']
|
|
video_duration = video_obj["video_url_list"][0]['video_duration']
|
|
video_id = video_obj['channel_content_id']
|
|
video_id = video_obj['channel_content_id']
|
|
# 检查视频ID是否存在
|
|
# 检查视频ID是否存在
|
|
@@ -240,9 +243,7 @@ class ZhongQingKanDian:
|
|
self.db_ops.insert_user(account_id, account_name, account_avatar)
|
|
self.db_ops.insert_user(account_id, account_name, account_avatar)
|
|
self.aliyun_log.logging(code="1007", message=f"用户数据写入成功,用户ID:{account_id}")
|
|
self.aliyun_log.logging(code="1007", message=f"用户数据写入成功,用户ID:{account_id}")
|
|
self.LocalLog.info(f"用户数据写入成功,用户ID: {account_id}")
|
|
self.LocalLog.info(f"用户数据写入成功,用户ID: {account_id}")
|
|
- if video_duration > self.rule_dict.get("duration", {}).get("max",
|
|
|
|
- 1200) or video_duration < self.rule_dict.get(
|
|
|
|
- "duration", {}).get("min", 30):
|
|
|
|
|
|
+ if video_duration > self.rule_dict.get("duration", {}).get("max", 1200) or video_duration < self.rule_dict.get("duration", {}).get("min", 30):
|
|
self.aliyun_log.logging(
|
|
self.aliyun_log.logging(
|
|
code="3005",
|
|
code="3005",
|
|
message=f"视频时长不满足条件[>=30s&<=1200s]视频ID:{video_obj['channel_content_id']},视频时长:{video_duration}"
|
|
message=f"视频时长不满足条件[>=30s&<=1200s]视频ID:{video_obj['channel_content_id']},视频时长:{video_duration}"
|
|
@@ -253,7 +254,7 @@ class ZhongQingKanDian:
|
|
|
|
|
|
item.add_video_info("video_id", video_obj['channel_content_id'])
|
|
item.add_video_info("video_id", video_obj['channel_content_id'])
|
|
item.add_video_info("video_title", video_obj["title"])
|
|
item.add_video_info("video_title", video_obj["title"])
|
|
- item.add_video_info("play_cnt", int(video_obj["read_num"]))
|
|
|
|
|
|
+ item.add_video_info("play_cnt", self.convert_number(video_obj["read_num"]))
|
|
item.add_video_info("publish_time_stamp", int(int(video_obj["publish_timestamp"]) / 1000))
|
|
item.add_video_info("publish_time_stamp", int(int(video_obj["publish_timestamp"]) / 1000))
|
|
item.add_video_info("out_user_id", video_obj["channel_account_id"])
|
|
item.add_video_info("out_user_id", video_obj["channel_account_id"])
|
|
item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])
|
|
item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])
|
|
@@ -277,7 +278,7 @@ class ZhongQingKanDian:
|
|
rule_dict=self.rule_dict,
|
|
rule_dict=self.rule_dict,
|
|
env=self.env,
|
|
env=self.env,
|
|
item=mq_obj,
|
|
item=mq_obj,
|
|
- trace_id=trace_id
|
|
|
|
|
|
+ trace_id=traceback.format_exc()
|
|
)
|
|
)
|
|
if pipeline.process_item():
|
|
if pipeline.process_item():
|
|
title_list = self.title_rule.split(",")
|
|
title_list = self.title_rule.split(",")
|
|
@@ -312,6 +313,7 @@ class ZhongQingKanDian:
|
|
# 保存视频ID
|
|
# 保存视频ID
|
|
self.redis_ops.save_video_id(video_obj['channel_content_id'])
|
|
self.redis_ops.save_video_id(video_obj['channel_content_id'])
|
|
if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 100):
|
|
if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 100):
|
|
|
|
+ self.LocalLog.info("当日视频已达到最大爬取量")
|
|
self.limit_flag = False
|
|
self.limit_flag = False
|
|
except Exception as e:
|
|
except Exception as e:
|
|
tb_info = traceback.format_exc()
|
|
tb_info = traceback.format_exc()
|
|
@@ -322,31 +324,29 @@ class ZhongQingKanDian:
|
|
)
|
|
)
|
|
self.LocalLog.error(f"处理视频对象时发生异常: {e}\n{tb_info}")
|
|
self.LocalLog.error(f"处理视频对象时发生异常: {e}\n{tb_info}")
|
|
|
|
|
|
- async def run(self):
|
|
|
|
- """
|
|
|
|
- 运行主流程,异步执行推荐视频和相关推荐视频的请求,直到达到下载数量限制。
|
|
|
|
|
|
|
|
|
|
def convert_number(self, s):
    """
    Convert a count such as ``"123"`` or ``"1.2万"`` into an integer.

    Non-string inputs are returned unchanged. Strings may carry thousands
    separators, surrounding whitespace, a trailing ``+`` and one of the
    unit suffixes 万 (x10,000) or 亿 (x100,000,000).

    :param s: raw count value (string or already-numeric value)
    :return: the parsed count as an int; the input itself when it is not
             a string; 0 when the string cannot be parsed
    """
    if not isinstance(s, str):
        return s

    text = s.strip().rstrip("+").replace(",", "")
    unit_factors = {"亿": 100_000_000, "万": 10_000}
    try:
        for suffix, factor in unit_factors.items():
            if text.endswith(suffix):
                # round() guards against float error such as
                # 1.15 * 10000 == 11499.999999999998.
                return int(round(float(text[:-len(suffix)]) * factor))
        return int(text)
    except (ValueError, OverflowError):
        self.LocalLog.info(f"无法将 '{s}' 转换为有效的数字。")
        # Return an explicit numeric fallback instead of an implicit None
        # so the result can always be stored as a count.
        return 0
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def run(self):
    """
    Entry point of the crawl: log the start message, then hand control to
    ``control_request_recommend``.

    :return: None
    """
    start_message = "开始执行中青看点推荐抓取..."
    self.LocalLog.info(start_message)
    self.control_request_recommend()
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual entry point: build the crawler with a local test configuration
    # and run it synchronously.
    crawler_rules = {
        'videos_cnt': {'min': 2, 'max': 0},
        'duration': {'min': 30, 'max': 1200},
    }
    crawler_users = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
    crawler = ZhongQingKanDian(
        platform="zhongqingkandian",
        mode="recommend",
        rule_dict=crawler_rules,
        user_list=crawler_users,
    )
    crawler.run()
|