# Server / automatic_crawler
import os
import sys
import json
import random
import uuid
import time
import traceback
from datetime import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

sys.path.append(os.getcwd())
from application.common.feishu import FsData
from application.common.feishu.feishu_utils import FeishuUtils
from application.common.gpt import GPT4oMini
from application.common.messageQueue import MQ
from application.common.log import AliyunLogger
from application.functions.zqkd_db_redis import DatabaseOperations, RedisOperations
from application.items import VideoItem
from application.pipeline import PiaoQuanPipeline
from application.common.log import Local


class ZhongQingKanDian:
    """
    Crawler for the ZhongQingKanDian recommend feed.

    It asks the crawler API at ``API_BASE_URL`` for a recommend list, fetches
    the detail of each entry, and hands video entries that pass the configured
    rules to the ETL message queue.
    """
    API_BASE_URL = "http://8.217.192.46:8889"
    COMMON_HEADERS = {
        "Content-Type": "application/json"
    }
    # Maximum number of attempts send_request makes for a single request.
    MAX_RETRIES = 3
    # Timeout value handed to the HTTP session for every request.
    TIMEOUT = 30

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        Store the crawl configuration and create the clients the crawler uses.

        :param platform: platform name, e.g. "zhongqingkandian"
        :param mode: crawl mode, e.g. "recommend"
        :param rule_dict: mapping of rule name to limits; this class reads
            ``duration`` (``min``/``max``) and ``videos_cnt`` (``min``), e.g.
            {"videos_cnt": {"min": 100, "max": 0}, "duration": {"min": 30, "max": 1200}}
        :param user_list: accounts to publish under; one is chosen at random per video
        :param env: runtime environment, appended to the MQ topic name (default "prod")
        """
        # Caller-supplied configuration.
        self.platform, self.mode = platform, mode
        self.rule_dict, self.user_list = rule_dict, user_list
        self.env = env

        # Crawl progress: limit_flag is cleared once enough videos were sent.
        self.limit_flag = True
        self.download_cnt = 0
        self.expire_flag = False

        # External collaborators, created in the same order as before.
        self.mq = MQ(topic_name="topic_crawler_etl_" + env)
        self.aliyun_log = AliyunLogger(mode=mode, platform=platform)
        self.db_ops = DatabaseOperations(mode=mode, platform=platform)
        self.redis_ops = RedisOperations(mode=mode, platform=platform)
        # Comma-separated keywords; titles containing one get rewritten later.
        self.title_rule = FsData().get_title_rule()
        self.LocalLog = Local.logger(platform, mode)
        self.session = requests.Session()

    def send_request(self, path, data):
        """
        同步发送 POST 请求到指定路径，带有重试机制。
        :param path: 请求的 API 路径
        :param data: 请求的数据
        :return: 响应的 JSON 数据，如果请求失败则返回 None
        """
        full_url = f"{self.API_BASE_URL}{path}"

        for retry in range(self.MAX_RETRIES):
            try:
                response = self.session.post(full_url, data=data, timeout=self.TIMEOUT, headers=self.COMMON_HEADERS)
                response.raise_for_status()
                self.LocalLog.info(f"{path}响应数据:{response.json()}")
                return response.json()
            except Exception as e:
                tb_info = traceback.format_exc()
                self.LocalLog.info(f"{path}请求失败:{e}  \n{tb_info}")
                self.aliyun_log.logging(
                    code="3000",
                    message=f"请求 {path} 失败，错误信息: {str(e)}",
                    data={"path": path}
                )
                time.sleep(5)
        return None

    def is_response_valid(self, resp, url):
        """
        检查响应是否有效（状态码为 0 表示有效）。
        :param resp: 响应数据
        :param url: 请求的 URL
        :return: 如果响应有效则返回响应数据，否则返回 None
        """
        try:
            if resp and resp.get('code') != 0:
                self.aliyun_log.logging(
                    code="3000",
                    message=f"抓取{url}失败,请求失败,响应：{resp}"
                )
                self.LocalLog.info(f"{url}请求失败,响应：{resp}")
                return None
            return resp
        except Exception as e:
            tb_info = traceback.format_exc()
            self.aliyun_log.logging(
                code="3000",
                message=f"检查响应有效性时出错，错误信息: {str(e)}",
                data={"url": url, "resp": resp}
            )
            self.LocalLog.info(f"检查 {url} 响应有效性时出错:{e} \n{tb_info}")
            return None

    def req_recommend_list(self):
        """
        同步请求推荐视频列表。
        :return: 推荐视频列表的有效响应数据，如果请求失败则返回 None
        """
        try:
            url = '/crawler/zhong_qing_kan_dian/recommend'
            body = json.dumps({"cursor": ""})
            self.LocalLog.info(f"开始请求推荐{body}")
            resp = self.send_request(url, body)
            return self.is_response_valid(resp, url)
        except Exception as e:
            tb_info = traceback.format_exc()
            self.aliyun_log.logging(
                code="1003",
                message=f"请求推荐视频列表时发生异常，错误信息: {str(e)}\n{tb_info}",
                data={"url": url}
            )
            self.LocalLog.info(f"请求推荐视频列表 {url} 时发生异常:{str(e)}   \n{tb_info}")
            return None

    def req_detail(self, content_link, **kwargs):
        """
        同步请求视频详情。
        :param content_link: 视频内容链接
        :param label: 视频标签（如 "recommend" 或 "related"）
        :param kwargs: 额外的视频信息
        :return: 无返回值，处理视频详情信息
        """
        try:
            self.LocalLog.info(f"开始请求视频详情，链接: {content_link}")
            url = '/crawler/zhong_qing_kan_dian/detail'
            body = json.dumps({
                "content_link": content_link
            })
            resp = self.send_request(url, body)
            if not self.is_response_valid(resp, url):
                return
            data = resp.get("data", {}).get("data", {})
            if data.get("content_type") != "video":
                self.aliyun_log.logging(
                    code="3003",
                    message=f"跳过非视频内容",
                    data={"content_link": content_link}
                )
                self.LocalLog.info(f"跳过非视频内容，链接: {content_link}")
                return
            self.LocalLog.info(f"{content_link} 是视频")
            data.update(kwargs)
            self.process_video_obj(data)
        except Exception as e:
            tb_info = traceback.format_exc()
            self.aliyun_log.logging(
                code="1005",
                message=f"请求视频详情时发生异常，错误信息: {str(e)}",
                data={"content_link": content_link}
            )
            self.LocalLog.error(f"请求视频详情，链接 {content_link} 时发生异常:{e}  \n{tb_info}")

    def control_request_recommend(self):
        """
        控制推荐视频列表的请求和处理流程。
        :return: 无返回值，根据下载数量限制控制流程
        """
        while self.limit_flag:
            try:
                self.LocalLog.info(f"开始推荐视频列表的请求和处理流程,今日已爬推荐 {self.download_cnt} 个视频")

                recommend_resp = self.req_recommend_list()
                if not recommend_resp:
                    time.sleep(random.randint(5, 10))
                    continue
                recommend_list = recommend_resp.get("data", {}).get("data", [])
                self.LocalLog.info(f"获取的推荐列表长度：{len(recommend_list)}")
                for video_obj in recommend_list:
                    # if not self.limit_flag:
                    #     self.LocalLog.info(f"今日视频数量已达最大量{self.download_cnt}")
                    #     return
                    content_link = video_obj.get("share_url")
                    content_id = video_obj.get("id")
                    self.LocalLog.info(f"content_link == {content_link} \n content_id == {content_id}")
                    if not (content_link and content_id):
                        continue
                    # 当前内容id保存到redis
                    self.redis_ops.save_recommend_video(content_id)
                    time.sleep(random.randint(5, 10))
                    self.req_detail(content_link, **video_obj)
            except Exception as e:
                tb_info = traceback.format_exc()
                self.aliyun_log.logging(
                    code="3008",
                    message=f"控制推荐视频请求和处理时发生异常，错误信息: {str(e)}",
                    data={}
                )
                self.LocalLog.info(f"控制推荐视频请求和处理时发生异常:\n{tb_info}")
        self.LocalLog.info(f"循环结束，当前 limit_flag 值为: {self.limit_flag}")

    def process_video_obj(self, video_obj):
        """
        处理视频对象，包括检查视频时长、用户信息、保存数据等操作。
        :param video_obj: 视频对象，包含视频的各种信息
        :return: 无返回值，完成视频对象的处理
        """
        try:
            video_duration = video_obj["video_url_list"][0]['video_duration']
            video_id = video_obj['channel_content_id']
            # 检查视频ID是否存在
            if self.redis_ops.check_video_id_exists(video_id):
                self.aliyun_log.logging(
                    code="3004",
                    message=f"重复视频ID：{video_id}"
                )
                self.LocalLog.info(f"重复视频ID: {video_id}")
                return
            our_user = random.choice(self.user_list)
            trace_id = self.platform + str(uuid.uuid1())
            item = VideoItem()

            account_id = video_obj["channel_account_id"]
            account_name = video_obj["channel_account_name"]
            account_avatar = video_obj["avatar"]
            # 检查用户ID是否存在
            is_repeat_user = self.db_ops.check_user_id(account_id)
            if is_repeat_user:
                # 更新用户信息，使用异步方法并等待结果
                self.LocalLog.info(f"用户{account_id}已经存在数据库中")
                self.db_ops.update_user(account_id, account_name, account_avatar)
            else:
                self.LocalLog.info(f"用户{account_id}没在数据库中")
                # 插入用户信息，使用异步方法并等待结果
                self.db_ops.insert_user(account_id, account_name, account_avatar)
                self.aliyun_log.logging(code="1007", message=f"用户数据写入成功，用户ID：{account_id}")
                self.LocalLog.info(f"用户数据写入成功，用户ID: {account_id}")
            if video_duration > self.rule_dict.get("duration", {}).get("max", 1200) or video_duration < self.rule_dict.get("duration", {}).get("min", 30):
                self.aliyun_log.logging(
                    code="3005",
                    message=f"视频时长不满足条件[>=30s&<=1200s]视频ID：{video_obj['channel_content_id']}，视频时长：{video_duration}"
                )
                self.LocalLog.info(
                    f"视频时长不满足条件，视频ID: {video_obj['channel_content_id']}, 视频时长: {video_duration}")
                return

            item.add_video_info("video_id", video_obj['channel_content_id'])
            item.add_video_info("video_title", video_obj["title"])
            item.add_video_info("play_cnt",  self.convert_number(video_obj["read_num"]))
            item.add_video_info("publish_time_stamp", int(int(video_obj["publish_timestamp"]) / 1000))
            item.add_video_info("out_user_id", video_obj["channel_account_id"])
            item.add_video_info("cover_url", video_obj["image_url_list"][0]['image_url'])
            item.add_video_info("like_cnt", 0)
            item.add_video_info("collection_cnt", int(video_obj['collect_num']))
            item.add_video_info("share_cnt", int(video_obj["share_num"]))
            item.add_video_info("comment_cnt", int(video_obj["cmt_num"]))
            item.add_video_info("video_url", video_obj["video_url_list"][0]['video_url'])
            item.add_video_info("out_video_id", int(video_obj["channel_content_id"]))
            item.add_video_info("duration", video_obj["video_url_list"][0]['video_duration'])
            item.add_video_info("platform", self.platform)
            item.add_video_info("strategy", self.mode)
            item.add_video_info("session", f"{self.platform}-{int(time.time())}")
            item.add_video_info("user_id", our_user["uid"])
            item.add_video_info("user_name", our_user["nick_name"])

            mq_obj = item.produce_item()
            pipeline = PiaoQuanPipeline(
                platform=self.platform,
                mode=self.mode,
                rule_dict=self.rule_dict,
                env=self.env,
                item=mq_obj,
                trace_id=traceback.format_exc()
            )
            if pipeline.process_item():
                title_list = self.title_rule.split(",")
                title = video_obj["title"]
                contains_keyword = any(keyword in title for keyword in title_list)
                if contains_keyword:
                    new_title = GPT4oMini.get_ai_mini_title(title)
                    if new_title:
                        item.add_video_info("video_title", new_title)
                        current_time = datetime.now()
                        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
                        values = [
                            [
                                video_obj["video_url_list"][0]['video_url'],
                                video_obj["image_url_list"][0]['image_url'],
                                title,
                                new_title,
                                formatted_time,
                            ]
                        ]
                        FeishuUtils.insert_columns("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "ROWS", 1, 2)
                        time.sleep(0.5)
                        FeishuUtils.update_values("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "v8S6nL", "A2:Z2", values)

                self.mq.send_msg(mq_obj)
                self.download_cnt += 1
                self.aliyun_log.logging(
                    code="2009",
                    message=f"成功发送视频到etl",
                    data={"video_obj": video_obj}
                )
                # 保存视频ID
                self.redis_ops.save_video_id(video_obj['channel_content_id'])
                if self.download_cnt >= self.rule_dict.get("videos_cnt", {}).get("min", 100):
                    self.LocalLog.info("当日视频已达到最大爬取量")
                    self.limit_flag = False
        except Exception as e:
            tb_info = traceback.format_exc()
            self.aliyun_log.logging(
                code="1005",
                message=f"处理视频对象时发生异常，错误信息: {str(e)}",
                data={"video_obj": video_obj}
            )
            self.LocalLog.error(f"处理视频对象时发生异常: {e}\n{tb_info}")


    def convert_number(self, s):
        if not isinstance(s, str):
            return s
        try:
            return float(s.strip('万')) * 10000 if '万' in s else int(s)
        except ValueError:
            self.LocalLog.info(f"无法将 '{s}' 转换为有效的数字。")


    def run(self):
        """
        运行主流程，执行推荐视频和相关推荐视频的请求，直到达到下载数量限制。
        :return: 无返回值，程序运行的主逻辑
        """
        self.LocalLog.info("开始执行中青看点推荐抓取...")
        self.control_request_recommend()


if __name__ == '__main__':
    # Manual run with a small quota: stop after 2 videos of 30-1200 duration.
    crawl_rules = {'videos_cnt': {'min': 2, 'max': 0}, 'duration': {'min': 30, 'max': 1200}}
    publish_accounts = [{"uid": 81522822, "link": "中青看点推荐", "nick_name": "免不了俗"}]
    crawler = ZhongQingKanDian(
        platform="zhongqingkandian",
        mode="recommend",
        rule_dict=crawl_rules,
        user_list=publish_accounts,
    )
    crawler.run()