# Server / automatic_crawler
							import os
import random
import sys
import time
import uuid
import json

from datetime import datetime

import cv2
import requests

from application.common import Feishu
from application.common.feishu import FsData
from application.common.feishu.feishu_utils import FeishuUtils
from application.common.gpt import GPT4oMini

sys.path.append(os.getcwd())

from application.items import VideoItem
from application.pipeline import PiaoQuanPipeline
from application.common.messageQueue import MQ
from application.common.log import AliyunLogger
from application.common.mysql import MysqlHelper


class BSZHRecommend(object):
    """
    Crawler for the Ben Shan Zhu Fu (本山祝福) recommend feed.

    Pages through the recommend endpoint, turns every returned video into a
    ``VideoItem``, runs it through ``PiaoQuanPipeline`` and sends accepted
    items to the ETL message queue until the configured video limit is hit.
    """

    # Endpoint returning one page of the recommend feed per cursor.
    RECOMMEND_URL = "http://8.217.192.46:8889/crawler/ben_shan_zhu_fu/recommend"
    # Upper bound on pages requested in a single run.
    MAX_PAGES = 199
    # Attempts per page before the crawl gives up.
    REQUEST_RETRIES = 3
    # Seconds to wait for the endpoint; without a timeout a stalled
    # connection would block the crawler forever.
    REQUEST_TIMEOUT = 30
    # Videos sent per run when rule_dict has no "videos_cnt.min" entry.
    DEFAULT_VIDEO_LIMIT = 200

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        :param platform: platform identifier used for logging, trace ids and item fields
        :param mode: crawl strategy name, stored on every item as "strategy"
        :param rule_dict: rule configuration; ``videos_cnt.min`` caps the videos sent per run
        :param user_list: user dicts providing "uid" and "nick_name"; one is picked at random per video
        :param env: environment name, used as the suffix of the MQ topic
        """
        self.limit_flag = False
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(mode=self.mode, platform=self.platform)
        # The helper takes the platform name, like AliyunLogger above; the
        # crawler instance itself used to be passed here by mistake.
        self.mysql = MysqlHelper(mode=self.mode, platform=self.platform)

    def get_video_duration(self, video_link: str) -> int:
        """
        Return the duration of a video in whole seconds.

        :param video_link: path or URL that OpenCV can open
        :return: frame count divided by FPS, truncated; 0 when the video
                 cannot be opened or reports a non-positive FPS
        """
        cap = cv2.VideoCapture(video_link)
        try:
            if not cap.isOpened():
                return 0
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            # Some streams report 0 FPS; avoid a ZeroDivisionError.
            if fps <= 0:
                return 0
            return int(frame_count / fps)
        finally:
            # Always free the underlying decoder/stream handle.
            cap.release()

    def _fetch_recommend_page(self, cursor):
        """
        Request one page of the recommend feed.

        :param cursor: cursor value sent to the endpoint
        :return: the decoded JSON body when its "code" is 0, otherwise None
                 after all retries are exhausted
        """
        payload = json.dumps({
            "cursor": f"{cursor}"
        })
        headers = {
            'Content-Type': 'application/json'
        }
        for _ in range(self.REQUEST_RETRIES):
            try:
                response = requests.request(
                    "POST",
                    self.RECOMMEND_URL,
                    headers=headers,
                    data=payload,
                    timeout=self.REQUEST_TIMEOUT,
                )
                body = response.json()
            except (requests.RequestException, ValueError):
                # Network failure or a non-JSON body: treat as a failed attempt.
                body = None
            if isinstance(body, dict) and body.get("code") == 0:
                return body
            time.sleep(2)
        return None

    def get_recommend_list(self):
        """
        Crawl the recommend feed page by page and process every video.

        Stops when a page cannot be fetched, when a page contains no videos,
        when the video limit is reached, or after ``MAX_PAGES`` pages.
        """
        print("本山祝福开始")

        next_cursor = 1
        title_rule = FsData().get_title_rule()
        for page in range(1, self.MAX_PAGES + 1):
            response = self._fetch_recommend_page(next_cursor)
            if response is None:
                self.aliyun_log.logging(
                    code="3000",
                    message="抓取单条视频失败,请求失败"
                )
                return

            page_data = response.get("data") or {}
            video_list = page_data.get("data") or []
            if not video_list:
                # Nothing to process; stop instead of re-requesting the same
                # cursor in a tight loop.
                return
            # Advance the cursor once per page so it moves forward even when
            # individual videos fail below.
            next_cursor = page_data.get("next_cursor", next_cursor)

            # index is the 1-based position of the video inside this page.
            for index, video_obj in enumerate(video_list, 1):
                try:
                    self.aliyun_log.logging(
                        code="1001", message="扫描到一条视频", data=video_obj
                    )
                    self.process_video_obj(video_obj, title_rule)
                except Exception as e:
                    # One bad video must not abort the whole crawl.
                    self.aliyun_log.logging(
                        code="3000",
                        message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(
                            page, index, e
                        ),
                    )
                if self.limit_flag:
                    return
                time.sleep(random.randint(5, 10))

    @staticmethod
    def _title_matches_rule(title, title_rule) -> bool:
        """
        Tell whether ``title`` contains any keyword of the comma-separated rule.

        Empty keywords (empty rule, trailing comma) are ignored, because the
        empty string is a substring of every title and would match everything.
        """
        if not title or not title_rule:
            return False
        keywords = [kw.strip() for kw in title_rule.split(",") if kw.strip()]
        return any(kw in title for kw in keywords)

    def _download_limit(self) -> int:
        """Return the number of videos to send per run, from ``rule_dict``."""
        return int(
            self.rule_dict.get("videos_cnt", {}).get("min", self.DEFAULT_VIDEO_LIMIT)
        )

    def _record_title_rewrite(self, video_obj, title, new_title):
        """Write the original and rewritten title to the Feishu sheet."""
        formatted_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        values = [
            [
                video_obj["video_url"],
                video_obj["video_cover"],
                title,
                new_title,
                formatted_time,
            ]
        ]
        # presumably inserts a fresh row that the update below fills in —
        # TODO confirm against FeishuUtils
        FeishuUtils.insert_columns("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "aTSJH4", "ROWS", 1, 2)
        time.sleep(0.5)
        FeishuUtils.update_values("U5dXsSlPOhiNNCtEfgqcm1iYnpf", "aTSJH4", "A2:Z2", values)

    def process_video_obj(self, video_obj, title_rule):
        """
        Build an item from one feed entry, run the pipeline and send it to ETL.

        :param video_obj: feed entry; the keys "nid", "title", "update_time",
                          "video_cover" and "video_url" are read
        :param title_rule: comma-separated keywords; a title containing one of
                           them is rewritten through ``GPT4oMini``
        """
        time.sleep(random.randint(3, 8))
        trace_id = self.platform + str(uuid.uuid1())
        our_user = random.choice(self.user_list)
        item = VideoItem()
        item.add_video_info("video_id", video_obj["nid"])
        item.add_video_info("video_title", video_obj["title"])
        item.add_video_info("play_cnt", 0)
        item.add_video_info("publish_time_stamp", int(video_obj["update_time"]))
        item.add_video_info("out_user_id", video_obj["nid"])
        item.add_video_info("cover_url", video_obj["video_cover"])
        item.add_video_info("like_cnt", 0)
        item.add_video_info("video_url", video_obj["video_url"])
        item.add_video_info("out_video_id", video_obj["nid"])
        item.add_video_info("platform", self.platform)
        item.add_video_info("strategy", self.mode)
        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
        item.add_video_info("user_id", our_user["uid"])
        item.add_video_info("user_name", our_user["nick_name"])
        mq_obj = item.produce_item()
        pipeline = PiaoQuanPipeline(
            platform=self.platform,
            mode=self.mode,
            rule_dict=self.rule_dict,
            env=self.env,
            item=mq_obj,
            trace_id=trace_id,
        )
        if not pipeline.process_item():
            return

        title = video_obj["title"]
        if self._title_matches_rule(title, title_rule):
            new_title = GPT4oMini.get_ai_mini_title(title)
            if new_title:
                # NOTE(review): mq_obj was produced before this update; the new
                # title reaches the queue only if produce_item() returned the
                # item's live dict — confirm against VideoItem.
                item.add_video_info("video_title", new_title)
                self._record_title_rewrite(video_obj, title, new_title)

        self.download_cnt += 1
        self.mq.send_msg(mq_obj)
        self.aliyun_log.logging(code="1002", message="成功发送至 ETL", data=mq_obj)
        if self.download_cnt >= self._download_limit():
            self.limit_flag = True

    def run(self):
        """Entry point: crawl the recommend feed once."""
        self.get_recommend_list()


if __name__ == "__main__":
    # Manual debug entry point: crawl the recommend feed once using an
    # inline task configuration as the only user_list entry.
    # NOTE(review): process_video_obj reads "uid" and "nick_name" from the
    # chosen user_list entry, and this dict has neither — confirm the
    # intended test user.
    debug_entry = {
        "createTime": 1684311893899,
        "id": 8,
        "interval": 7200,
        "machine": "aliyun",
        "mode": "recommend",
        "operator": "王雪珂",
        "rule": "[{\"like_cnt\":{\"min\":0,\"max\":0}}]",
        "source": "benshanzhufu",
        "spiderName": "run_bszf_recommend",
        "startTime": 1730452800000,
        "status": 0,
        "taskName": "本山祝福",
        "updateTime": 1730452617817,
    }
    crawler = BSZHRecommend(
        platform="benshanzhufu",
        mode="recommend",
        rule_dict={},
        user_list=[debug_entry],
    )
    crawler.get_recommend_list()