Server
/
piaoquan_crawler


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
							# -*- coding: utf-8 -*-
# @Time: 2023/11/07
import os
import random
import sys
import time

import cv2
import requests
import json
import urllib3


sys.path.append(os.getcwd())
from datetime import timedelta, date
from common.common import Common
from common import AliyunLogger
from common.mq import MQ
from requests.adapters import HTTPAdapter
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule
from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
from common.limit import AuthorLimit


class DouyinauthorScheduling:
    platform = "抖音"
    download_cnt = 0
    limiter = AuthorLimit(platform="douyin", mode="author")


    @classmethod
    def videos_cnt(cls, rule_dict):
        videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
        if videos_cnt == 0:
            videos_cnt = 1000
        return videos_cnt


    @classmethod
    def get_cookie(cls, log_type, crawler, env):
        select_sql = f""" select * from crawler_config where source="{crawler}" """
        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
        for config in configs:
            if "cookie" in config["config"]:
                cookie_dict = {
                    "cookie_id": config["id"],
                    "title": config["title"].strip(),
                    "cookie": dict(eval(config["config"]))["cookie"].strip(),
                    "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
                    "operator": config["operator"].strip()
                }
                return cookie_dict

    @classmethod
    def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        next_cursor = 0
        special = 0
        for i in range(3):
        # while True:
            flag = user_dict["link"].split("_")[0]
            if flag == "V1":
                special = 0.01
                rule_dict = {
                    'period': {"min": 15, "max": 0},
                }
            elif flag == "V2":
                special = 0.01
                rule_dict = {
                    'period': {"min": 15, "max": 0},
                }
            elif flag == "V3":
                special = 0.01
                rule_dict = {
                    'period': {"min": 15, "max": 0},
                }
            cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
            if user_dict['link'][0] == "V":
                link = user_dict["link"][3:]
            else:
                link = user_dict["link"]
            time.sleep(random.randint(5, 10))
            url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
            account_id = link
            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'no-cache',
                'Cookie': cookie,
                # 'Cookie': "ttwid=1%7Cyj16cpJ4yxvUv9QWry1Uz3MoZ3Ci7FHGQR4qW3W70ac%7C1704436922%7C435637f1aa3c55cbed4587acf02003b5d74cfcac945a0df3893e041a288ce3c1; bd_ticket_guard_client_web_domain=2; passport_assist_user=CjzcKtls0e65w_tjpggJoAB9du8ZDR8XRxt178-cIHsJhxCRZPLxqAj0PHWKZ4g2xmxWzTHsK7mi4vxt0lcaSgo8W1SZlyQoj2vcxlToyotQ902cRuWULW6HqkHEJHMRcIoo_Y7maHi82HqNSTCVE5xBSQnTOXW31hxsJ4EIENPsxQ0Yia_WVCABIgED619Mew%3D%3D; n_mh=uPso8EqWH8OYYER0xnVFOgB1e9TbTzK9J1CBmr4IQVA; sso_uid_tt=f829ccc6652eae601ff8e56da1fccdb5; sso_uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; toutiao_sso_user=d2fa09f7626319fb35fd2553b5ec5b76; toutiao_sso_user_ss=d2fa09f7626319fb35fd2553b5ec5b76; LOGIN_STATUS=1; store-region=cn-hn; store-region-src=uid; d_ticket=dd5890b4b8f873453c1f1a090b9aa6ccb205c; sid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; ssid_ucp_v1=1.0.0-KGU1NTNlNmFjMGJmZTEwNjFhYWZjZTMyMGEzYmI4YmVjOTdjYzU0N2YKGQjPnMP64wEQiLzirQYY7zEgDDgGQPQHSAQaAmxxIiA1M2NmNzM1ZjUyMzA1ZTkxZDMyZTEyMmVhM2ZhYTQ1YQ; dy_swidth=1449; dy_sheight=906; __live_version__=%221.1.1.8009%22; live_use_vvc=%22false%22; xgplayer_user_id=510446933624; uid_tt=f829ccc6652eae601ff8e56da1fccdb5; uid_tt_ss=f829ccc6652eae601ff8e56da1fccdb5; sid_tt=d2fa09f7626319fb35fd2553b5ec5b76; sessionid=d2fa09f7626319fb35fd2553b5ec5b76; sessionid_ss=d2fa09f7626319fb35fd2553b5ec5b76; passport_csrf_token=34235b71f9c981e07032bd9041848f1e; passport_csrf_token_default=34235b71f9c981e07032bd9041848f1e; download_guide=%223%2F20240313%2F1%22; publish_badge_show_info=%220%2C0%2C0%2C1710488198823%22; EnhanceDownloadGuide=%220_0_0_0_2_1710734032%22; sid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; ssid_ucp_sso_v1=1.0.0-KGFiZDU4YWZmOTcwOWI1ZGIzOWUxNTBjZTc2Y2IxNmY4MjA3NDU1ZjEKHQjPnMP64wEQ2b7lrwYY7zEgDDDzrarJBTgGQPQHGgJobCIgZDJmYTA5Zjc2MjYzMTlmYjM1ZmQyNTUzYjVlYzViNzY; sid_guard=d2fa09f7626319fb35fd2553b5ec5b76%7C1710841689%7C5184001%7CSat%2C+18-May-2024+09%3A48%3A10+GMT; __ac_nonce=065fb9e6800dd2b1e14b1; __ac_signature=_02B4Z6wo00f01l39XCgAAIDBYFRGtw.YiKZd3ViAAPKTmnfg2zaxyzrXD6iNtRPPtcoSm5zbE6snYTcix8FTXgxsxQK195O6vG-zEOdZqKTq-ouYFPANlN1Jmu1.ZxBLTzOstKAOorrHEYQN06; douyin.com; xg_device_score=7.654580937785368; device_web_cpu_core=16; device_web_memory_size=8; architecture=amd64; csrf_session_id=f524ab33e8de0e4e922d8b48c362e6c1; strategyABtestKey=%221710988910.691%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.281%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A1%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; my_rd=2; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710989305107%2F0%22; SEARCH_RESULT_LIST_TYPE=%22single%22; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1449%2C%5C%22screen_height%5C%22%3A906%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A10%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A50%7D%22; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCT05aUWpZcmNjMWhVYXhidlVoUG9uUC9lV0phYzBNbnhTQldxUmZESGFZQ290cUhOSE1GdmJ2ZTdSY1REdVpiemdHUU82cS90dWhzNVdnTmxaeVR3TzQ9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; tt_scid=IMdpIr4sRF90L0IaD0TdSlOy0Sm1rX-hlw5-OAxNAcisxsztezRzg3356KIGHx4cee78; pwa2=%220%7C0%7C1%7C0%22; odin_tt=f05b7460c2544b994f5deae19a5bbf0828870c64564040ef36c9d7cb40da9e44bc41ee52b1cac76d042b80fc4dcb4394; msToken=Tq7-Wv99mG0yhHDIz7-R1fxSAQyf8R7dNAvHMxnjrbWpbi531L8TI6VdQhQSDTAl8jQQJr9IWhJpbRu3E01IgC5uQ7DE_5oGYW046WpPb_bjluz255YhMdqfJ3Qmeg==; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAABDh3DP0PxVP5lMvEKL8Fhg8sSD732Z2rOc0db7nqY9o%2F1711036800000%2F0%2F1710990128301%2F0%22; msToken=YstZKHMONS09-8nDsHM40jwWV2nr5E1wYmv7cBeAmeY02prkpNLjRwB8C3tp52nc1hxvL5R1F-hkmvDSc0TNeNxz-DNodK3GMV8dK3gkVT8DVPKeVL5umskY5Am5; passport_fe_beating_status=false; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22",
                'Pragma': 'no-cache',
                'Referer': f'https://www.douyin.com/user/{account_id}',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/118.0.0.0 Safari/537.36',
            }
            query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
                'sec_user_id': account_id,
                'max_cursor': next_cursor,
                'locate_query': 'false',
                'show_live_replay_strategy': '1',
                'need_time_list': '1',
                'time_list_query': '0',
                'whale_cut_token': '',
                'cut_version': '1',
                'count': '18',
                'publish_video_strategy_type': '2',
            })
            urllib3.disable_warnings()
            s = requests.session()
            # max_retries=3 重试3次
            s.mount('http://', HTTPAdapter(max_retries=3))
            s.mount('https://', HTTPAdapter(max_retries=3))
            response = requests.request(method='GET', url=url, headers=headers, params=query)
            body = response.content.decode()
            obj = json.loads(body)

            has_more = True if obj.get('has_more', 0) == 1 else False
            next_cursor = str(obj.get('max_cursor')) if has_more else None
            data = obj.get('aweme_list', [])
            response.close()
            if response.status_code != 200:
                Common.logger(log_type, crawler).warning(f"data:{data}\n")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"data:{data}\n"
                )
                return
            elif len(data) == 0:
                Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
                AliyunLogger.logging(
                    code="2001",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"没有更多视频啦 ~\n"
                )
                return
            for i in range(len(data)):
                try:
                    entity_type = data[i].get('search_impr').get('entity_type')
                    if entity_type == 'GENERAL':
                        Common.logger(log_type, crawler).info('扫描到一条视频\n')
                        AliyunLogger.logging(
                            code="1001",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message='扫描到一条视频\n'
                        )
                        is_top = data[i].get('is_top')  # 是否置顶

                        video_id = data[i].get('aweme_id')  # 文章id
                        video_title = data[i].get('desc', "").strip().replace("\n", "") \
                            .replace("/", "").replace("\\", "").replace("\r", "") \
                            .replace(":", "").replace("*", "").replace("？", "") \
                            .replace("?", "").replace('"', "").replace("<", "") \
                            .replace(">", "").replace("|", "").replace(" ", "") \
                            .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
                            .replace("'", "").replace("#", "").replace("Merge", "")
                        publish_time_stamp = data[i].get('create_time')  # 发布时间
                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))

                        # video_url = data[i].get('video').get('play_addr').get('url_list')[0]  # 视频链接
                        video_uri = data[i].get('video', {}).get('play_addr', {}).get('uri')
                        ratio = f'{data[i].get("video", {}).get("height")}p'
                        video_url = f'https://www.iesdouyin.com/aweme/v1/play/?video_id={video_uri}&ratio={ratio}&line=0' # 视频链接
                        cover_url = data[i].get('video').get('cover').get('url_list')[0] # 视频封面
                        digg_count = int(data[i].get('statistics').get('digg_count')) # 点赞
                        comment_count = int(data[i].get('statistics').get('comment_count'))  # 评论
                        # collect_count = data[i].get('statistics').get('collect_count')  # 收藏
                        share_count = int(data[i].get('statistics').get('share_count'))  # 转发
                        if share_count < 500:
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f'分享小于500\n'
                            )
                            continue
                        video_percent = '%.2f' % (share_count / digg_count)
                        special = float(0.25)
                        if float(video_percent) < special:
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f'分享/点赞小于25%\n'
                            )
                            continue
                        duration = cls.video_duration(video_url)
                        if int(duration) < 45:
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message=f'视频时常小于45秒\n'
                            )
                            continue
                        # if special != 0:
                        #     if share_count != 0:
                        #         video_percent = '%.2f' % (share_count / digg_count)
                        #         special = float(special)
                        #         if float(video_percent) < special:
                        #             Common.logger(log_type, crawler).info(f"不符合条件：分享/点赞-{video_percent}\n")
                        #             AliyunLogger.logging(
                        #                 code="2004",
                        #                 platform=crawler,
                        #                 mode=log_type,
                        #                 env=env,
                        #                 message=f"不符合条件：分享/点赞-{video_percent},点赞量-{digg_count}\n"
                        #             )
                        #             continue
                        video_dict = {'video_title': video_title,
                                      'video_id': video_id,
                                      'play_cnt': 0,
                                      'like_cnt': digg_count,
                                      'comment_cnt': comment_count,
                                      'share_cnt': share_count,
                                      'video_width': 0,
                                      'video_height': 0,
                                      'duration': 0,
                                      'publish_time_stamp': publish_time_stamp,
                                      'publish_time_str': publish_time_str,
                                      'user_name': "douyin",
                                      'user_id': video_id,
                                      'avatar_url': '',
                                      'cover_url': cover_url,
                                      'video_url': video_url,
                                      'session': f"douyin-{int(time.time())}"}
                        for k, v in video_dict.items():
                            Common.logger(log_type, crawler).info(f"{k}:{v}")
                        AliyunLogger.logging(
                            code="1000",
                            platform=crawler,
                            mode=log_type,
                            env=env,
                            message=f"{video_dict}\n"
                        )
                        if is_top == 0:
                            if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                                Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                                AliyunLogger.logging(
                                    code="2004",
                                    platform=crawler,
                                    mode=log_type,
                                    env=env,
                                    message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
                                )
                                return
                        if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
                            Common.logger(log_type, crawler).info('无效视频\n')
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='无效视频\n'
                            )
                        elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='不满足抓取规则\n'
                            )
                        elif any(str(word) if str(word) in video_dict["video_title"] else False
                                 for word in get_config_from_mysql(log_type=log_type,
                                                                   source=crawler,
                                                                   env=env,
                                                                   text="filter",
                                                                   action="")) is True:
                            Common.logger(log_type, crawler).info('已中过滤词\n')
                            AliyunLogger.logging(
                                code="2004",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='已中过滤词\n'
                            )
                        elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                            Common.logger(log_type, crawler).info('视频已下载\n')
                            AliyunLogger.logging(
                                code="2002",
                                platform=crawler,
                                mode=log_type,
                                env=env,
                                message='视频已下载\n'
                            )
                        else:
                            video_dict["out_user_id"] = video_dict["user_id"]
                            video_dict["platform"] = crawler
                            video_dict["strategy"] = log_type
                            video_dict["out_video_id"] = video_dict["video_id"]
                            video_dict["width"] = video_dict["video_width"]
                            video_dict["height"] = video_dict["video_height"]
                            video_dict["crawler_rule"] = json.dumps(rule_dict)
                            video_dict["user_id"] = user_dict["uid"]
                            video_dict["publish_time"] = video_dict["publish_time_str"]
                            video_dict["strategy_type"] = log_type
                            limit_flag = cls.limiter.author_limitation(user_id=video_dict['user_id'])
                            if limit_flag:
                                mq.send_msg(video_dict)
                                cls.download_cnt += 1
                                AliyunLogger.logging(code="1002", message="成功发送至 ETL", data=video_dict)

                except Exception as e:
                    Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
                    AliyunLogger.logging(
                        code="3000",
                        platform=crawler,
                        mode=log_type,
                        env=env,
                        message=f"抓取单条视频异常:{e}\n"
                    )

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)


    @classmethod
    def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
        for user_dict in user_list:
            try:
                Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
                AliyunLogger.logging(
                    code="2000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"开始抓取 {user_dict['nick_name']} 主页视频"
                )
                cls.download_cnt = 0
                cls.get_videoList(log_type=log_type,
                                  crawler=crawler,
                                  user_dict=user_dict,
                                  rule_dict=rule_dict,
                                  env=env)
            except Exception as e:
                Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
                AliyunLogger.logging(
                    code="3000",
                    platform=crawler,
                    mode=log_type,
                    env=env,
                    message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
                )

    @classmethod
    def video_duration(cls, filename):
        cap = cv2.VideoCapture(filename)
        if cap.isOpened():
            rate = cap.get(5)
            frame_num = cap.get(7)
            duration = frame_num / rate
            return duration
        return 0


if __name__ == "__main__":
    print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])
    pass