
add douyin homepage

zhangyong 1 year ago
commit 9121f4ef43

+ 138 - 0
douyin/douyin_author/douyin_author_scheduling_help.py

@@ -0,0 +1,138 @@
+import json
+import time
+from base64 import b64encode
+from functools import reduce
+from hashlib import md5
+from random import choice, randint
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlencode
+
+
+class DouYinHelper(object):
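+    """Helpers for signing Douyin web API requests: a ttwid cookie pool, msToken generation and the X-Bogus query signature."""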
+    ttwid_list = [
+        '1|G3wy_-RdLJnfG5P9zAcP54OM8_nTLZVrJxNi1lPzdmg|1693558867|5e43c47a424e939aaf7193b096e3c6f2274982ee64e9608c99c54d2a43982aca'
+    ]
+
+    @classmethod
+    def _0x30492c(cls, x: bytes, y: bytes, f: Optional[List[int]] = None) -> bytes:
+        """RC4加密, 可以用Crypto.Cipher.ARC4替代"""
+        c = 0
+        d = [i for i in range(256)]
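+        # key-scheduling phase: permute the 256-byte state array using the key bytes in x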
+        for b in range(256):
+            c = (c + d[b] + x[b % len(x)]) % 256
+            e = d[b]
+            d[b] = d[c]
+            d[c] = e
+        t, c = 0, 0
+
+        if not f:
+            f = []
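+        # keystream phase: XOR each byte of y with the next keystream byte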
+        for i in range(len(y)):
+            t = (t + 1) % 256
+            c = (c + d[t]) % 256
+            e = d[t]
+            d[t] = d[c]
+            d[c] = e
+            f.append(y[i] ^ d[(d[t] + d[c]) % 256])
+        return bytes(f)
+
+    @classmethod
+    def _0x485470(cls, a: str) -> List[int]:
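+        """Decode a lowercase hex string into a list of byte values."""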
+        _0x583e81 = [0] * 103
+        for i in range(10):
+            _0x583e81[i + 48] = i
+        for j in range(10, 16):
+            _0x583e81[j + 87] = j
+
+        b = len(a) >> 1
+        e = b << 1
+        d = [0] * b
+        c = 0
+        for f in range(0, e, 2):
+            d[c] = _0x583e81[ord(a[f])] << 4 | _0x583e81[ord(a[f + 1])]
+            c += 1
+        return d
+
+    @classmethod
+    def calc_x_bogus(cls, ua: str, query: str, data: Optional[Dict[str, Any]] = None) -> str:
+        """计算X_Bogus参数"""
+        query = query.encode()
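+        # MD5 the query string twice, feeding the raw digest bytes back in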
+        for _ in range(2):
+            query = md5(query).hexdigest()
+            query = bytes([int(query[i:i + 2], 16) for i in range(0, len(query), 2)])
+
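+        # MD5 the JSON-encoded request body the same way (empty bytes when there is no body)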
+        data = json.dumps(data, separators=(',', ':'), ensure_ascii=False).encode() if data else b''
+        for _ in range(2):
+            data = md5(data).hexdigest()
+            data = bytes([int(data[i:i + 2], 16) for i in range(0, len(data), 2)])
+
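+        # RC4-encrypt the User-Agent with a fixed key, base64 it, MD5 it, then hex-decode the digest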
+        a = b'\x00\x01\x0e'
+        ua = b64encode(cls._0x30492c(a, ua.encode())).decode()
+        ua = md5(ua.encode()).hexdigest()
+        ua = cls._0x485470(ua)
+
+        t = int(time.time())
+        fp = 2421646185  # a real canvas fingerprint value
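+        # signature payload: magic bytes, two bytes each from the query/body/UA digests, timestamp and fingerprint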
+        arr1 = [
+            64,
+            1 / 256,  # float 0.00390625; becomes 0 after the int() cast below
+            1 % 256,
+            14,
+            query[14],
+            query[15],
+            data[14],
+            data[15],
+            ua[14],
+            ua[15],
+            t >> 24 & 255,
+            t >> 16 & 255,
+            t >> 8 & 255,
+            t >> 0 & 255,
+            fp >> 24 & 255,
+            fp >> 16 & 255,
+            fp >> 8 & 255,
+            fp >> 0 & 255,
+        ]
+        reduce_num = reduce(lambda x, y: int(x) ^ int(y), arr1)
+        arr1.append(reduce_num)
+        arr2 = [int(arr1[i]) for i in range(len(arr1))]
+
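+        # RC4 the payload with key 0xff (result prefixed with bytes 2, 255), then encode with Douyin's custom base64 alphabet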
+        garble = cls._0x30492c(b'\xff', bytes(arr2), [2, 255])
+        m = 'Dkdpgh4ZKsQB80/Mfvw36XI1R25-WUAlEi7NLboqYTOPuzmFjJnryx9HVGcaStCe='
+        xb = ''
+        for i in range(0, len(garble), 3):
+            a, b, c = garble[i], garble[i + 1], garble[i + 2]
+            base_num = c | b << 8 | a << 16
+            c1 = m[(base_num & 16515072) >> 18]
+            c2 = m[(base_num & 258048) >> 12]
+            c3 = m[(base_num & 4032) >> 6]
+            c4 = m[(base_num & 63)]
+            xb += ''.join([c1, c2, c3, c4])
+        return xb
+
+    @classmethod
+    def get_full_query(cls, ua: str, extra_data: Dict[str, Any]) -> Dict[str, Any]:
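+        """Build the full query dict: default web-app parameters, a random msToken and the matching X-Bogus signature."""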
+        ms_token = b64encode(bytes([randint(0, 255) for _ in range(94)])).decode()
+        ms_token = ms_token.replace('+', '-').replace('/', '_').rstrip('=')
+
+        data = {
+            'device_platform': 'webapp',
+            'aid': '6383',
+            'channel': 'channel_pc_web',
+            'pc_client_type': '1',
+            'version_code': '190500',
+            'version_name': '19.5.0',
+            'cookie_enabled': 'true',
+            'platform': 'PC',
+            'msToken': ms_token,
+        }
+        data.update(extra_data)
+        query = urlencode(data, safe='=')
+        x_bogus = cls.calc_x_bogus(ua=ua, query=query, data=None)
+        data.update({'X-Bogus': x_bogus})
+        return data
+
+    @classmethod
+    def get_cookie(cls):
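+        """Return a Cookie header value built from a randomly chosen ttwid."""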
+        ttwid = choice(cls.ttwid_list)
+        return f'ttwid={ttwid}'

+ 314 - 0
douyin/douyin_author/douyin_author_scheduling_new.py

@@ -0,0 +1,314 @@
+# -*- coding: utf-8 -*-
+# @Time: 2023/11/07
+import os
+import random
+import sys
+import time
+
+import requests
+import json
+import urllib3
+
+
+sys.path.append(os.getcwd())
+from datetime import timedelta, date
+from common.common import Common
+from common import AliyunLogger
+from common.mq import MQ
+from requests.adapters import HTTPAdapter
+from common.scheduling_db import MysqlHelper
+from common.public import get_config_from_mysql, download_rule
+from douyin.douyin_author.douyin_author_scheduling_help import DouYinHelper
+
+
+
+class DouyinauthorScheduling:
+    platform = "抖音"
+    download_cnt = 0
+
+    @classmethod
+    def videos_cnt(cls, rule_dict):
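+        """Read the minimum videos_cnt from the rule dict, falling back to 1000 when unset."""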
+        videos_cnt = rule_dict.get("videos_cnt", {}).get("min", 0)
+        if videos_cnt == 0:
+            videos_cnt = 1000
+        return videos_cnt
+
+
+
+    @classmethod
+    def get_cookie(cls, log_type, crawler, env):
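+        """Load the first config entry containing a cookie for this crawler source from the crawler_config table."""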
+        select_sql = f""" select * from crawler_config where source="{crawler}" """
+        configs = MysqlHelper.get_values(log_type, crawler, select_sql, env, action="")
+        for config in configs:
+            if "cookie" in config["config"]:
+                cookie_dict = {
+                    "cookie_id": config["id"],
+                    "title": config["title"].strip(),
+                    "cookie": dict(eval(config["config"]))["cookie"].strip(),
+                    "update_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(config["update_time"] / 1000))),
+                    "operator": config["operator"].strip()
+                }
+                return cookie_dict
+
+    @classmethod
+    def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
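+        """Page through the author's posted videos, filter them by the scheduling rules and push qualifying ones to MQ."""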
+        mq = MQ(topic_name="topic_crawler_etl_" + env)
+        next_cursor = 0
+        while True:
+            cookie = cls.get_cookie(log_type, crawler, env)["cookie"]
+
+            time.sleep(random.randint(10, 50))
+            url = 'https://www.douyin.com/aweme/v1/web/aweme/post/'
+            account_id = user_dict["link"]
+            headers = {
+                'Accept': 'application/json, text/plain, */*',
+                'Accept-Language': 'zh-CN,zh;q=0.9',
+                'Cache-Control': 'no-cache',
+                'Cookie': cookie,
+                'Pragma': 'no-cache',
+                'Referer': f'https://www.douyin.com/user/{account_id}',
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
+                              'Chrome/118.0.0.0 Safari/537.36',
+            }
+            query = DouYinHelper.get_full_query(ua=headers['User-Agent'], extra_data={
+                'sec_user_id': account_id,
+                'max_cursor': next_cursor,
+                'locate_query': 'false',
+                'show_live_replay_strategy': '1',
+                'need_time_list': '1',
+                'time_list_query': '0',
+                'whale_cut_token': '',
+                'cut_version': '1',
+                'count': '18',
+                'publish_video_strategy_type': '2',
+            })
+            urllib3.disable_warnings()
+            s = requests.session()
+            # max_retries=3: retry each request up to 3 times
+            s.mount('http://', HTTPAdapter(max_retries=3))
+            s.mount('https://', HTTPAdapter(max_retries=3))
+            response = s.request(method='GET', url=url, headers=headers, params=query)
+            body = response.content.decode()
+            obj = json.loads(body)
+
+            has_more = obj.get('has_more', 0) == 1
+            next_cursor = str(obj.get('max_cursor')) if has_more else None
+            data = obj.get('aweme_list', [])
+            response.close()
+            if response.status_code != 200:
+                Common.logger(log_type, crawler).warning(f"data:{data}\n")
+                AliyunLogger.logging(
+                    code="2000",
+                    platform=crawler,
+                    mode=log_type,
+                    env=env,
+                    message=f"data:{data}\n"
+                )
+                return
+            elif len(data) == 0:
+                Common.logger(log_type, crawler).warning(f"没有更多视频啦 ~\n")
+                AliyunLogger.logging(
+                    code="2001",
+                    platform=crawler,
+                    mode=log_type,
+                    env=env,
+                    message=f"没有更多视频啦 ~\n"
+                )
+                return
+            for i in range(len(data)):
+                try:
+                    entity_type = data[i].get('search_impr', {}).get('entity_type')
+                    if entity_type == 'GENERAL':
+                        Common.logger(log_type, crawler).info('扫描到一条视频\n')
+                        AliyunLogger.logging(
+                            code="1001",
+                            platform=crawler,
+                            mode=log_type,
+                            env=env,
+                            message='扫描到一条视频\n'
+                        )
+                        video_id = data[i].get('aweme_id')  # video id
+                        video_title = data[i].get('desc', "").strip().replace("\n", "") \
+                            .replace("/", "").replace("\\", "").replace("\r", "") \
+                            .replace(":", "").replace("*", "").replace("?", "") \
+                            .replace("?", "").replace('"', "").replace("<", "") \
+                            .replace(">", "").replace("|", "").replace(" ", "") \
+                            .replace("&NBSP", "").replace(".", "。").replace(" ", "") \
+                            .replace("'", "").replace("#", "").replace("Merge", "")
+                        publish_time_stamp = data[i].get('create_time')  # publish timestamp
+                        publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
+
+                        video_url = data[i].get('video').get('play_addr').get('url_list')[0]  # video URL
+                        cover_url = data[i].get('video').get('cover').get('url_list')[0]  # cover image URL
+                        digg_count = int(data[i].get('statistics').get('digg_count'))  # like count
+                        comment_count = int(data[i].get('statistics').get('comment_count'))  # comment count
+                        # collect_count = data[i].get('statistics').get('collect_count')  # favourite count
+                        share_count = int(data[i].get('statistics').get('share_count'))  # share count
+                        date_five_days_ago_string = (date.today() + timedelta(days=-5)).strftime("%Y-%m-%d %H:%M:%S")
+                        rule = publish_time_str > date_five_days_ago_string
+                        if i > 2 and not rule:
+                            break
+                        if not rule:
+                            Common.logger(log_type, crawler).info(f"发布时间超过5天,发布时间:{publish_time_str}\n")
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message=f"发布时间超过5天,发布时间:{publish_time_str}\n"
+                            )
+                            continue
+                        video_percent = round(share_count / digg_count, 2) if digg_count else 0
+                        if digg_count < 50000 and digg_count < 50:
+                            if video_percent < 0.02:
+                                Common.logger(log_type, crawler).info(f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n")
+                                AliyunLogger.logging(
+                                    code="2004",
+                                    platform=crawler,
+                                    mode=log_type,
+                                    env=env,
+                                    message=f"不符合条件:分享/点赞-{video_percent},点赞量-{digg_count}\n"
+                                )
+                                continue
+                        video_dict = {'video_title': video_title,
+                                      'video_id': video_id,
+                                      'play_cnt': 0,
+                                      'like_cnt': digg_count,
+                                      'comment_cnt': comment_count,
+                                      'share_cnt': share_count,
+                                      'video_width': 0,
+                                      'video_height': 0,
+                                      'duration': 0,
+                                      'publish_time_stamp': publish_time_stamp,
+                                      'publish_time_str': publish_time_str,
+                                      'user_name': "douyin",
+                                      'user_id': video_id,
+                                      'avatar_url': '',
+                                      'cover_url': cover_url,
+                                      'video_url': video_url,
+                                      'session': f"douyin-{int(time.time())}"}
+                        for k, v in video_dict.items():
+                            Common.logger(log_type, crawler).info(f"{k}:{v}")
+                        AliyunLogger.logging(
+                            code="1000",
+                            platform=crawler,
+                            mode=log_type,
+                            env=env,
+                            message=f"{video_dict}\n"
+                        )
+                        if int((int(time.time()) - int(publish_time_stamp)) / (3600*24)) > int(rule_dict.get("period", {}).get("max", 1000)):
+                            Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message=f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n'
+                            )
+                            return
+                        if video_dict["video_id"] == '' or video_dict["cover_url"] == '' or video_dict["video_url"] == '':
+                            Common.logger(log_type, crawler).info('无效视频\n')
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message='无效视频\n'
+                            )
+                        elif download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
+                            Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message='不满足抓取规则\n'
+                            )
+                        elif any(str(word) in video_dict["video_title"]
+                                 for word in get_config_from_mysql(log_type=log_type,
+                                                                   source=crawler,
+                                                                   env=env,
+                                                                   text="filter",
+                                                                   action="")):
+                            Common.logger(log_type, crawler).info('已中过滤词\n')
+                            AliyunLogger.logging(
+                                code="2004",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message='已中过滤词\n'
+                            )
+                        elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
+                            Common.logger(log_type, crawler).info('视频已下载\n')
+                            AliyunLogger.logging(
+                                code="2002",
+                                platform=crawler,
+                                mode=log_type,
+                                env=env,
+                                message='视频已下载\n'
+                            )
+                        else:
+                            video_dict["out_user_id"] = video_dict["user_id"]
+                            video_dict["platform"] = crawler
+                            video_dict["strategy"] = log_type
+                            video_dict["out_video_id"] = video_dict["video_id"]
+                            video_dict["width"] = video_dict["video_width"]
+                            video_dict["height"] = video_dict["video_height"]
+                            video_dict["crawler_rule"] = json.dumps(rule_dict)
+                            video_dict["user_id"] = user_dict["uid"]
+                            video_dict["publish_time"] = video_dict["publish_time_str"]
+                            video_dict["strategy_type"] = log_type
+                            mq.send_msg(video_dict)
+                            cls.download_cnt += 1
+
+                except Exception as e:
+                    Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
+                    AliyunLogger.logging(
+                        code="3000",
+                        platform=crawler,
+                        mode=log_type,
+                        env=env,
+                        message=f"抓取单条视频异常:{e}\n"
+                    )
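+
+            # presumably the loop should stop paging once the API reports no more posts
+            if not has_more:
+                return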
+
+    @classmethod
+    def repeat_video(cls, log_type, crawler, video_id, env):
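+        """Return how many crawler_video rows already exist for this video id on this platform."""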
+        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
+        return len(repeat_video)
+
+
+
+    @classmethod
+    def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
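+        """Crawl the homepage videos of every author in the scheduled user list."""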
+        for user_dict in user_list:
+            try:
+                Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 主页视频")
+                AliyunLogger.logging(
+                    code="2000",
+                    platform=crawler,
+                    mode=log_type,
+                    env=env,
+                    message=f"开始抓取 {user_dict['nick_name']} 主页视频"
+                )
+                cls.download_cnt = 0
+                cls.get_videoList(log_type=log_type,
+                                  crawler=crawler,
+                                  user_dict=user_dict,
+                                  rule_dict=rule_dict,
+                                  env=env)
+            except Exception as e:
+                Common.logger(log_type, crawler).warning(f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n")
+                AliyunLogger.logging(
+                    code="3000",
+                    platform=crawler,
+                    mode=log_type,
+                    env=env,
+                    message=f"抓取用户{user_dict['nick_name']}主页视频时异常:{e}\n"
+                )
+
+
+if __name__ == "__main__":
+    print(DouyinauthorScheduling.get_cookie("author", "douyin", "prod")["cookie"])

+ 1 - 1
douyin/douyin_main/run_dy_author.py

@@ -9,7 +9,7 @@ sys.path.append(os.getcwd())
 from common.common import Common
 from common.public import get_consumer, ack_message, task_fun_mq
 from common.scheduling_db import MysqlHelper
-from douyin.douyin_author.douyin_author_scheduling import DouyinauthorScheduling
+from douyin.douyin_author.douyin_author_scheduling_new import DouyinauthorScheduling
 
 
 def main(log_type, crawler, topic_name, group_id, env):

+ 1 - 1
douyin/douyin_main/run_dy_author_dev.py

@@ -5,7 +5,7 @@ import os
 import sys
 sys.path.append(os.getcwd())
 from common.common import Common
-from douyin.douyin_author.douyin_author_scheduling import DouyinauthorScheduling
+from douyin.douyin_author.douyin_author_scheduling_new import DouyinauthorScheduling
 
 
 def douyin_author_main(log_type, crawler, env):