|  | @@ -0,0 +1,240 @@
 | 
											
												
													
														|  | 
 |  | +import json
 | 
											
												
													
														|  | 
 |  | +import os
 | 
											
												
													
														|  | 
 |  | +import random
 | 
											
												
													
														|  | 
 |  | +import sys
 | 
											
												
													
														|  | 
 |  | +import time
 | 
											
												
													
														|  | 
 |  | +import uuid
 | 
											
												
													
														|  | 
 |  | +import requests
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +from common.mq import MQ
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +sys.path.append(os.getcwd())
 | 
											
												
													
														|  | 
 |  | +from common.common import Common
 | 
											
												
													
														|  | 
 |  | +from common import AliyunLogger, PiaoQuanPipeline
 | 
											
												
													
														|  | 
 |  | +from common.public import get_config_from_mysql, clean_title
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def tunnel_proxies():
 | 
											
												
													
														|  | 
 |  | +    # 隧道域名:端口号
 | 
											
												
													
														|  | 
 |  | +    tunnel = "q796.kdltps.com:15818"
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    # 用户名密码方式
 | 
											
												
													
														|  | 
 |  | +    username = "t17772369458618"
 | 
											
												
													
														|  | 
 |  | +    password = "5zqcjkmy"
 | 
											
												
													
														|  | 
 |  | +    tunnel_proxies = {
 | 
											
												
													
														|  | 
 |  | +        "http": "http://%(user)s:%(pwd)s@%(proxy)s/"
 | 
											
												
													
														|  | 
 |  | +        % {"user": username, "pwd": password, "proxy": tunnel},
 | 
											
												
													
														|  | 
 |  | +        "https": "http://%(user)s:%(pwd)s@%(proxy)s/"
 | 
											
												
													
														|  | 
 |  | +        % {"user": username, "pwd": password, "proxy": tunnel},
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    return tunnel_proxies
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +class XiaoNianGaoAuthor:
 | 
											
												
													
														|  | 
 |  | +    def __init__(self, platform, mode, rule_dict, env, user_list):
 | 
											
												
													
														|  | 
 |  | +        self.platform = platform
 | 
											
												
													
														|  | 
 |  | +        self.mode = mode
 | 
											
												
													
														|  | 
 |  | +        self.rule_dict = rule_dict
 | 
											
												
													
														|  | 
 |  | +        self.env = env
 | 
											
												
													
														|  | 
 |  | +        self.user_list = user_list
 | 
											
												
													
														|  | 
 |  | +        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
 | 
											
												
													
														|  | 
 |  | +        self.download_count = 0
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def get_author_list(self):
 | 
											
												
													
														|  | 
 |  | +        # 每轮只抓取定量的数据,到达数量后自己退出
 | 
											
												
													
														|  | 
 |  | +        max_count = int(self.rule_dict.get("videos_cnt", {}).get("min", 200))
 | 
											
												
													
														|  | 
 |  | +        for user_dict in self.user_list:
 | 
											
												
													
														|  | 
 |  | +            if self.download_count <= max_count:
 | 
											
												
													
														|  | 
 |  | +                self.get_video_list(user_dict)
 | 
											
												
													
														|  | 
 |  | +                time.sleep(random.randint(1, 15))
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                    code="2000",
 | 
											
												
													
														|  | 
 |  | +                    platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                    mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                    env=self.env,
 | 
											
												
													
														|  | 
 |  | +                    message="本轮已经抓取足够数量的视频,已经自动退出",
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                return
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def get_video_list(self, user_dict):
 | 
											
												
													
														|  | 
 |  | +        next_t = -1
 | 
											
												
													
														|  | 
 |  | +        # 只抓取更新的视频,如果刷到已经更新的立即退出
 | 
											
												
													
														|  | 
 |  | +        url = "https://kapi-xng-app.xiaoniangao.cn/v1/album/user_public"
 | 
											
												
													
														|  | 
 |  | +        headers = {
 | 
											
												
													
														|  | 
 |  | +            "Host": "kapi-xng-app.xiaoniangao.cn",
 | 
											
												
													
														|  | 
 |  | +            "content-type": "application/json; charset=utf-8",
 | 
											
												
													
														|  | 
 |  | +            "accept": "*/*",
 | 
											
												
													
														|  | 
 |  | +            "verb": "POST",
 | 
											
												
													
														|  | 
 |  | +            "accept-language": "zh-cn",
 | 
											
												
													
														|  | 
 |  | +            "date": "Wed, 01 Nov 2023 11:53:22 GMT",
 | 
											
												
													
														|  | 
 |  | +            "x-token-id": "",
 | 
											
												
													
														|  | 
 |  | +            "x-signaturemethod": "hmac-sha1",
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        while True:
 | 
											
												
													
														|  | 
 |  | +            payload = {
 | 
											
												
													
														|  | 
 |  | +                "token": "",
 | 
											
												
													
														|  | 
 |  | +                "limit": 20,
 | 
											
												
													
														|  | 
 |  | +                "start_t": next_t,
 | 
											
												
													
														|  | 
 |  | +                "visited_mid": int(user_dict["link"]),
 | 
											
												
													
														|  | 
 |  | +                "share_width": 300,
 | 
											
												
													
														|  | 
 |  | +                "share_height": 240,
 | 
											
												
													
														|  | 
 |  | +            }
 | 
											
												
													
														|  | 
 |  | +            response = requests.request(
 | 
											
												
													
														|  | 
 |  | +                "POST",
 | 
											
												
													
														|  | 
 |  | +                url,
 | 
											
												
													
														|  | 
 |  | +                headers=headers,
 | 
											
												
													
														|  | 
 |  | +                data=json.dumps(payload),
 | 
											
												
													
														|  | 
 |  | +                proxies=tunnel_proxies(),
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +            if "data" not in response.text or response.status_code != 200:
 | 
											
												
													
														|  | 
 |  | +                Common.logger(self.mode, self.platform).info(
 | 
											
												
													
														|  | 
 |  | +                    f"get_videoList:{response.text}\n"
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                    code="2000",
 | 
											
												
													
														|  | 
 |  | +                    platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                    mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                    env=self.env,
 | 
											
												
													
														|  | 
 |  | +                    message=f"get_videoList:{response.text}\n",
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                return
 | 
											
												
													
														|  | 
 |  | +            elif "list" not in response.json()["data"]:
 | 
											
												
													
														|  | 
 |  | +                Common.logger(self.mode, self.platform).info(
 | 
											
												
													
														|  | 
 |  | +                    f"get_videoList:{response.json()}\n"
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                    code="2000",
 | 
											
												
													
														|  | 
 |  | +                    platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                    mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                    env=self.env,
 | 
											
												
													
														|  | 
 |  | +                    message=f"get_videoList:{response.text}\n",
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                return
 | 
											
												
													
														|  | 
 |  | +            elif len(response.json()["data"]["list"]) == 0:
 | 
											
												
													
														|  | 
 |  | +                Common.logger(self.mode, self.platform).info(f"没有更多数据啦~\n")
 | 
											
												
													
														|  | 
 |  | +                AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                    code="2000",
 | 
											
												
													
														|  | 
 |  | +                    platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                    mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                    env=self.env,
 | 
											
												
													
														|  | 
 |  | +                    message=f"没有更多数据啦~\n",
 | 
											
												
													
														|  | 
 |  | +                )
 | 
											
												
													
														|  | 
 |  | +                return
 | 
											
												
													
														|  | 
 |  | +            else:
 | 
											
												
													
														|  | 
 |  | +                next_t = response.json()["data"]["next_t"]
 | 
											
												
													
														|  | 
 |  | +                feeds = response.json()["data"]["list"]
 | 
											
												
													
														|  | 
 |  | +                for video_obj in feeds:
 | 
											
												
													
														|  | 
 |  | +                    try:
 | 
											
												
													
														|  | 
 |  | +                        AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                            code="1001",
 | 
											
												
													
														|  | 
 |  | +                            platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                            mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                            env=self.env,
 | 
											
												
													
														|  | 
 |  | +                            message="扫描到一条视频",
 | 
											
												
													
														|  | 
 |  | +                        )
 | 
											
												
													
														|  | 
 |  | +                        self.process_video_obj(video_obj, user_dict)
 | 
											
												
													
														|  | 
 |  | +                    except Exception as e:
 | 
											
												
													
														|  | 
 |  | +                        AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                            code="3000",
 | 
											
												
													
														|  | 
 |  | +                            platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                            mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                            env=self.env,
 | 
											
												
													
														|  | 
 |  | +                            data=video_obj,
 | 
											
												
													
														|  | 
 |  | +                            message="抓取单条视频异常, 报错原因是: {}".format(e),
 | 
											
												
													
														|  | 
 |  | +                        )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    def process_video_obj(self, video_obj, user_dict):
 | 
											
												
													
														|  | 
 |  | +        trace_id = self.platform + str(uuid.uuid1())
 | 
											
												
													
														|  | 
 |  | +        # 标题,表情随机加在片头、片尾,或替代句子中间的标点符号
 | 
											
												
													
														|  | 
 |  | +        xiaoniangao_title = clean_title(video_obj.get("title", ""))
 | 
											
												
													
														|  | 
 |  | +        # 随机取一个表情/符号
 | 
											
												
													
														|  | 
 |  | +        emoji = random.choice(
 | 
											
												
													
														|  | 
 |  | +            get_config_from_mysql(self.mode, self.platform, self.env, "emoji")
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        # 生成最终标题,标题list[表情+title, title+表情]随机取一个
 | 
											
												
													
														|  | 
 |  | +        video_title = random.choice(
 | 
											
												
													
														|  | 
 |  | +            [f"{emoji}{xiaoniangao_title}", f"{xiaoniangao_title}{emoji}"]
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        # 发布时间
 | 
											
												
													
														|  | 
 |  | +        publish_time_stamp = int(int(video_obj.get("t", 0)) / 1000)
 | 
											
												
													
														|  | 
 |  | +        publish_time_str = time.strftime(
 | 
											
												
													
														|  | 
 |  | +            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        # 用户名 / 头像
 | 
											
												
													
														|  | 
 |  | +        user_name = (
 | 
											
												
													
														|  | 
 |  | +            video_obj.get("user", {})
 | 
											
												
													
														|  | 
 |  | +            .get("nick", "")
 | 
											
												
													
														|  | 
 |  | +            .strip()
 | 
											
												
													
														|  | 
 |  | +            .replace("\n", "")
 | 
											
												
													
														|  | 
 |  | +            .replace("/", "")
 | 
											
												
													
														|  | 
 |  | +            .replace(" ", "")
 | 
											
												
													
														|  | 
 |  | +            .replace(" ", "")
 | 
											
												
													
														|  | 
 |  | +            .replace("&NBSP", "")
 | 
											
												
													
														|  | 
 |  | +            .replace("\r", "")
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        video_dict = {
 | 
											
												
													
														|  | 
 |  | +            "video_title": video_title,
 | 
											
												
													
														|  | 
 |  | +            "video_id": video_obj.get("vid", ""),
 | 
											
												
													
														|  | 
 |  | +            "duration": int(video_obj.get("du", 0) / 1000),
 | 
											
												
													
														|  | 
 |  | +            "play_cnt": video_obj.get("play_pv", 0),
 | 
											
												
													
														|  | 
 |  | +            "like_cnt": video_obj.get("favor", {}).get("total", 0),
 | 
											
												
													
														|  | 
 |  | +            "comment_cnt": video_obj.get("comment_count", 0),
 | 
											
												
													
														|  | 
 |  | +            "share_cnt": video_obj.get("share", 0),
 | 
											
												
													
														|  | 
 |  | +            "user_name": user_name,
 | 
											
												
													
														|  | 
 |  | +            "publish_time_stamp": publish_time_stamp,
 | 
											
												
													
														|  | 
 |  | +            "publish_time_str": publish_time_str,
 | 
											
												
													
														|  | 
 |  | +            "update_time_stamp": int(time.time()),
 | 
											
												
													
														|  | 
 |  | +            "video_width": int(video_obj.get("w", 0)),
 | 
											
												
													
														|  | 
 |  | +            "video_height": int(video_obj.get("h", 0)),
 | 
											
												
													
														|  | 
 |  | +            "avatar_url": video_obj.get("user", {}).get("hurl", ""),
 | 
											
												
													
														|  | 
 |  | +            "profile_id": video_obj["id"],
 | 
											
												
													
														|  | 
 |  | +            "profile_mid": video_obj.get("user", {}).get("mid", ""),
 | 
											
												
													
														|  | 
 |  | +            "cover_url": video_obj.get("url", ""),
 | 
											
												
													
														|  | 
 |  | +            "video_url": video_obj.get("v_url", ""),
 | 
											
												
													
														|  | 
 |  | +            "session": f"xiaoniangao-author-{int(time.time())}",
 | 
											
												
													
														|  | 
 |  | +            "out_user_id": video_obj["id"],
 | 
											
												
													
														|  | 
 |  | +            "platform": self.platform,
 | 
											
												
													
														|  | 
 |  | +            "strategy": self.mode,
 | 
											
												
													
														|  | 
 |  | +            "out_video_id": video_obj.get("vid", ""),
 | 
											
												
													
														|  | 
 |  | +        }
 | 
											
												
													
														|  | 
 |  | +        for k, v in video_dict.items():
 | 
											
												
													
														|  | 
 |  | +            Common.logger(self.mode, self.platform).info(f"{k}:{v}")
 | 
											
												
													
														|  | 
 |  | +        pipeline = PiaoQuanPipeline(
 | 
											
												
													
														|  | 
 |  | +            platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +            mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +            rule_dict=self.rule_dict,
 | 
											
												
													
														|  | 
 |  | +            env=self.env,
 | 
											
												
													
														|  | 
 |  | +            item=video_dict,
 | 
											
												
													
														|  | 
 |  | +            trace_id=trace_id,
 | 
											
												
													
														|  | 
 |  | +        )
 | 
											
												
													
														|  | 
 |  | +        flag = pipeline.process_item()
 | 
											
												
													
														|  | 
 |  | +        if flag:
 | 
											
												
													
														|  | 
 |  | +            video_dict["width"] = video_dict["video_width"]
 | 
											
												
													
														|  | 
 |  | +            video_dict["height"] = video_dict["video_height"]
 | 
											
												
													
														|  | 
 |  | +            video_dict["crawler_rule"] = json.dumps(self.rule_dict)
 | 
											
												
													
														|  | 
 |  | +            video_dict["user_id"] = user_dict["uid"]
 | 
											
												
													
														|  | 
 |  | +            video_dict["publish_time"] = video_dict["publish_time_str"]
 | 
											
												
													
														|  | 
 |  | +            # print(video_dict)
 | 
											
												
													
														|  | 
 |  | +            self.mq.send_msg(video_dict)
 | 
											
												
													
														|  | 
 |  | +            AliyunLogger.logging(
 | 
											
												
													
														|  | 
 |  | +                code="1002",
 | 
											
												
													
														|  | 
 |  | +                platform=self.platform,
 | 
											
												
													
														|  | 
 |  | +                mode=self.mode,
 | 
											
												
													
														|  | 
 |  | +                env=self.env,
 | 
											
												
													
														|  | 
 |  | +                data=video_dict,
 | 
											
												
													
														|  | 
 |  | +                trace_id=trace_id,
 | 
											
												
													
														|  | 
 |  | +                message="成功发送 MQ 至 ETL",
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +if __name__ == "__main__":
 | 
											
												
													
														|  | 
 |  | +    XNGA = XiaoNianGaoAuthor(
 | 
											
												
													
														|  | 
 |  | +        platform="xiaoniangao",
 | 
											
												
													
														|  | 
 |  | +        mode="author",
 | 
											
												
													
														|  | 
 |  | +        rule_dict={},
 | 
											
												
													
														|  | 
 |  | +        env="prod",
 | 
											
												
													
														|  | 
 |  | +        user_list=[{"link": 295640510, "uid": "12334"}],
 | 
											
												
													
														|  | 
 |  | +    )
 | 
											
												
													
														|  | 
 |  | +    XNGA.get_author_list()
 |