|
@@ -0,0 +1,194 @@
|
|
|
+import json
|
|
|
+import os
|
|
|
+import random
|
|
|
+import sys
|
|
|
+import time
|
|
|
+import uuid
|
|
|
+import requests
|
|
|
+
|
|
|
# Make the current working directory importable BEFORE any project-local
# import runs. When this file is executed directly as a script, sys.path[0]
# is the script's own directory, so the ``common`` package only resolves if
# the working directory is on the path first. Previously the path was
# extended after ``common.mq`` had already been imported, which was too late
# for that first import.
sys.path.append(os.getcwd())

from common.mq import MQ  # noqa: E402
from common.pipeline import PiaoQuanPipelineTest  # noqa: E402
from common.public import get_config_from_mysql, clean_title  # noqa: E402
|
|
|
+
|
|
|
+
|
|
|
def tunnel_proxies(
    tunnel="q796.kdltps.com:15818",
    username="t17772369458618",
    password="5zqcjkmy",
):
    """Build the ``proxies`` mapping that routes ``requests`` through the tunnel proxy.

    Args:
        tunnel: Proxy endpoint as ``host:port``.
        username: Proxy account name (username/password authentication).
        password: Proxy account password.

    Returns:
        A dict with ``"http"`` and ``"https"`` keys, both pointing at the same
        ``http://user:password@host:port/`` proxy URL.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    from urllib.parse import quote

    # NOTE(review): the default credentials are committed in source. They are
    # kept as defaults so existing zero-argument callers keep working, but
    # they should be moved to configuration / environment and rotated.

    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot corrupt the URL structure. For the default values this is a
    # no-op, so the returned mapping is unchanged.
    safe_user = quote(username, safe="")
    safe_password = quote(password, safe="")
    proxy_url = f"http://{safe_user}:{safe_password}@{tunnel}/"

    # HTTPS traffic is tunnelled through the same HTTP proxy endpoint.
    return {"http": proxy_url, "https": proxy_url}
|
|
|
+
|
|
|
+
|
|
|
class XiaoNianGaoAuthor:
    """Crawl the public video feed of XiaoNianGao authors and forward items to ETL.

    For every author in ``user_list`` the crawler pages through the
    ``album/user_public`` API, converts each returned video object into a flat
    dict, runs it through ``PiaoQuanPipelineTest`` and, when the pipeline
    accepts it, publishes the dict to the ``topic_crawler_etl_<env>`` MQ topic.

    Attributes are plain copies of the constructor arguments plus:
        mq: MQ producer bound to ``topic_crawler_etl_<env>``.
        download_count: number of videos sent to MQ so far in this run.
    """

    # Seconds to wait for the listing API; without a timeout a stalled proxy
    # connection would block the crawl forever.
    request_timeout = 30

    def __init__(self, platform, mode, rule_dict, env, user_list):
        """Store the crawl configuration and create the MQ producer.

        Args:
            platform: Platform identifier; also used as the trace-id prefix.
            mode: Crawl strategy name, stored in each item as ``strategy``.
            rule_dict: Crawl rules; ``videos_cnt.min`` is read as the per-run quota.
            env: Environment suffix used to pick the MQ topic.
            user_list: Iterable of dicts; ``link`` (author mid) and ``uid`` are read.
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.env = env
        self.user_list = user_list
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.download_count = 0

    def _max_count(self):
        """Return the per-run video quota (``rule_dict['videos_cnt']['min']``, default 300)."""
        # NOTE(review): the quota is read from the "min" key of "videos_cnt";
        # confirm this is the intended rule field.
        return int(self.rule_dict.get("videos_cnt", {}).get("min", 300))

    def get_author_list(self):
        """Crawl each configured author in turn until the per-run quota is reached.

        Sleeps a random 1-15 seconds after each author. Returns ``None``.
        """
        # Only a fixed number of videos is crawled per round; stop once reached.
        max_count = self._max_count()
        for user_dict in self.user_list:
            # ">=" (not ">"): once exactly max_count videos were sent the
            # quota is met, so no further author must be crawled.
            if self.download_count >= max_count:
                print("本轮已经抓取足够数量的视频,已经自动退出")
                return
            self.get_video_list(user_dict)
            time.sleep(random.randint(1, 15))

    def get_video_list(self, user_dict):
        """Page through one author's public videos and process every item.

        Pagination starts at cursor ``-1`` and follows ``data.next_t``. It stops
        when the request fails, the response is not a usable payload, the page
        is empty, no further cursor is returned, or the per-run quota is met.

        Args:
            user_dict: Author record; ``link`` must be convertible to ``int``
                (sent as ``visited_mid``) and ``uid`` is used downstream.

        Returns:
            None.
        """
        # NOTE(review): the original comment said crawling should stop as soon
        # as an already-seen video is reached; no such check exists in this
        # method. Presumably deduplication happens in the pipeline - confirm.
        url = "https://kapi-xng-app.xiaoniangao.cn/v1/album/user_public"
        headers = {
            "Host": "kapi-xng-app.xiaoniangao.cn",
            "content-type": "application/json; charset=utf-8",
            "accept": "*/*",
            "verb": "POST",
            "accept-language": "zh-cn",
            "date": "Wed, 01 Nov 2023 11:53:22 GMT",
            "x-token-id": "",
            "x-signaturemethod": "hmac-sha1",
        }
        # Loop invariants: the author id and the quota do not change per page.
        visited_mid = int(user_dict["link"])
        max_count = self._max_count()
        next_t = -1
        while True:
            payload = {
                "token": "",
                "limit": 20,
                "start_t": next_t,
                "visited_mid": visited_mid,
                "share_width": 300,
                "share_height": 240,
            }
            try:
                response = requests.request(
                    "POST",
                    url,
                    headers=headers,
                    data=json.dumps(payload),
                    proxies=tunnel_proxies(),
                    timeout=self.request_timeout,
                )
            except requests.RequestException as e:
                # A network/proxy failure ends this author's crawl instead of
                # aborting the whole run.
                print(f"get_videoList:{e}")
                return

            if response.status_code != 200 or "data" not in response.text:
                print(f"get_videoList:{response.text}")
                return

            # Parse the body once; a non-JSON body is reported like any other
            # unusable response.
            try:
                body = response.json()
            except ValueError:
                print(f"get_videoList:{response.text}")
                return

            data = body.get("data") if isinstance(body, dict) else None
            if not isinstance(data, dict) or "list" not in data:
                print(f"get_videoList:{body}")
                return

            feeds = data["list"]
            if not feeds:
                print("没有更多数据啦~")
                return

            for video_obj in feeds:
                # One malformed video must not stop the rest of the page.
                try:
                    print("扫描到一条视频")
                    self.process_video_obj(video_obj, user_dict)
                except Exception as e:
                    print("抓取单条视频异常, 报错原因是: {}".format(e))

            # Honour the per-run quota between pages as well, so a prolific
            # author cannot push the run far past the limit.
            if self.download_count >= max_count:
                return

            next_t = data.get("next_t")
            if next_t is None:
                # No cursor for a following page: nothing more to request.
                return

    def process_video_obj(self, video_obj, user_dict):
        """Convert one API video object into an item and send it to MQ if accepted.

        Args:
            video_obj: One element of the API's ``data.list``; ``id`` is required,
                all other fields fall back to defaults.
            user_dict: Author record; ``uid`` is copied into the MQ message.

        Returns:
            None. Increments ``download_count`` for every message sent.

        Raises:
            KeyError: if ``video_obj`` has no ``id`` or ``user_dict`` has no ``uid``.
        """
        trace_id = self.platform + str(uuid.uuid1())

        # Title: a random emoji/symbol is attached at the head or the tail.
        xiaoniangao_title = clean_title(video_obj.get("title", ""))
        emoji_candidates = get_config_from_mysql(
            self.mode, self.platform, self.env, "emoji"
        )
        # An empty emoji configuration must not drop every video
        # (random.choice raises on an empty sequence); use the bare title.
        emoji = random.choice(emoji_candidates) if emoji_candidates else ""
        video_title = random.choice(
            [f"{emoji}{xiaoniangao_title}", f"{xiaoniangao_title}{emoji}"]
        )

        # Publish time: the API value is divided by 1000 to get seconds.
        publish_time_stamp = int(int(video_obj.get("t", 0)) / 1000)
        publish_time_str = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
        )

        # Author nickname with whitespace, slashes and "&NBSP" removed.
        raw_nick = video_obj.get("user", {}).get("nick", "")
        user_name = (
            raw_nick.strip()
            .replace("\n", "")
            .replace("/", "")
            .replace(" ", "")
            .replace(" ", "")
            .replace("&NBSP", "")
            .replace("\r", "")
        )

        video_dict = {
            "video_title": video_title,
            "video_id": video_obj.get("vid", ""),
            # Cast first, like the publish timestamp, so a numeric string
            # does not raise on the division.
            "duration": int(int(video_obj.get("du", 0)) / 1000),
            "play_cnt": video_obj.get("play_pv", 0),
            "like_cnt": video_obj.get("favor", {}).get("total", 0),
            "comment_cnt": video_obj.get("comment_count", 0),
            "share_cnt": video_obj.get("share", 0),
            "user_name": user_name,
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": publish_time_str,
            "update_time_stamp": int(time.time()),
            "video_width": int(video_obj.get("w", 0)),
            "video_height": int(video_obj.get("h", 0)),
            "avatar_url": video_obj.get("user", {}).get("hurl", ""),
            "profile_id": video_obj["id"],
            "profile_mid": video_obj.get("user", {}).get("mid", ""),
            "cover_url": video_obj.get("url", ""),
            "video_url": video_obj.get("v_url", ""),
            "session": f"xiaoniangao-author-{int(time.time())}",
            "out_user_id": video_obj["id"],
            "platform": self.platform,
            "strategy": self.mode,
            "out_video_id": video_obj.get("vid", ""),
        }
        print(video_dict)

        pipeline = PiaoQuanPipelineTest(
            platform=self.platform,
            mode=self.mode,
            rule_dict=self.rule_dict,
            env=self.env,
            item=video_dict,
            trace_id=trace_id,
        )
        # Only items the pipeline accepts (truthy result) are published.
        if pipeline.process_item():
            # Extra fields added to the message before it is sent to MQ.
            video_dict["width"] = video_dict["video_width"]
            video_dict["height"] = video_dict["video_height"]
            video_dict["crawler_rule"] = json.dumps(self.rule_dict)
            video_dict["user_id"] = user_dict["uid"]
            video_dict["publish_time"] = video_dict["publish_time_str"]
            self.mq.send_msg(video_dict)
            self.download_count += 1
            print("成功发送 MQ 至 ETL")
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Manual run: crawl a single hard-coded author with an empty rule set
    # (so the default quota applies) against the "prod" environment.
    XNGA = XiaoNianGaoAuthor(
        "xiaoniangao",
        "author",
        {},
        "prod",
        [{"link": 295640510, "uid": "12334"}],
    )
    XNGA.get_author_list()
|