|
@@ -0,0 +1,298 @@
|
|
|
+import os
|
|
|
+import json
|
|
|
+import sys
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+import uuid
|
|
|
+
|
|
|
+import requests
|
|
|
+
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+from common import PiaoQuanPipeline, AliyunLogger
|
|
|
+from common.feishu import Feishu
|
|
|
+from common.db import MysqlHelper
|
|
|
+from common.mq import MQ
|
|
|
+
|
|
|
+
|
|
|
+def find_target_user(name, user_list):
|
|
|
+ for obj in user_list:
|
|
|
+ if obj["nickname"] == name:
|
|
|
+ return obj
|
|
|
+ else:
|
|
|
+ continue
|
|
|
+ return False
|
|
|
+
|
|
|
+
|
|
|
+class ShiPinHaoAccount:
|
|
|
+ def __init__(self, platform, mode, rule_dict, user_dict, env):
|
|
|
+ # self.token = token
|
|
|
+ # self.cookie = cookie
|
|
|
+ self.account_name = user_dict["link"]
|
|
|
+ self.platform = platform
|
|
|
+ self.mode = mode
|
|
|
+ self.rule_dict = rule_dict
|
|
|
+ self.user_dict = user_dict
|
|
|
+ self.env = env
|
|
|
+ self.download_cnt = 0
|
|
|
+ self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
|
|
|
+
|
|
|
+ def get_token_from_mysql(self):
|
|
|
+ select_sql = (
|
|
|
+ f"""SELECT config from crawler_config where source = '{self.platform}'; """
|
|
|
+ )
|
|
|
+ # print(select_sql)
|
|
|
+ configs = MysqlHelper.get_values(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ sql=select_sql,
|
|
|
+ env=self.env,
|
|
|
+ machine="",
|
|
|
+ )
|
|
|
+ token_config = configs[0][0]
|
|
|
+ token_info = json.loads(token_config)
|
|
|
+ self.token = token_info["token"]
|
|
|
+ self.cookie = token_info["cookie"]
|
|
|
+
|
|
|
+ def get_history_id(self):
|
|
|
+ """
|
|
|
+ 从数据库表中读取 id
|
|
|
+ """
|
|
|
+ select_user_sql = f"""select name_id from accounts where name = "{self.account_name}" and platform = "{self.platform}" and useful = 1 limit 1"""
|
|
|
+ name_id = MysqlHelper.get_values(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ sql=select_user_sql,
|
|
|
+ env=self.env,
|
|
|
+ machine="",
|
|
|
+ )
|
|
|
+ if name_id:
|
|
|
+ return name_id[0]
|
|
|
+ else:
|
|
|
+ return False
|
|
|
+
|
|
|
+ def get_account_id(self):
|
|
|
+ # 读历史数据,如果存在 id,则直接返回 id
|
|
|
+ history_id = self.get_history_id()
|
|
|
+ if history_id:
|
|
|
+ return history_id
|
|
|
+ else:
|
|
|
+ url = "https://mp.weixin.qq.com/cgi-bin/videosnap"
|
|
|
+ params = {
|
|
|
+ "action": "search",
|
|
|
+ "scene": "1",
|
|
|
+ "buffer": "",
|
|
|
+ "query": self.account_name,
|
|
|
+ "count": "21",
|
|
|
+ "token": self.token,
|
|
|
+ "lang": "zh_CN",
|
|
|
+ "f": "json",
|
|
|
+ "ajax": "1",
|
|
|
+ }
|
|
|
+ headers = {
|
|
|
+ "authority": "mp.weixin.qq.com",
|
|
|
+ "accept": "*/*",
|
|
|
+ "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
|
|
|
+ "cookie": self.cookie,
|
|
|
+ "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
|
|
|
+ self.token
|
|
|
+ ),
|
|
|
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
|
|
+ "x-requested-with": "XMLHttpRequest",
|
|
|
+ }
|
|
|
+ response = requests.request("GET", url, headers=headers, params=params)
|
|
|
+ user_list = response.json()["acct_list"]
|
|
|
+ target_user = find_target_user(name=self.account_name, user_list=user_list)
|
|
|
+ # 写入 MySql 数据库
|
|
|
+ if target_user:
|
|
|
+ update_sql = f"""INSERT INTO accounts (name, name_id, platform) values ("{self.account_name}", "{target_user['username']}", "{self.platform}")"""
|
|
|
+ # print(update_sql)
|
|
|
+ MysqlHelper.update_values(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ sql=update_sql,
|
|
|
+ env=self.env,
|
|
|
+ machine="",
|
|
|
+ )
|
|
|
+ return target_user["username"]
|
|
|
+ else:
|
|
|
+ return False
|
|
|
+
|
|
|
+ def get_account_videos(self):
|
|
|
+ # 一个账号最多抓取 30 条数据
|
|
|
+ self.get_token_from_mysql()
|
|
|
+ user_id = self.get_account_id()
|
|
|
+ if user_id:
|
|
|
+ url = "https://mp.weixin.qq.com/cgi-bin/videosnap"
|
|
|
+ headers = {
|
|
|
+ "authority": "mp.weixin.qq.com",
|
|
|
+ "accept": "*/*",
|
|
|
+ "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
|
|
|
+ "cookie": self.cookie,
|
|
|
+ "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
|
|
|
+ self.token
|
|
|
+ ),
|
|
|
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
|
|
+ "x-requested-with": "XMLHttpRequest",
|
|
|
+ }
|
|
|
+ buffer = "" # 翻页指示器
|
|
|
+ while True:
|
|
|
+ if self.download_cnt >= int(
|
|
|
+ self.rule_dict.get("videos_cnt", {}).get("min", 30)
|
|
|
+ ):
|
|
|
+ return
|
|
|
+ params = {
|
|
|
+ "action": "get_feed_list",
|
|
|
+ "username": user_id,
|
|
|
+ "buffer": buffer,
|
|
|
+ "count": "15",
|
|
|
+ "scene": "1",
|
|
|
+ "token": self.token,
|
|
|
+ "lang": "zh_CN",
|
|
|
+ "f": "json",
|
|
|
+ "ajax": "1",
|
|
|
+ }
|
|
|
+ response = requests.request("GET", url, headers=headers, params=params)
|
|
|
+ res_json = response.json()
|
|
|
+ # 开始判断视频是否有信息,是否频控
|
|
|
+ if res_json["base_resp"]["err_msg"] == "invalid session":
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"status_code:{response.status_code}, get_videoList:{response.text}\n",
|
|
|
+ )
|
|
|
+ if 20 >= datetime.datetime.now().hour >= 10:
|
|
|
+ Feishu.bot(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ text="视频号Token 过期啦"
|
|
|
+ # text=f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/"
|
|
|
+ )
|
|
|
+ time.sleep(60 * 15)
|
|
|
+ continue
|
|
|
+ if res_json["base_resp"]["err_msg"] == "freq control":
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"status_code:{response.status_code}, get_videoList:{response.text}\n",
|
|
|
+ )
|
|
|
+ if 20 >= datetime.datetime.now().hour >= 10:
|
|
|
+ Feishu.bot(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ text="视频号Token 过期啦"
|
|
|
+ # text=f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/"
|
|
|
+ )
|
|
|
+ time.sleep(60 * 15)
|
|
|
+ continue
|
|
|
+ if not res_json.get("list"):
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message="没有更多视频了",
|
|
|
+ )
|
|
|
+ return
|
|
|
+ else:
|
|
|
+ buffer = res_json["last_buff"]
|
|
|
+ for obj in res_json["list"]:
|
|
|
+ try:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="1001",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ message="扫描到一条视频",
|
|
|
+ env=self.env,
|
|
|
+ data=obj,
|
|
|
+ )
|
|
|
+ repeat_flag = self.process_video_obj(obj)
|
|
|
+ if not repeat_flag:
|
|
|
+ return
|
|
|
+ except Exception as e:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="3000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"抓取单条视频异常:{e}\n",
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="3000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message="{}\t获取 id 失败".format(self.account_name),
|
|
|
+ )
|
|
|
+
|
|
|
+ def process_video_obj(self, video_obj):
|
|
|
+ trace_id = self.platform + str(uuid.uuid1())
|
|
|
+ video_dict = {
|
|
|
+ "video_id": video_obj["nonce_id"],
|
|
|
+ "video_title": video_obj["desc"],
|
|
|
+ "out_video_id": video_obj["nonce_id"],
|
|
|
+ "publish_time_stamp": int(time.time()),
|
|
|
+ "publish_time_str": time.strftime(
|
|
|
+ "%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))
|
|
|
+ ),
|
|
|
+ "play_cnt": 0,
|
|
|
+ "comment_cnt": 0,
|
|
|
+ "like_cnt": 0,
|
|
|
+ "share_cnt": 0,
|
|
|
+ "user_id": self.user_dict["uid"],
|
|
|
+ "cover_url": video_obj["media"][0]["cover_url"],
|
|
|
+ "video_url": video_obj["media"][0]["video_url"],
|
|
|
+ "avatar_url": video_obj["head_url"],
|
|
|
+ "width": video_obj["media"][0]["width"],
|
|
|
+ "height": video_obj["media"][0]["height"],
|
|
|
+ "duration": video_obj["media"][0]["video_play_len_s"],
|
|
|
+ "platform": self.platform,
|
|
|
+ "strategy": self.mode,
|
|
|
+ "crawler_rule": self.rule_dict,
|
|
|
+ "session": f"shipinhao-author-{int(time.time())}",
|
|
|
+ }
|
|
|
+ # video_dict["out_user_id"] = video_dict["user_id"]
|
|
|
+ # 无更新时间,去重即可
|
|
|
+ pipeline = PiaoQuanPipeline(
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ item=video_dict,
|
|
|
+ rule_dict=self.rule_dict,
|
|
|
+ env=self.env,
|
|
|
+ trace_id=trace_id,
|
|
|
+ )
|
|
|
+ if not pipeline.repeat_video():
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ video_dict["publish_time"] = video_dict["publish_time_str"]
|
|
|
+ self.mq.send_msg(video_dict)
|
|
|
+ self.download_cnt += 1
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="1002",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ data=video_dict,
|
|
|
+ trace_id=trace_id,
|
|
|
+ message="成功发送 MQ 至 ETL",
|
|
|
+ )
|
|
|
+ return True
|
|
|
+
|
|
|
+
|
|
|
+# if __name__ == "__main__":
|
|
|
+# temp_token = "2080949641"
|
|
|
+# temp_cookie = "ua_id=bw4VuFJr6fAuSkwdAAAAAClaW0m9Aua-6IfHaXU_zpo=; wxuin=95302180931488; mm_lang=zh_CN; RK=kreEMgtMMJ; ptcz=8fd1b267c98a1185bbe6455a081f1264048ee388363ca305d9ef4812892c7900; qq_domain_video_guid_verify=2ba78a5010233582; poc_sid=HOinP2Wj322Ex737kV651Zqy6y8fSprOUUvaegBg; _qimei_q36=; _qimei_h38=9eea33ea92afe8a922333fce03000001317916; pgv_pvid=9056371236; _clck=3930572231|1|fgk|0; uuid=6562bbd8859230ce4120dfa063c76997; rand_info=CAESIGAatjSIjvxVJVDxRDN7F/CNFWMifvAVqje98rd++8UY; slave_bizuin=3236647229; data_bizuin=3236647229; bizuin=3236647229; data_ticket=qm3i6jRhObs1yKHttGh0gVI02Mz7FTPfatn0RMLdaWyD7Ukcokm5Dc3mmYLQUZPg; slave_sid=UWxjZnhBREZRRTNKZ3dYZTlYRE9Db2lxQUhOM3lZUlRoMkV0MG1wdVVudGpQTWxnVkxzYW5pV2c3NjB3bnAyQ2lPaXBBVVRPazEybWtKSVEzTnUyazZ6WEJsdnFaWWVDaUFrM3pTTXRkeUNJS3RNVTc2NFRBWkZiVGQzYllacEFRalBBZ2tXZlltblJYS2VS; slave_user=gh_d284c09295eb; xid=cb96e6ba4b4960d74a22869b1bb21406; _clsk=z77guf|1699532621466|4|1|mp.weixin.qq.com/weheat-agent/payload/record"
|
|
|
+# SP = ShiPinHaoAccount(
|
|
|
+# token=temp_token,
|
|
|
+# cookie=temp_cookie,
|
|
|
+# account_name="心煤",
|
|
|
+# platform="shipinhao",
|
|
|
+# mode="author",
|
|
|
+# rule_dict={},
|
|
|
+# env="prod",
|
|
|
+# )
|
|
|
+# SP.get_account_videos()
|