|
@@ -1,11 +1,21 @@
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+import uuid
|
|
|
+
|
|
|
import requests
|
|
|
-from common.aliyun_log import AliyunLogger
|
|
|
+
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+from common import PiaoQuanPipeline, AliyunLogger
|
|
|
+from common.feishu import Feishu
|
|
|
from common.db import MysqlHelper
|
|
|
+from common.mq import MQ
|
|
|
|
|
|
|
|
|
def find_target_user(name, user_list):
|
|
|
for obj in user_list:
|
|
|
- if obj['nickname'] == name:
|
|
|
+ if obj["nickname"] == name:
|
|
|
return obj
|
|
|
else:
|
|
|
continue
|
|
@@ -13,21 +23,33 @@ def find_target_user(name, user_list):
|
|
|
|
|
|
|
|
|
class ShiPinHaoAccount:
|
|
|
- def __init__(self, token, cookie, account_name, platform, mode, rule_dict, env):
|
|
|
- self.token = token
|
|
|
- self.cookie = cookie
|
|
|
- self.account_name = account_name
|
|
|
+ def __init__(self, platform, mode, rule_dict, user_dict, env):
|
|
|
+ # self.token = token
|
|
|
+ # self.cookie = cookie
|
|
|
+ self.account_name = user_dict["name"]
|
|
|
self.platform = platform
|
|
|
self.mode = mode
|
|
|
self.rule_dict = rule_dict
|
|
|
+ self.user_dict = user_dict
|
|
|
self.env = env
|
|
|
+ self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
|
|
|
+
|
|
|
+ def get_token_from_mysql(self):
|
|
|
+ self.token = ""
|
|
|
+ self.cookie = ""
|
|
|
|
|
|
def get_history_id(self):
|
|
|
"""
|
|
|
从数据库表中读取 id
|
|
|
"""
|
|
|
select_user_sql = f"""select name_id from accounts where name = "{self.account_name}" and platform = "{self.platform}" and useful = 1 limit 1"""
|
|
|
- name_id = MysqlHelper.get_values(log_type=self.mode, crawler=self.platform, sql=select_user_sql, env=self.env, machine="")
|
|
|
+ name_id = MysqlHelper.get_values(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ sql=select_user_sql,
|
|
|
+ env=self.env,
|
|
|
+ machine="",
|
|
|
+ )
|
|
|
if name_id:
|
|
|
return name_id[0]
|
|
|
else:
|
|
@@ -49,45 +71,53 @@ class ShiPinHaoAccount:
|
|
|
"token": self.token,
|
|
|
"lang": "zh_CN",
|
|
|
"f": "json",
|
|
|
- "ajax": "1"
|
|
|
+ "ajax": "1",
|
|
|
}
|
|
|
headers = {
|
|
|
- 'authority': 'mp.weixin.qq.com',
|
|
|
- 'accept': '*/*',
|
|
|
- 'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
|
|
|
- 'cookie': self.cookie,
|
|
|
- 'referer': 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN'.format(
|
|
|
- self.token),
|
|
|
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
|
|
- 'x-requested-with': 'XMLHttpRequest'
|
|
|
+ "authority": "mp.weixin.qq.com",
|
|
|
+ "accept": "*/*",
|
|
|
+ "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
|
|
|
+ "cookie": self.cookie,
|
|
|
+ "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
|
|
|
+ self.token
|
|
|
+ ),
|
|
|
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
|
|
+ "x-requested-with": "XMLHttpRequest",
|
|
|
}
|
|
|
response = requests.request("GET", url, headers=headers, params=params)
|
|
|
- user_list = response.json()['acct_list']
|
|
|
+ user_list = response.json()["acct_list"]
|
|
|
target_user = find_target_user(name=self.account_name, user_list=user_list)
|
|
|
# 写入 MySql 数据库
|
|
|
if target_user:
|
|
|
update_sql = f"""INSERT INTO accounts (name, name_id, platform) values ("{self.account_name}", "{target_user['username']}", "{self.platform}")"""
|
|
|
print(update_sql)
|
|
|
- MysqlHelper.update_values(log_type=self.mode, crawler=self.platform, sql=update_sql, env=self.env, machine="")
|
|
|
- return target_user['username']
|
|
|
+ MysqlHelper.update_values(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ sql=update_sql,
|
|
|
+ env=self.env,
|
|
|
+ machine="",
|
|
|
+ )
|
|
|
+ return target_user["username"]
|
|
|
else:
|
|
|
return False
|
|
|
|
|
|
def get_account_videos(self):
|
|
|
user_id = self.get_account_id()
|
|
|
- buffer = ""
|
|
|
if user_id:
|
|
|
url = "https://mp.weixin.qq.com/cgi-bin/videosnap"
|
|
|
headers = {
|
|
|
- 'authority': 'mp.weixin.qq.com',
|
|
|
- 'accept': '*/*',
|
|
|
- 'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
|
|
|
- 'cookie': self.cookie,
|
|
|
- 'referer': 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN'.format(
|
|
|
- self.token),
|
|
|
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
|
|
- 'x-requested-with': 'XMLHttpRequest'
|
|
|
+ "authority": "mp.weixin.qq.com",
|
|
|
+ "accept": "*/*",
|
|
|
+ "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
|
|
|
+ "cookie": self.cookie,
|
|
|
+ "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
|
|
|
+ self.token
|
|
|
+ ),
|
|
|
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
|
|
+ "x-requested-with": "XMLHttpRequest",
|
|
|
}
|
|
|
+ buffer = "" # 翻页指示器
|
|
|
while True:
|
|
|
params = {
|
|
|
"action": "get_feed_list",
|
|
@@ -98,29 +128,137 @@ class ShiPinHaoAccount:
|
|
|
"token": self.token,
|
|
|
"lang": "zh_CN",
|
|
|
"f": "json",
|
|
|
- "ajax": "1"
|
|
|
+ "ajax": "1",
|
|
|
}
|
|
|
response = requests.request("GET", url, headers=headers, params=params)
|
|
|
- video_list = response.json()
|
|
|
- buffer = video_list['last_buff']
|
|
|
- # print(json.dumps(video_list, ensure_ascii=False, indent=4))
|
|
|
- # print(len(video_list['list']))
|
|
|
- for obj in video_list['list']:
|
|
|
- print(obj['desc'])
|
|
|
+ res_json = response.json()
|
|
|
+ # 开始判断视频是否有信息,是否频控
|
|
|
+ if res_json["base_resp"]["err_msg"] == "invalid session":
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"status_code:{response.status_code}, get_videoList:{response.text}\n",
|
|
|
+ )
|
|
|
+ if 20 >= datetime.datetime.now().hour >= 10:
|
|
|
+ Feishu.bot(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ text="视频号Token 过期啦"
|
|
|
+ # text=f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/"
|
|
|
+ )
|
|
|
+ time.sleep(60 * 15)
|
|
|
+ continue
|
|
|
+ if res_json["base_resp"]["err_msg"] == "freq control":
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"status_code:{response.status_code}, get_videoList:{response.text}\n",
|
|
|
+ )
|
|
|
+ if 20 >= datetime.datetime.now().hour >= 10:
|
|
|
+ Feishu.bot(
|
|
|
+ log_type=self.mode,
|
|
|
+ crawler=self.platform,
|
|
|
+ text="视频号Token 过期啦"
|
|
|
+ # text=f"{token_dict['title']}\n操作人:{token_dict['operator']}\n更换日期:{token_dict['update_time']} \n频控啦,请扫码更换其他公众号token\nhttps://mp.weixin.qq.com/"
|
|
|
+ )
|
|
|
+ time.sleep(60 * 15)
|
|
|
+ continue
|
|
|
+ if not res_json["list"]:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="2000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message="没有更多视频了",
|
|
|
+ )
|
|
|
+ return
|
|
|
+ else:
|
|
|
+ buffer = res_json["last_buff"]
|
|
|
+ for obj in res_json["list"]:
|
|
|
+ try:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="1001",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ message="扫描到一条视频",
|
|
|
+ env=self.env,
|
|
|
+ data=obj,
|
|
|
+ )
|
|
|
+ repeat_flag = self.process_video_obj(obj)
|
|
|
+ if not repeat_flag:
|
|
|
+ return
|
|
|
+ except Exception as e:
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="3000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message=f"抓取单条视频异常:{e}\n",
|
|
|
+ )
|
|
|
else:
|
|
|
- print("Did not find any user info")
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="3000",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ message="{}\t获取 id 失败".format(self.account_name),
|
|
|
+ )
|
|
|
|
|
|
def process_video_obj(self, video_obj):
|
|
|
+ trace_id = self.platform + str(uuid.uuid1())
|
|
|
video_dict = {
|
|
|
- "video_id": video_obj['nonce_id'],
|
|
|
- "video_title": video_obj['desc'],
|
|
|
- "cover_url": video_obj['media']["cover_url"],
|
|
|
- "video_url": video_obj['media']['video_url'],
|
|
|
- "avatar_url": video_obj['head_url'],
|
|
|
- "width": video_obj['media']['width'],
|
|
|
- "height": video_obj['media']['height']
|
|
|
+ "video_id": video_obj["nonce_id"],
|
|
|
+ "video_title": video_obj["desc"],
|
|
|
+ "publish_time_stamp": int(time.time()),
|
|
|
+ "publish_time_str": time.strftime(
|
|
|
+ "%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))
|
|
|
+ ),
|
|
|
+ "play_cnt": 0,
|
|
|
+ "comment_cnt": 0,
|
|
|
+ "like_cnt": 0,
|
|
|
+ "share_cnt": 0,
|
|
|
+ "user_id": self.user_dict["user_id"],
|
|
|
+ "cover_url": video_obj["media"]["cover_url"],
|
|
|
+ "video_url": video_obj["media"]["video_url"],
|
|
|
+ "avatar_url": video_obj["head_url"],
|
|
|
+ "width": video_obj["media"]["width"],
|
|
|
+ "height": video_obj["media"]["height"],
|
|
|
+ "duration": video_obj["media"]["video_play_len_s"],
|
|
|
+ "platform": self.platform,
|
|
|
+ "strategy": self.mode,
|
|
|
+ "crawler_rule": self.rule_dict,
|
|
|
+ "session": f"shipinhao-author-{int(time.time())}",
|
|
|
}
|
|
|
- print(self.platform)
|
|
|
+ # 无更新时间,去重即可
|
|
|
+ pipeline = PiaoQuanPipeline(
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ item=video_dict,
|
|
|
+ rule_dict=self.rule_dict,
|
|
|
+ env=self.env,
|
|
|
+ trace_id=trace_id,
|
|
|
+ )
|
|
|
+ if not pipeline.repeat_video():
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ video_dict["out_user_id"] = video_dict["user_id"]
|
|
|
+ video_dict["user_id"] = self.user_dict["uid"]
|
|
|
+ video_dict["publish_time"] = video_dict["publish_time_str"]
|
|
|
+ self.mq.send_msg(video_dict)
|
|
|
+ AliyunLogger.logging(
|
|
|
+ code="1002",
|
|
|
+ platform=self.platform,
|
|
|
+ mode=self.mode,
|
|
|
+ env=self.env,
|
|
|
+ data=video_dict,
|
|
|
+ trace_id=trace_id,
|
|
|
+ message="成功发送 MQ 至 ETL",
|
|
|
+ )
|
|
|
+ return True
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
@@ -133,7 +271,6 @@ if __name__ == "__main__":
|
|
|
platform="shipinhao",
|
|
|
mode="author",
|
|
|
rule_dict={},
|
|
|
- env="prod"
|
|
|
+ env="prod",
|
|
|
)
|
|
|
SP.get_account_videos()
|
|
|
-
|