123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- import datetime
- import json
- import os
- import random
- import sys
- import time
- import uuid
- import requests
- from common.feishu import Feishu
- sys.path.append(os.getcwd())
- from selenium.webdriver import DesiredCapabilities
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium import webdriver
- from common.mq import MQ
- from common import AliyunLogger, PiaoQuanPipeline
- from common.limit import AuthorLimit
- from datetime import datetime
class GZXHAuthor:
    """Crawler for the videos of one WeChat official account ("公众新号").

    Given an account link in ``user_dict['link']`` it resolves the account id,
    fetches the account's article list, extracts a playable video URL from each
    article with a headless Chrome session, filters the result through
    ``PiaoQuanPipeline`` and forwards accepted videos to the ETL message queue.
    """

    # requests applies no timeout by default; without one a stalled endpoint
    # would block the crawler indefinitely.
    REQUEST_TIMEOUT = 60

    # Format of the "published_time" field in the article list.
    _PUBLISH_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, platform, mode, rule_dict, user_dict, env):
        """Store the crawl configuration and create the MQ producer and limiter.

        Args:
            platform: Platform name; used in logs, trace ids and the limiter.
            mode: Crawl strategy name; used in logs and the limiter.
            rule_dict: Crawl rules handed unchanged to ``PiaoQuanPipeline``.
            user_dict: Account description; ``link`` and ``uid`` are read.
            env: Environment name; selects the MQ topic and the Chrome setup.
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_dict = user_dict
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.limiter = AuthorLimit(platform=self.platform, mode=self.mode)

    def get_account_videos(self):
        """Crawl the configured account, logging start, failure and finish.

        Exceptions raised while crawling are logged with code 3000 and are not
        propagated to the caller.
        """
        AliyunLogger.logging(
            code="1003",
            platform=self.platform,
            mode=self.mode,
            env=self.env,
            message="开始抓取公众新号: {}".format(self.user_dict['link']),
        )
        try:
            self.get_videoList()
        except Exception as e:
            AliyunLogger.logging(
                code="3000",
                platform=self.platform,
                mode=self.mode,
                env=self.env,
                message=f"抓取公众新号: {self.user_dict['link']} 时异常,异常信息: {e}",
            )
        AliyunLogger.logging(
            code="1004",
            platform=self.platform,
            mode=self.mode,
            env=self.env,
            message="抓取公众新号: {}".format(self.user_dict['link']),
        )

    @staticmethod
    def _parse_tencent_video_url(payload_text):
        """Build the download URL from a Tencent ``getinfo`` response body.

        Args:
            payload_text: Response text, either bare JSON or wrapped as
                ``QZOutputJson={...};``.

        Returns:
            ``<folder url><file name>?vkey=<fvkey>`` for the first video entry.

        Raises:
            ValueError: If the unwrapped text is not valid JSON.
            KeyError, IndexError: If the expected fields are missing.
        """
        prefix = "QZOutputJson="
        body = payload_text.strip()
        if body.startswith(prefix):
            body = body[len(prefix):]
        # Drop the trailing statement terminator of the JSONP-style wrapper.
        body = body.rstrip("; \t\r\n")
        video_info = json.loads(body)["vl"]["vi"][0]
        folder = video_info["ul"]["ui"][0]["url"]
        return folder + video_info["fn"] + "?vkey=" + video_info["fvkey"]

    @staticmethod
    def _clean_title(title):
        """Return *title* with spaces, double quotes and single quotes removed."""
        return title.replace(" ", "").replace('"', "").replace("'", "")

    @classmethod
    def _parse_publish_time(cls, publish_time_str):
        """Convert a ``YYYY-mm-dd HH:MM:SS`` string to an integer Unix timestamp.

        The string is parsed as a naive datetime, so the timestamp is computed
        in the local timezone of the machine running the crawler.

        Raises:
            ValueError: If the value is missing, not a string, or malformed.
        """
        if not isinstance(publish_time_str, str) or not publish_time_str:
            raise ValueError(
                f"missing or invalid published_time: {publish_time_str!r}"
            )
        parsed = datetime.strptime(publish_time_str, cls._PUBLISH_TIME_FORMAT)
        return int(parsed.timestamp())

    def get_tencent_video_url(self, video_id):
        """Fetch the download URL of a Tencent-hosted video.

        Args:
            video_id: The ``vid`` value taken from the embedded player URL.

        Returns:
            The download URL assembled from the ``getinfo`` response.
        """
        url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(
            video_id
        )
        headers = {
            "Host": "h5vv.video.qq.com",
            "xweb_xhr": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "*/*",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
            "Accept-Language": "en",
        }
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        video_url = self._parse_tencent_video_url(response.text)
        # Random pause between calls to the Tencent endpoint.
        time.sleep(random.randint(1, 5))
        return video_url

    def get_video_url(self, article_url):
        """Open an article in Chrome and extract the URL of its video.

        Args:
            article_url: URL of the article page to load.

        Returns:
            The ``src`` of the native video poster element if present;
            otherwise the Tencent download URL derived from the embedded
            player; otherwise ``0`` when the page contains neither element.
        """
        # Copy first: DesiredCapabilities.CHROME is a shared class-level dict.
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
        # Run without opening a browser window.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
        )
        chrome_options.add_argument("--no-sandbox")
        if self.env == "prod":
            driver = webdriver.Chrome(
                desired_capabilities=capabilities, options=chrome_options
            )
        else:
            # Non-prod uses a local chromedriver binary; the capabilities and
            # options above are not applied on this path.
            driver = webdriver.Chrome(
                service=Service(
                    "/Users/tzld/Downloads/chromedriver_mac64/chromedriver"
                ),
            )
        poster_xpath = '//div[@class="js_video_poster video_poster"]/*[2]'
        tencent_xpath = '//span[@class="js_tx_video_container"]/*[1]'
        try:
            driver.implicitly_wait(10)
            driver.get(article_url)
            time.sleep(1)
            # Each lookup is subject to the implicit wait, so query once and
            # reuse the result instead of searching again for the same node.
            posters = driver.find_elements(By.XPATH, poster_xpath)
            if posters:
                video_url = posters[0].get_attribute("src")
            else:
                players = driver.find_elements(By.XPATH, tencent_xpath)
                if players:
                    player_src = players[0].get_attribute("src")
                    video_id = player_src.split("vid=")[-1].split("&")[0]
                    video_url = self.get_tencent_video_url(video_id)
                else:
                    video_url = 0
        finally:
            # Always release the browser, even when loading or parsing fails.
            driver.quit()
        if "mpvideo.qpic.cn" in str(video_url):
            time.sleep(random.randint(1, 3))
        return video_url

    def get_wechat_gh(self, link: str):
        """Resolve an account/article link to the account's ``wx_gh`` id.

        Args:
            link: The link sent to the account-info service as ``content_link``.

        Returns:
            The ``wx_gh`` value when the service answers with ``code == 0``,
            otherwise ``None``.
        """
        url = "http://8.217.190.241:8888/crawler/wei_xin/account_info"
        payload = json.dumps({"content_link": link})
        headers = {'Content-Type': 'application/json'}
        response = requests.request(
            "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
        ).json()
        if response['code'] == 0:
            return response['data']['data']['wx_gh']
        return None

    def get_videoList(self):
        """Fetch the account's article list and process every article.

        Stops early when the account id cannot be resolved, when the list
        service returns no ``list`` field (after a 15 minute pause), when the
        list is empty, or when ``process_video_obj`` returns ``False``.
        """
        time.sleep(1)
        wechat_gh = self.get_wechat_gh(self.user_dict['link'])
        if wechat_gh is None:
            AliyunLogger.logging(
                code="2004",
                platform=self.platform,
                mode=self.mode,
                env=self.env,
                message=f"获取用主页为空{self.user_dict['link']}",
            )
            return
        time.sleep(1)
        url = "http://61.48.133.26:30001/GetGh_Doc"
        payload = json.dumps({
            "appid": wechat_gh,
            "decode": "1"
        })
        headers = {
            'Content-Type': 'application/json'
        }
        r = requests.request(
            "POST", url, headers=headers, data=payload, timeout=self.REQUEST_TIMEOUT
        )
        # Decode the body once and reuse it for every check below.
        result = r.json()
        if "list" not in result:
            AliyunLogger.logging(
                code="2000",
                platform=self.platform,
                mode=self.mode,
                env=self.env,
                message=f"status_code:{r.status_code}, get_videoList:{r.text}\n",
            )
            # Long pause before giving up on an unexpected response.
            time.sleep(60 * 15)
            return
        app_msg_list = result["list"]
        if not app_msg_list:
            AliyunLogger.logging(
                code="2000",
                platform=self.platform,
                mode=self.mode,
                env=self.env,
                message="没有更多视频了\n",
            )
            return
        user_name = result.get("gh_name")
        for article in app_msg_list:
            try:
                AliyunLogger.logging(
                    code="1001",
                    platform=self.platform,
                    mode=self.mode,
                    message="扫描到一条视频",
                    env=self.env,
                    data=article,
                )
                keep_going = self.process_video_obj(article, user_name, wechat_gh)
                if not keep_going:
                    return
            except Exception as e:
                # NOTE(review): the source indentation was lost and a bare
                # `return` followed this handler. It is read here as the
                # end-of-function return, so one failing article does not
                # abort the remaining ones -- confirm against the original.
                AliyunLogger.logging(
                    code="3000",
                    platform=self.platform,
                    mode=self.mode,
                    env=self.env,
                    message=f"抓取单条视频异常:{e}\n",
                )

    def process_video_obj(self, article, user_name, wechat_gh):
        """Build the video record for one article, filter it and publish it.

        Args:
            article: One entry of the article list; ``published_time``,
                ``url``, ``head_pic`` and ``title`` are read.
            user_name: Account display name stored with the record.
            wechat_gh: Account id; combined with the publish timestamp to form
                the video id.

        Returns:
            ``False`` when ``pipeline.publish_time_flag()`` fails, which makes
            the caller stop scanning; ``True`` in every other case.

        Raises:
            ValueError: If ``published_time`` is missing or malformed.
        """
        trace_id = self.platform + str(uuid.uuid1())
        publish_time_str = article.get("published_time", "")
        publish_time_stamp = self._parse_publish_time(publish_time_str)
        article_url = article.get("url", "")
        video_id = wechat_gh + str(publish_time_stamp)
        cover_url = article.get("head_pic", "")
        video_title = self._clean_title(article.get("title", ""))
        video_url = self.get_video_url(article_url)
        video_dict = {
            "user_name": user_name,
            "video_id": video_id,
            "video_title": video_title,
            "out_video_id": video_id,
            "publish_time_stamp": publish_time_stamp,
            "update_time_stamp": 0,
            "publish_time_str": publish_time_str,
            "play_cnt": 0,
            "comment_cnt": 0,
            "like_cnt": 0,
            "share_cnt": 0,
            "user_id": self.user_dict["uid"],
            "cover_url": cover_url,
            "video_url": video_url,
            "width": 0,
            "height": 0,
            "duration": 0,
            "platform": self.platform,
            "strategy": self.mode,
            "crawler_rule": self.rule_dict,
            "session": f"gongzhongxinhao-author-{int(time.time())}",
        }
        AliyunLogger.logging(
            code="1001",
            trace_id=trace_id,
            platform=self.platform,
            mode=self.mode,
            env=self.env,
            message="扫描到一条视频",
            data=video_dict,
        )
        pipeline = PiaoQuanPipeline(
            platform=self.platform,
            mode=self.mode,
            item=video_dict,
            rule_dict=self.rule_dict,
            env=self.env,
            trace_id=trace_id,
        )
        # A failed publish-time check is the only result that stops the scan.
        if not pipeline.publish_time_flag():
            return False
        # The remaining checks only skip this article.
        if not pipeline.repeat_video():
            return True
        if not pipeline.download_rule_flag():
            return True
        if not pipeline.title_flag():
            return True

        # Record the accepted video as a new row 2 of the Feishu sheet.
        crawl_time = datetime.now().strftime(self._PUBLISH_TIME_FORMAT)
        values = [[
            user_name,
            video_id,
            video_title,
            publish_time_str,
            crawl_time,
            video_url,
            cover_url,
            self.user_dict['link']
        ]]
        Feishu.insert_columns('gongzhonghao', 'gongzhonghao', "9QU7wE", "ROWS", 1, 2)
        time.sleep(0.5)
        Feishu.update_values('gongzhonghao', 'gongzhonghao', "9QU7wE", "A2:Z2", values)

        video_dict["publish_time"] = video_dict["publish_time_str"]
        limit_flag = self.limiter.author_limitation(user_id=video_dict['user_id'])
        if limit_flag:
            self.mq.send_msg(video_dict)
            self.download_cnt += 1
            AliyunLogger.logging(
                code="1002",
                platform=self.platform,
                mode=self.mode,
                env=self.env,
                data=video_dict,
                trace_id=trace_id,
                message="成功发送 MQ 至 ETL",
            )
            # NOTE(review): the source indentation was lost; this pause is
            # assumed to belong to the send branch -- confirm.
            time.sleep(5)
        return True
if __name__ == "__main__":
    # Manual run: crawl one sample account link in the dev environment.
    sample_account = {
        "uid": "123456",
        "link": "https://mp.weixin.qq.com/s/Cwc2D3RUNDk30zv_s0IxTA",
        "user_id": "1234565",
    }
    sample_rules = {"period": {"min": 1, "max": 1}}
    crawler = GZXHAuthor(
        platform="gongzhonghao",
        mode="author",
        rule_dict=sample_rules,
        user_dict=sample_account,
        env="dev",
    )
    crawler.get_account_videos()
|