Explorar el Código

公众新号 获取视频方式优化

zhangyong hace 1 año
padre
commit
e8c1d96e89
Se ha modificado 1 fichero con 78 adiciones y 8 borrados
  1. 78 8
      gongzhongxinhao/gongzhongxinhao/gongzhongxinhao_author.py

+ 78 - 8
gongzhongxinhao/gongzhongxinhao/gongzhongxinhao_author.py

@@ -2,6 +2,7 @@ import datetime
 import json
 import os
 import random
+import re
 import sys
 import time
 import uuid
@@ -156,13 +157,79 @@ class GZXHAuthor:
         return video_url
 
     def get_wechat_gh(self, link: str):
-        url = "http://8.217.190.241:8888/crawler/wei_xin/account_info"
-        payload = json.dumps({"content_link": link})
-        headers = {'Content-Type': 'application/json'}
-        response = requests.request("POST", url, headers=headers, data=payload).json()
-        if response['code'] == 0:
-            wx_gh = response['data']['data']['wx_gh']
-        return wx_gh
+        """Look up the ``wx_gh`` value for an article link.
+
+        POSTs ``link`` as ``content_link`` to the crawler account-info
+        service, trying up to three times with a one-second pause before
+        each attempt.
+
+        Args:
+            link: Article URL to send to the service.
+
+        Returns:
+            The ``wx_gh`` field of the first response whose ``code`` is 0,
+            or ``None`` when no attempt succeeds.
+        """
+        # The request parts never change between attempts; build them once.
+        url = "http://8.217.190.241:8888/crawler/wei_xin/account_info"
+        payload = json.dumps({"content_link": link})
+        headers = {'Content-Type': 'application/json'}
+        for _ in range(3):
+            time.sleep(1)
+            try:
+                response = requests.request(
+                    "POST", url, headers=headers, data=payload, timeout=30
+                ).json()
+                if response['code'] == 0:
+                    return response['data']['data']['wx_gh']
+            except (requests.RequestException, ValueError, KeyError, TypeError):
+                # Network failure, non-JSON body or unexpected payload shape:
+                # fall through to the next attempt instead of aborting.
+                continue
+        return None
+
+
+    def get_js(self, link):
+        """Download ``link`` and return the response body as text.
+
+        Args:
+            link: URL of the page to fetch.
+
+        Returns:
+            The response body decoded as UTF-8; undecodable bytes are
+            replaced rather than raising.
+        """
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
+        }
+        # Without a timeout a stalled server would block the caller forever.
+        response = requests.get(link, headers=headers, timeout=30)
+        # The text is only scanned with regexes, so one bad byte should not
+        # abort the whole fetch.
+        return response.content.decode("utf-8", errors="replace")
+
+
+    def get_link(self, video_id):
+        """Build a playable URL for ``video_id`` via the h5vv.video.qq.com getinfo API.
+
+        The endpoint answers with a JSONP-style body (``QZOutputJson={...};``).
+        The JSON object is extracted from it and the URL is assembled from
+        the first entry of ``vl.vi``: first ``ul.ui`` folder URL + file name
+        (``fn``) + ``?vkey=`` + ``fvkey``.
+
+        Args:
+            video_id: Value substituted into the ``vid`` query parameter.
+
+        Returns:
+            The assembled video URL, or ``None`` when the response is not
+            parseable or lacks the expected fields.
+        """
+        url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(
+            video_id
+        )
+        headers = {
+            "Host": "h5vv.video.qq.com",
+            "xweb_xhr": "1",
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Accept": "*/*",
+            "Sec-Fetch-Site": "cross-site",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Dest": "empty",
+            "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
+            "Accept-Language": "en",
+        }
+        response = requests.get(url, headers=headers, timeout=30)
+        # Slice out the JSON object itself instead of assuming the body is
+        # exactly "QZOutputJson=" + JSON + one trailing character.
+        text = response.text
+        start = text.find("{")
+        end = text.rfind("}")
+        if start == -1 or end < start:
+            return None
+        try:
+            result = json.loads(text[start:end + 1])
+            info = result["vl"]["vi"][0]
+            folder = info["ul"]["ui"][0]["url"]
+            return folder + info["fn"] + "?vkey=" + info["fvkey"]
+        except (ValueError, KeyError, IndexError, TypeError):
+            # Malformed JSON or an error payload without video entries; the
+            # caller treats a falsy result as "no link".
+            return None
+
+    def get_url(self, js_code):
+        """Extract a video URL embedded in a page's inline script text.
+
+        The first ``url: ('...')`` occurrence wins; if there is none, the
+        first ``target_url: "..."`` value is used instead.
+
+        Args:
+            js_code: Page source to search.
+
+        Returns:
+            The captured URL string, or ``None`` when neither pattern matches.
+        """
+        primary = re.search(r"url: \('(.*?)'\)", js_code)
+        if primary is not None:
+            return primary.group(1)
+        # Fallback form used when the primary pattern is absent.
+        fallback = re.search(r'target_url\s*:\s*"(.*?)"', js_code)
+        if fallback is None:
+            return None
+        return fallback.group(1)
+
+
+    def get_video_url(self, url: str):
+        """Resolve the playable video link for an article page.
+
+        Fetches the page up to three times. On each attempt, if the page
+        contains a ``video_id: '...'`` value the link is obtained through
+        ``get_link``; otherwise it is taken from the page itself through
+        ``get_url``.
+
+        Args:
+            url: Article page URL.
+
+        Returns:
+            The first non-empty link found, or ``None`` when all attempts
+            fail.
+        """
+        for _ in range(3):
+            js_code = self.get_js(url)
+            match = re.search(r"video_id:\s*'([^']*)'", js_code)
+            video_id = match.group(1) if match else None
+            if video_id:
+                mp4_link = self.get_link(video_id)
+            else:
+                mp4_link = self.get_url(js_code)
+                # get_url returns None when nothing matches; only unescape
+                # an actual string.
+                if mp4_link:
+                    # Turn the literal "\x26amp;" sequence back into "&".
+                    mp4_link = mp4_link.replace("\\x26amp;", "&")
+            if mp4_link:
+                return mp4_link
+        return None
 
 
 
@@ -247,9 +314,12 @@ class GZXHAuthor:
         date_time_obj = datetime.strptime(publish_time_str, date_format)
         publish_time_stamp = int(date_time_obj.timestamp())
         article_url = article.get("url", "")
+        if article_url:
+            video_url = self.get_video_url(article_url)
+
         video_id = wechat_gh + str(int(date_time_obj.timestamp()))
         cover_url = article.get("head_pic", "")
-        video_url = self.get_video_url(article_url)
+        # video_url = self.get_video_url(article_url)
         video_dict = {
             "user_name": user_name,
             "video_id": video_id,