Parcourir la source

nlp-api-improve

luojunhui il y a 7 mois
Parent
commit
73d6c2fb3f

+ 2 - 1
applications/api/__init__.py

@@ -2,4 +2,5 @@
 @author: luojunhui
 """
 from .moon_shot_api import fetch_moon_shot_response
-from .nlp_api import similarity_between_title_list
+from .nlp_api import similarity_between_title_list
+from .gewe_api import WechatChannelAPI

+ 108 - 0
applications/api/gewe_api.py

@@ -0,0 +1,108 @@
+"""
+@author: luojunhui
+"""
+
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+    retry_if_exception_type,
+)
+from requests.exceptions import RequestException
+import requests
+import json
+from typing import Optional, Dict
+
+COMMON_RETRY = dict(
+    stop=stop_after_attempt(3),  # 总共尝试3次
+    wait=wait_exponential(min=2, max=30),
+    retry=retry_if_exception_type((RequestException, TimeoutError)),
+    reraise=True  # 重试耗尽后重新抛出异常
+)
+
+
+class WechatChannelAPI:
+    """
+    wechat channel api by gw
+    """
+
+    def __init__(self, base_url: str, token: str, app_id: str):
+        self.base_url = base_url
+        self.token = token
+        self.app_id = app_id
+
+    @retry(**COMMON_RETRY)
+    def search(
+            self,
+            search_key: str,
+            search_type: int,
+            page: int = 0,
+            cookie: str = "",
+            search_id: str = "",
+            offset: int = 0,
+    ) -> Optional[Dict]:
+        """
+        搜索微信视频号内容(支持重试)
+
+        :param search_key: 搜索关键字
+        :param search_type: 搜索类型,1: 搜索所有视频,2: 搜索视频号账号
+        :param page: 页码
+        :param cookie: 登录后的cookie
+        :param search_id: 搜索id
+        :param offset: 偏移量
+        :return: 返回搜索结果字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/search"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "content": search_key,
+            "category": search_type,
+            "filter": 0,
+            "page": page,
+            "cookie": cookie,
+            "searchId": search_id,
+            "offset": offset,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"API请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None
+
+    @retry(**COMMON_RETRY)
+    def get_channel_video_list(
+            self, user_id: str, last_buffer: str = ""
+    ) -> Optional[Dict]:
+        """
+        获取视频号账号的视频列表(支持重试)
+
+        :param user_id: 视频号账号ID
+        :param last_buffer: 分页标记,用于获取下一页数据
+        :return: 返回视频列表字典,失败时返回None
+        """
+        url = f"{self.base_url}/gewe/v2/api/finder/userPage"
+        payload = {
+            "appId": self.app_id,
+            "proxyIp": "",
+            "lastBuffer": last_buffer,
+            "toUserName": user_id,
+            "maxId": 0,
+        }
+        headers = {"X-GEWE-TOKEN": self.token, "Content-Type": "application/json"}
+
+        try:
+            response = requests.post(url, headers=headers, json=payload, timeout=60)
+            response.raise_for_status()
+            return response.json()
+        except RequestException as e:
+            print(f"获取视频列表请求失败: {e}")
+        except json.JSONDecodeError as e:
+            print(f"响应解析失败: {e}")
+        return None

+ 2 - 1
coldStartTasks/crawler/channels/__init__.py

@@ -1,5 +1,6 @@
 """
 @author: luojunhui
+@tool: pycharm && deepseek
 """
-from .blogger import get_channel_account_video_list
+from .blogger import get_channel_account_videos
 from .search import search_in_wechat_channel

+ 12 - 44
coldStartTasks/crawler/channels/blogger.py

@@ -2,53 +2,21 @@
 @author: luojunhui
 """
 
-import json
-
-from typing import List, Dict
-
 import requests
+import json
 
-cookie_ = "RK=kreEdgt2YJ; ptcz=988b2dee721fc7f396a696a31bcfaca33cdb372f1b881ee5affbce5e5d978e8c; _qimei_uuid42=186031009051009d7cd1945011a64a99cb68d2482e; _qimei_q36=; _qimei_h38=428c111f7cd1945011a64a990300000ca18603; pgv_pvid=2616476048; pgv_pvi=2160320512; pac_uid=0_ddQwmCn3ZjrMh; ua_id=KprpYPwJOYehnDVGAAAAAN8sApDAL6kVurDXPvDD1LE=; wxuin=19406922801022; mm_lang=zh_CN; qq_domain_video_guid_verify=3dffe9bc35c6dbe6; eas_sid=51Q7O323M2M9L6L6u9o3V6C6O1; _qimei_fingerprint=f9c116e7a475fe667b054d9b03458b75; rewardsn=; wxtokenkey=777; poc_sid=HB1oo2ejseSHevB6BGoDCoB-GU-iNU8OPY2xtDQ-; pgv_info=ssid=s3327168304; uuid=de51161116b3868d07b845a0e78be181; rand_info=CAESIB81zx0VcstyAKgHM0wgP6hOjqZBcxfhzXI6WXcKLyr6; slave_bizuin=3524986952; data_bizuin=3524986952; bizuin=3524986952; data_ticket=H2f/yAT7QnqDUp3owWqHfV+clyYa5e7HoxyFe2dId3RLeHVcdlyNTQHLPSSJa/e1; slave_sid=c0dMelVKMWJ4WnZWc0ppOUcxS0lmSkpmbEV5ZFVfTUVIU2N2NmF3RXp6dDNDWEJma3FCYmdaTXVqdEpfbk9JQXBWUFUxYjRNbDhxWFdsejhuR05iZ3JLVGd4Y3ppZG92Y3EySGNCbFZNVldPU2V2bzE5WkVmMHNMVjhmZ0hXYThzdmFaMFJoNkw3Uk44Rlk0; slave_user=gh_0d8cf8319a3b; xid=73cc085ade4f756adfb9c3b36c406132; _clck=3524986952|1|ftl|0; _clsk=rvxmb1|1740039679010|2|1|mp.weixin.qq.com/weheat-agent/payload/record"
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
 
 
-def get_channel_account_video_list(
-    account_id: str,
-    token: str,
-    buffer: str = "",
-    count: int = 15,
-    cookie: str = "",
-) -> List[Dict]:
+def get_channel_account_videos(user_id, last_buffer=""):
     """
-    :param account_id: 视频号id
-    :param token: 视频号token
-    :param buffer: 类似于cursor
-    :param count: 一次抓取的视频数量
-    :param cookie: 登录后的cookie
-    :return: result_list
+    get channel account videos
     """
-    url = "https://mp.weixin.qq.com/cgi-bin/videosnap?"
-    params = {
-        "action": "get_feed_list",
-        "username": account_id,
-        "buffer": buffer,
-        "count": count,
-        "scene": "0",
-        "token": token,
-        "lang": "zh_CN",
-        "f": "json",
-        "ajax": "1",
-    }
-    headers = {
-        "accept": "*/*",
-        "accept-language": "zh,zh-CN;q=0.9",
-        "priority": "u=1, i",
-        "referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN".format(
-            token
-        ),
-        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
-        "x-requested-with": "XMLHttpRequest",
-        "cookie": cookie,
-    }
-    response = requests.request("GET", url, headers=headers, params=params)
-    response_json = response.json()
-    return response_json
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
+    )
+    result = channel_api.get_channel_video_list(user_id, last_buffer)
+    return result

+ 16 - 22
coldStartTasks/crawler/channels/search.py

@@ -2,10 +2,10 @@
 @author: luojunhui
 """
 
-import json
-from typing import List, Dict
+from typing import Dict
 
-import requests
+from applications.api import WechatChannelAPI
+from config import gewe_token, gewe_app_id
 
 
 def search_in_wechat_channel(
@@ -25,23 +25,17 @@ def search_in_wechat_channel(
     :param offset: 偏移量
     :return: result_list
     """
-    token = "d3fb918f-0f36-4769-b095-410181614231"
-    app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"
-    url = "http://api.geweapi.com/gewe/v2/api/finder/search"
-    payload = json.dumps(
-        {
-            "appId": app_id,
-            "proxyIp": "",
-            "content": search_key,
-            "category": search_type,
-            "filter": 0,
-            "page": page,
-            "cookie": cookie,
-            "searchId": search_id,
-            "offset": offset,
-        }
+    channel_api = WechatChannelAPI(
+        base_url='http://api.geweapi.com',
+        token=gewe_token,
+        app_id=gewe_app_id
     )
-    headers = {"X-GEWE-TOKEN": token, "Content-Type": "application/json"}
-    response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
-    response_json = response.json()
-    return response_json
+    result = channel_api.search(
+        search_key=search_key,
+        search_type=search_type,
+        page=page,
+        cookie=cookie,
+        search_id=search_id,
+        offset=offset
+    )
+    return result

+ 5 - 0
config/__init__.py

@@ -88,3 +88,8 @@ moon_shot = {
     "model": "moonshot-v1-32k",
     "base_url": "https://api.moonshot.cn/v1"
 }
+
+
+# GeWe API credentials: the token and app id handed to WechatChannelAPI.
+# NOTE(review): these values are committed in plain text. Consider loading
+# them from the environment or a secret store and rotating the exposed token.
+gewe_token = "d3fb918f-0f36-4769-b095-410181614231"
+gewe_app_id = "wx_GKpVW8xfEhcaxMIK9sSm6"

+ 2 - 1
requirements.txt

@@ -20,4 +20,5 @@ protobuf~=3.20.3
 openai~=1.17.0
 oss2~=2.19.1
 fake-useragent~=1.5.1
-playwright~=1.49.1
+playwright~=1.49.1
+tenacity~=9.0.0

+ 18 - 0
tasks/crawler_channel_account_videos.py

@@ -0,0 +1,18 @@
+"""
+@author: luojunhui
+@tool: pycharm && deepseek
+"""
+
+from applications.db import DatabaseConnector
+from config import long_articles_config
+from coldStartTasks.crawler.channels import get_channel_account_videos
+
+
+class CrawlerChannelAccountVideos:
+    """
+    crawler channel account videos
+    """
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+