فهرست منبع

1. 新增视频号——账户爬虫代码(未完全)
2. 美化了 pipeline代码
3. 修改了西瓜视频的抓取规则

罗俊辉 1 سال پیش
والد
کامیت
d989081edd

+ 0 - 27
common/pipeline.py

@@ -79,19 +79,6 @@ class PiaoQuanPipeline:
 
     # 视频基础下载规则
     def download_rule_flag(self):
-        # 格式化 video_dict:publish_time_stamp
-        # if self.item.get("publish_time_stamp"):
-        #     self.item["publish_time"] = self.item["publish_time_stamp"] * 1000
-        # # 格式化 video_dict:period
-        # if (
-        #     self.item.get("publish_time")
-        #     and self.item.get("period", "noperiod") == "noperiod"
-        # ):
-        #     self.item["period"] = int(
-        #         (int(time.time() * 1000) - self.item["publish_time"])
-        #         / (3600 * 24 * 1000)
-        #     )
-        # 格式化 rule_dict 最大值取值为 0 的问题
         for key in self.item:
             if self.rule_dict.get(key):
                 max_value = (
@@ -101,20 +88,6 @@ class PiaoQuanPipeline:
                 )
                 if key == "peroid": # peroid是抓取周期天数
                     continue
-                    # flag = 0 <= int(self.item[key]) <= max_value
-                    # if not flag:
-                    #     AliyunLogger.logging(
-                    #         code="2004",
-                    #         trace_id=self.trace_id,
-                    #         platform=self.platform,
-                    #         mode=self.mode,
-                    #         env=self.env,
-                    #         data=self.item,
-                    #         message="{}: 0 <= {} <= {}, {}".format(
-                    #             key, self.item[key], max_value, flag
-                    #         ),
-                    #     )
-                    #     return flag
                 else:
                     flag = int(self.rule_dict[key]["min"]) <= int(self.item[key]) <= max_value
                     if not flag:

+ 0 - 0
shipinhao/shipinhao_author/__init__.py


+ 84 - 0
shipinhao/shipinhao_author/shipinhao_author_test.py

@@ -0,0 +1,84 @@
+import json
+import requests
+
+
+class SphAuthor:
+    """Look up a Shipinhao (视频号) account by name and print its video titles.
+
+    Both lookups go through the ``videosnap`` endpoint of mp.weixin.qq.com and
+    are authenticated with a session token plus a cookie string.
+
+    Args:
+        name: Account name used as the search query.
+        token: Optional session token; defaults to ``DEFAULT_TOKEN``.
+        cookie: Optional cookie header value; defaults to ``DEFAULT_COOKIE``.
+    """
+
+    VIDEOSNAP_URL = "https://mp.weixin.qq.com/cgi-bin/videosnap"
+    # Seconds; without a timeout a stalled connection blocks forever.
+    REQUEST_TIMEOUT = 10
+    # NOTE(review): session credentials are committed in source. They are kept
+    # as defaults for backward compatibility; prefer passing token/cookie in.
+    DEFAULT_TOKEN = "1678001807"
+    DEFAULT_COOKIE = 'ua_id=bw4VuFJr6fAuSkwdAAAAAClaW0m9Aua-6IfHaXU_zpo=; wxuin=95302180931488; mm_lang=zh_CN; RK=kreEMgtMMJ; ptcz=8fd1b267c98a1185bbe6455a081f1264048ee388363ca305d9ef4812892c7900; qq_domain_video_guid_verify=2ba78a5010233582; poc_sid=HOinP2Wj322Ex737kV651Zqy6y8fSprOUUvaegBg; _qimei_q36=; _qimei_h38=9eea33ea92afe8a922333fce03000001317916; pgv_pvid=9056371236; uuid=c2c0f943291da7eff8f6972740e4f894; _clck=3930572231|1|fgk|0; rand_info=CAESILaJJXli7mbr458BaiPXyXMbWTe3TKCzsfFmsXaTOPcU; slave_bizuin=3524986952; data_bizuin=3524986952; bizuin=3524986952; data_ticket=2/3sHr4KYg12+LGHQV6k5K3pJ6S8S4nAYWhKsIhhij/OcGFjUysTrpY75aUZy9M9; slave_sid=a2JpR21wSF9xRmNLU1V1Ylh5U0Ywd1o4MUdyZ3FVZXhfNGUyWXc3dURfbWlpdjFUcUl5elp0RURpWERwTktBb3VDenFsR2hxbHZ2cnRBdjZJSk9oMnRtSV83azFWOG9WbDd5U0h6Q1RkajhxY21CTmxzSFlYZDVjaUhteWozbzRFS3luRzNUUExzVmxkRzhG; slave_user=gh_0d8cf8319a3b; xid=84a3765ddefaf98f144be8b1aafa0d58; _clsk=1xoy7wc|1699512584249|6|1|mp.weixin.qq.com/weheat-agent/payload/record; bizuin=3524986952; data_bizuin=3524986952; data_ticket=2/3sHr4KYg12+LGHQV6k5K3pJ6S8S4nAYWhKsIhhij/OcGFjUysTrpY75aUZy9M9; rand_info=CAESILaJJXli7mbr458BaiPXyXMbWTe3TKCzsfFmsXaTOPcU; slave_bizuin=3524986952; slave_sid=a2JpR21wSF9xRmNLU1V1Ylh5U0Ywd1o4MUdyZ3FVZXhfNGUyWXc3dURfbWlpdjFUcUl5elp0RURpWERwTktBb3VDenFsR2hxbHZ2cnRBdjZJSk9oMnRtSV83azFWOG9WbDd5U0h6Q1RkajhxY21CTmxzSFlYZDVjaUhteWozbzRFS3luRzNUUExzVmxkRzhG; slave_user=gh_0d8cf8319a3b'
+
+    def __init__(self, name, token=None, cookie=None):
+        self.token = self.DEFAULT_TOKEN if token is None else token
+        self.cookie = self.DEFAULT_COOKIE if cookie is None else cookie
+        self.name = name
+
+    def _headers(self):
+        """Build the browser-like headers shared by both videosnap requests."""
+        return {
+            'authority': 'mp.weixin.qq.com',
+            'accept': '*/*',
+            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
+            'cookie': self.cookie,
+            # The referer carries the same token as the query string, so it is
+            # derived from self.token rather than repeated as a literal.
+            'referer': 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN'.format(self.token),
+            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'x-requested-with': 'XMLHttpRequest'
+        }
+
+    def _fetch_json(self, params):
+        """GET the videosnap endpoint with ``params`` and return the decoded JSON.
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status.
+            requests.Timeout: if no answer arrives within ``REQUEST_TIMEOUT``.
+        """
+        response = requests.request(
+            "GET",
+            self.VIDEOSNAP_URL,
+            headers=self._headers(),
+            params=params,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def get_user_id(self):
+        """Search for ``self.name`` and return the first matching account.
+
+        Returns:
+            The first entry of the response's ``acct_list``, or ``None`` when
+            the search yields no accounts.
+        """
+        params = {
+            "action": "search",
+            "scene": "1",
+            "buffer": "",
+            "query": self.name,
+            "count": "21",
+            "token": self.token,
+            "lang": "zh_CN",
+            "f": "json",
+            "ajax": "1"
+        }
+        accounts = self._fetch_json(params).get('acct_list') or []
+        # The first hit is taken as-is; this could be refined to require an
+        # exact name match.
+        return accounts[0] if accounts else None
+
+    def get_video_list(self):
+        """Print the ``desc`` field of each video in the account's feed list."""
+        user_info = self.get_user_id()
+        if not user_info:
+            print("Did not find any user info")
+            return
+        params = {
+            "action": "get_feed_list",
+            "username": user_info['username'],
+            "buffer": "",
+            "count": "15",
+            "scene": "1",
+            "token": self.token,
+            "lang": "zh_CN",
+            "f": "json",
+            "ajax": "1"
+        }
+        # A payload without 'list' (e.g. an error response) prints nothing.
+        for obj in self._fetch_json(params).get('list') or []:
+            print(obj['desc'])
+
+
+if __name__ == "__main__":
+    # Manual smoke run: print the video descriptions of one sample account.
+    author = SphAuthor("心煤")
+    author.get_video_list()

+ 99 - 0
shipinhao/shipinhao_author/shipinhao_scheduling.py

@@ -0,0 +1,99 @@
+import requests
+from common.aliyun_log import AliyunLogger
+from common.db import MysqlHelper
+
+
+def get_history_id(name):
+    """
+    Return the stored account id for ``name``, or ``False`` if none is known.
+
+    The database lookup is not implemented yet: the name -> id mapping is
+    always empty, so every call currently returns ``False``.
+    """
+    # TODO: fill this mapping from the crawler_user_v3 table via MysqlHelper.
+    known_ids = {}
+    return known_ids.get(name) or False
+
+
+def find_target_user(name, user_list):
+    """
+    Return the first entry of ``user_list`` whose ``'name'`` equals ``name``.
+
+    Args:
+        name: Account name to look for (exact match).
+        user_list: Iterable of dict-like account entries; ``None`` or an
+            empty list is treated as "no candidates".
+
+    Returns:
+        The matching entry, or ``False`` when nothing matches.
+    """
+    for candidate in user_list or ():
+        # Entries without a 'name' key are skipped instead of raising KeyError.
+        if 'name' in candidate and candidate['name'] == name:
+            return candidate
+    return False
+
+
+class ShiPinHaoAccount:
+    """Resolve a Shipinhao (视频号) account by name and print its video titles.
+
+    Args:
+        token: Session token sent in the query string and the referer.
+        cookie: Cookie header value used to authenticate the requests.
+        account_name: Account name to resolve.
+    """
+
+    VIDEOSNAP_URL = "https://mp.weixin.qq.com/cgi-bin/videosnap"
+    # Seconds; without a timeout a stalled connection blocks forever.
+    REQUEST_TIMEOUT = 10
+
+    def __init__(self, token, cookie, account_name):
+        self.token = token
+        self.cookie = cookie
+        self.account_name = account_name
+
+    def _headers(self):
+        """Build the request headers shared by both videosnap requests."""
+        return {
+            'authority': 'mp.weixin.qq.com',
+            'accept': '*/*',
+            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
+            'cookie': self.cookie,
+            'referer': 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=77&createType=0&token={}&lang=zh_CN'.format(self.token),
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'x-requested-with': 'XMLHttpRequest'
+        }
+
+    def _fetch_json(self, params):
+        """GET the videosnap endpoint with ``params`` and return the decoded JSON.
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status.
+            requests.Timeout: if no answer arrives within ``REQUEST_TIMEOUT``.
+        """
+        response = requests.request(
+            "GET",
+            self.VIDEOSNAP_URL,
+            headers=self._headers(),
+            params=params,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def get_account_id(self):
+        """Return the account info for ``self.account_name``.
+
+        A stored id from ``get_history_id`` is returned directly when one
+        exists; otherwise the account is searched remotely and the entry whose
+        name matches exactly is returned (``False`` when there is none).
+        """
+        # NOTE(review): a stored history id is returned as-is, while
+        # get_account_videos indexes the result with ['username']; confirm the
+        # stored value has that shape once get_history_id is implemented.
+        history_id = get_history_id(self.account_name)
+        if history_id:
+            return history_id
+        params = {
+            "action": "search",
+            "scene": "1",
+            "buffer": "",
+            "query": self.account_name,
+            "count": "21",
+            "token": self.token,
+            "lang": "zh_CN",
+            "f": "json",
+            "ajax": "1"
+        }
+        # A payload without 'acct_list' (e.g. an error response) is treated as
+        # "no accounts found" instead of raising KeyError.
+        user_list = self._fetch_json(params).get('acct_list') or []
+        return find_target_user(name=self.account_name, user_list=user_list)
+
+    def get_account_videos(self):
+        """Print the ``desc`` field of each video in the account's feed list."""
+        user_info = self.get_account_id()
+        if not user_info:
+            print("Did not find any user info")
+            return
+        params = {
+            "action": "get_feed_list",
+            "username": user_info['username'],
+            "buffer": "",
+            "count": "15",
+            "scene": "1",
+            "token": self.token,
+            "lang": "zh_CN",
+            "f": "json",
+            "ajax": "1"
+        }
+        for obj in self._fetch_json(params).get('list') or []:
+            print(obj['desc'])

+ 29 - 14
xigua/xigua_author/xigua_author.py

@@ -783,6 +783,7 @@ class XiGuaAuthor:
                 ),
             )
             return False
+
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
             mode=self.mode,
@@ -791,20 +792,34 @@ class XiGuaAuthor:
             item=video_dict,
             trace_id=trace_id,
         )
-        flag = pipeline.process_item()
-        if flag:
-            print(json.dumps(video_dict, ensure_ascii=False, indent=4))
-            self.mq.send_msg(video_dict)
-            self.download_count += 1
-            AliyunLogger.logging(
-                code="1002",
-                platform=self.platform,
-                mode=self.mode,
-                env=self.env,
-                data=video_dict,
-                trace_id=trace_id,
-                message="成功发送 MQ 至 ETL",
-            )
+        title_flag = pipeline.title_flag()
+        repeat_flag = pipeline.repeat_video()
+        if title_flag and repeat_flag:
+            if int(video_dict['play_cnt']) >= int(self.rule_dict.get("play_cnt", {}).get("min", 100000)):
+                self.mq.send_msg(video_dict)
+                self.download_count += 1
+                AliyunLogger.logging(
+                    code="1002",
+                    platform=self.platform,
+                    mode=self.mode,
+                    env=self.env,
+                    data=video_dict,
+                    trace_id=trace_id,
+                    message="成功发送 MQ 至 ETL",
+                )
+            else:
+                if float(video_dict['like_cnt']) / float(video_dict['play_cnt']) >= 0.04:
+                    self.mq.send_msg(video_dict)
+                    self.download_count += 1
+                    AliyunLogger.logging(
+                        code="1002",
+                        platform=self.platform,
+                        mode=self.mode,
+                        env=self.env,
+                        data=video_dict,
+                        trace_id=trace_id,
+                        message="成功发送 MQ 至 ETL",
+                    )
         return True
 
     def get_video_info(self, item_id, trace_id):