ソースを参照

祝百岁之好,抓取视频详情页内视频流,一次 100 条

罗俊辉 1 年前
コミット
596752bc8b
2 ファイル変更、74 行追加、248 行削除
  1. 2 0
      common/pipeline.py
  2. 72 248
      zhubaisuizhihao/zhubaisuizhihao_recommend/zbszh_recommend.py

+ 2 - 0
common/pipeline.py

@@ -125,6 +125,8 @@ class PiaoQuanPipeline:
             return True
         if self.platform == "zhujinshanjinmei" and self.mode == "recommend":
             return True
+        if self.platform == "zhubaisuizhihao" and self.mode == "recommend":
+            return True
         out_id = self.item["out_video_id"]
         sql = f""" select * from crawler_video where platform = "{self.platform}" and out_video_id="{out_id}"; """
         repeat_video = MysqlHelper.get_values(

+ 72 - 248
zhubaisuizhihao/zhubaisuizhihao_recommend/zbszh_recommend.py

@@ -11,7 +11,6 @@ sys.path.append(os.getcwd())
 from common.video_item import VideoItem
 from common import PiaoQuanPipeline, AliyunLogger, tunnel_proxies
 from common.mq import MQ
-from common.db import MysqlHelper
 from zhuwanwufusu.crypt import AESCipher as AES
 
 
@@ -28,6 +27,9 @@ class ZhuBaiSuiZhiHaoRecommend(object):
         self.cryptor = AES()
 
     def get_recommend_list(self):
+        """
+        抓推荐流,只抓第一页
+        """
         url = "https://api.lidongze.cn/jeecg-boot/ugc/getVideoListsEn2"
         headers = {
             'Host': 'api.lidongze.cn',
@@ -39,128 +41,46 @@ class ZhuBaiSuiZhiHaoRecommend(object):
             'referer': 'https://servicewechat.com/wx87c457cfb89f95ad/4/page-frame.html',
             'accept-language': 'en-US,en;q=0.9'
         }
-        page_index = 1
-        total_page = 2
-        while page_index <= total_page:
+        query = {
+            "pageNo": 1,
+            "pageSize": 10,
+            "groupId": "1650323161797439489",  # 推荐流的 ID
+            "vn": 1,
+            "gx": 1,
+            "appid": "wx87c457cfb89f95ad",
+            "type": 2
+        }
+        params = {
+            "v": self.cryptor.aes_encrypt(data=json.dumps(query))
+        }
+        response = requests.request("GET", url, headers=headers, params=params, proxies=tunnel_proxies())
+        result = json.loads(self.cryptor.aes_decrypt(response.text))
+        page_index = result['list']['current'] + 1
+        for index, video_obj in enumerate(result['list']['records'], 1):
             try:
-                query = {
-                    "pageNo": page_index,
-                    "pageSize": 10,
-                    "groupId": "1650323161797439489",  # 推荐流的 ID
-                    "vn": 1,
-                    "gx": 1,
-                    "appid": "wx87c457cfb89f95ad",
-                    "type": 2
-                }
-                params = {
-                    "v": self.cryptor.aes_encrypt(data=json.dumps(query))
-                }
-                response = requests.request("GET", url, headers=headers, params=params, proxies=tunnel_proxies())
-                result = json.loads(self.cryptor.aes_decrypt(response.text))
-                total_page = result['list']['pages']
-                page_index = result['list']['current'] + 1
-                for index, video_obj in enumerate(result['list']['records'], 1):
-                    try:
-                        AliyunLogger.logging(
-                            code="1001",
-                            platform=self.platform,
-                            mode=self.mode,
-                            env=self.env,
-                            message="扫描到一条视频",
-                            data=video_obj
-                        )
-                        self.process_video_obj(video_obj)
-                    except Exception as e:
-                        AliyunLogger.logging(
-                            code="3000",
-                            platform=self.platform,
-                            mode=self.mode,
-                            env=self.env,
-                            message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(page_index, index, e)
-                        )
+                AliyunLogger.logging(
+                    code="1001",
+                    platform=self.platform,
+                    mode=self.mode,
+                    env=self.env,
+                    message="扫描到一条视频",
+                    data=video_obj
+                )
+                video_id = video_obj['id']
+                self.get_detail_video_list(video_id)
             except Exception as e:
                 AliyunLogger.logging(
                     code="3000",
                     platform=self.platform,
                     mode=self.mode,
                     env=self.env,
-                    message="抓取第{}页的时候失败, 报错原因是{}".format(page_index, e)
+                    message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(page_index, index, e)
                 )
-            time.sleep(random.randint(5, 10))
 
-    def get_user_videos(self, user_id):
+    def process_video_obj(self, video_obj):
         """
-        在抓取完推荐页之后,去抓每一个用户的主页视频
+        process video obj
         """
-        url = "https://api.lidongze.cn/jeecg-boot/ugc/getAuthVideoList"
-        headers = {
-            'Host': 'api.lidongze.cn',
-            'xweb_xhr': '1',
-            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.4(0x13080410)XWEB/31009',
-            'token': '',
-            'content-type': 'application/json',
-            'accept': '*/*',
-            'referer': 'https://servicewechat.com/wx87c457cfb89f95ad/4/page-frame.html',
-            'accept-language': 'en-US,en;q=0.9'
-        }
-        page_index = 1
-        total_page = 1
-        while page_index <= total_page:
-            query = {
-                "pageNo": page_index,
-                "pageSize": 10,
-                "authid": user_id
-            }
-            params = {
-                "v": self.cryptor.aes_encrypt(data=json.dumps(query))
-            }
-            response = requests.request("GET", url, headers=headers, params=params, proxies=tunnel_proxies())
-            result = json.loads(self.cryptor.aes_decrypt(response.text))
-            total_page = result['list']['pages']
-            page_index = result['list']['current'] + 1
-            for index, video_temp in enumerate(result['list']['records']):
-                video_id = video_temp['id']
-                detail_query = {
-                    "videoId": video_id
-                }
-                detail_params = {
-                    "v": self.cryptor.aes_encrypt(data=json.dumps(detail_query))
-                }
-                url = "https://api.lidongze.cn/jeecg-boot/ugc/getVideosDataEn"
-                headers = {
-                    'Host': 'api.lidongze.cn',
-                    'xweb_xhr': '1',
-                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.4(0x13080410)XWEB/31009',
-                    'token': '',
-                    'content-type': 'application/json',
-                    'accept': '*/*',
-                    'referer': 'https://servicewechat.com/wx87c457cfb89f95ad/4/page-frame.html',
-                    'accept-language': 'en-US,en;q=0.9'
-                }
-                detail_response = requests.request("GET", url, headers=headers, params=detail_params,
-                                                   proxies=tunnel_proxies())
-                detail_video = json.loads(self.cryptor.aes_decrypt(detail_response.text))
-                if detail_video['success']:
-                    try:
-                        AliyunLogger.logging(
-                            code="1001",
-                            platform=self.platform,
-                            mode=self.mode,
-                            env=self.env,
-                            message="扫描到一条视频",
-                            data=detail_video['data']
-                        )
-                        self.process_video_obj(detail_video['data'])
-                    except Exception as e:
-                        AliyunLogger.logging(
-                            code="3000",
-                            platform=self.platform,
-                            mode=self.mode,
-                            env=self.env,
-                            message="抓取单条视频失败, 该视频位于第{}条报错原因是{}".format(index, e)
-                        )
-
-    def process_video_obj(self, video_obj):
         time.sleep(random.randint(3, 8))
         trace_id = self.platform + str(uuid.uuid1())
         if video_obj.get("playnum"):
@@ -184,8 +104,6 @@ class ZhuBaiSuiZhiHaoRecommend(object):
         item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
         item.add_video_info("user_id", our_user['uid'])
         item.add_video_info("user_name", our_user['nick_name'])
-        # 把扫描到的账号存到 accounts 表中
-        self.manage_auth_id(out_user_id=video_obj['authid'], out_user_name=video_obj['authname'])
         mq_obj = item.produce_item()
         pipeline = PiaoQuanPipeline(
             platform=self.platform,
@@ -209,62 +127,10 @@ class ZhuBaiSuiZhiHaoRecommend(object):
             if self.download_cnt >= int(self.rule_dict.get("videos_cnt", {}).get("min", 200)):
                 self.limit_flag = True
 
-    def manage_auth_id(self, out_user_id, out_user_name):
+    def get_detail_video_list(self, video_id):
         """
-        out_user_id: 外站视频的用户 id
-        out_user_name: 外站视频用户名字
-        逻辑: 对新扫描到的视频的用户 id 进行判断,若用户 id 不存在,则把视频 id 存到表中,
-              如果用户 id 存在,则判断用户是否修改名字,若名字修改则更新名字
+        获取详情 id_list
         """
-        select_user_sql = f"""select name, name_id from accounts where name_id = "{out_user_id}" and platform = "{self.platform}" and useful = 1 limit 1"""
-        out_user_info = MysqlHelper.get_values(
-            log_type=self.mode,
-            crawler=self.platform,
-            sql=select_user_sql,
-            env=self.env,
-            machine="",
-        )
-        if out_user_info:
-            name, name_id = out_user_info[0]
-            if name == out_user_name:
-                return
-            else:
-                update_sql = f"""update accounts set name = "{out_user_name}" where name_id = "{out_user_id}";"""
-                MysqlHelper.update_values(
-                    log_type=self.mode,
-                    crawler=self.platform,
-                    sql=update_sql,
-                    env=self.env,
-                    machine=""
-                )
-        else:
-            insert_sql = f"""INSERT INTO accounts (name, name_id, platform, useful) values ("{out_user_name}", "{out_user_id}", "{self.platform}", 1 )"""
-            MysqlHelper.update_values(
-                log_type=self.mode,
-                crawler=self.platform,
-                sql=insert_sql,
-                env=self.env,
-                machine="",
-            )
-
-    def get_user_list(self):
-        select_user_sql = f"""select name_id from accounts where platform = "{self.platform}" and useful = 1"""
-        out_user_info = MysqlHelper.get_values(
-            log_type=self.mode,
-            crawler=self.platform,
-            sql=select_user_sql,
-            env=self.env,
-            machine="",
-        )
-        if out_user_info:
-            result = []
-            for i in out_user_info:
-                result.append(i[0])
-            return result
-        else:
-            return []
-
-    def get_detail_video_list(self):
         url = "https://api.lidongze.cn/jeecg-boot/ugc/getDetailVideoListsEn2"
         headers = {
             'Host': 'api.lidongze.cn',
@@ -275,98 +141,56 @@ class ZhuBaiSuiZhiHaoRecommend(object):
             'accept-language': 'en-US,en;q=0.9'
         }
         page_index = 1
-        total_page = 2
-        while page_index <= total_page:
-            try:
-                if self.limit_flag:
+        try:
+            query = {
+                "groupId": "1650323161797439489",
+                "pageNo": 1,
+                "pageSize": 10,
+                "videoId": video_id,
+                "appid": "wx87c457cfb89f95ad",
+                "type": 2,
+                "hxid": "1556555457243828666"
+            }
+            params = {
+                "v": self.cryptor.aes_encrypt(data=json.dumps(query))
+            }
+            response = requests.request("GET", url, headers=headers, params=params)
+            result = json.loads(self.cryptor.aes_decrypt(response.text))
+            page_index = result['list']['current'] + 1
+            for index, video_obj in enumerate(result['list']['records'], 1):
+                try:
                     AliyunLogger.logging(
-                        code="2000",
+                        code="1001",
                         platform=self.platform,
                         mode=self.mode,
                         env=self.env,
-                        message="本轮已经抓取足够数量的视频"
+                        message="扫描到一条视频",
+                        data=video_obj
                     )
-                    return
-                else:
-                    query = {
-                        "groupId": "1650323161797439489",
-                        "pageNo": page_index,
-                        "pageSize": 10,
-                        "appid": "wx87c457cfb89f95ad",
-                        "type": 2,
-                        "hxid": "1556555457243828666"
-                    }
-                    params = {
-                        "v": self.cryptor.aes_encrypt(data=json.dumps(query))
-                    }
-                    response = requests.request("GET", url, headers=headers, params=params)
-                    result = json.loads(self.cryptor.aes_decrypt(response.text))
-                    total_page = result['list']['pages']
-                    page_index = result['list']['current'] + 1
-                    for index, video_obj in enumerate(result['list']['records'], 1):
-                        try:
-                            AliyunLogger.logging(
-                                code="1001",
-                                platform=self.platform,
-                                mode=self.mode,
-                                env=self.env,
-                                message="扫描到一条视频",
-                                data=video_obj
-                            )
-                            self.process_video_obj(video_obj)
-                        except Exception as e:
-                            AliyunLogger.logging(
-                                code="3000",
-                                platform=self.platform,
-                                mode=self.mode,
-                                env=self.env,
-                                message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(page_index, index, e)
-                            )
-            except Exception as e:
-                AliyunLogger.logging(
-                    code="3000",
-                    platform=self.platform,
-                    mode=self.mode,
-                    env=self.env,
-                    message="抓取第{}页的时候失败, 报错原因是{}".format(page_index, e)
-                )
-            time.sleep(random.randint(5, 10))
-
-    def schedule(self):
-        """
-        先抓取推荐列表的视频, 等待 2 分钟后抓取 detail 页面,等待 5 分钟后,抓取账号视频
-        """
-        self.get_recommend_list()
-        if self.limit_flag:
-            return
-        time.sleep(2 * 60)
-        self.get_detail_video_list()
-        if self.limit_flag:
-            return
-        time.sleep(5 * 60)
-        self.mode = "author"
-        user_list = self.get_user_list()
-        if user_list:
-            for index, user_id in enumerate(user_list):
-                try:
-                    if self.limit_flag:
-                        AliyunLogger.logging(
-                            code="2000",
-                            platform=self.platform,
-                            mode=self.mode,
-                            env=self.env,
-                            message="本轮已经抓取足够数量的视频"
-                        )
-                        return
-                    self.get_user_videos(user_id=user_id)
+                    self.process_video_obj(video_obj)
                 except Exception as e:
                     AliyunLogger.logging(
                         code="3000",
                         platform=self.platform,
                         mode=self.mode,
                         env=self.env,
-                        message="抓取账号视频出现异常,账号 id 是{}, 报错原因是{}".format(user_id, e)
+                        message="抓取单条视频失败, 该视频位于第{}页第{}条报错原因是{}".format(page_index, index, e)
                     )
+        except Exception as e:
+            AliyunLogger.logging(
+                code="3000",
+                platform=self.platform,
+                mode=self.mode,
+                env=self.env,
+                message="抓取第{}页的时候失败, 报错原因是{}".format(page_index, e)
+            )
+        time.sleep(random.randint(5, 10))
+
+    def schedule(self):
+        """
+        先抓取推荐列表的视频, 等待 2 分钟后抓取 detail 页面,等待 5 分钟后,抓取账号视频
+        """
+        self.get_recommend_list()
 
 
 if __name__ == '__main__':