1 년 전 · edbd2f7651
--- a/spider/ad_click/piaoquan_vlog.py
+++ b/spider/ad_click/piaoquan_vlog.py
@@ -7,6 +7,7 @@ import json
 
				 import random
			
 
				 import time
			
 
				 import requests
			
 
				+import datetime
			
 
				 
			
 
				 sys.path.append(os.getcwd())
			
 
				 
			
@@ -76,7 +77,8 @@ class PiaoQuanVlog(object):
 
				             "abExpInfo": '{"ab_test004":[{"abExpCode":"126","configValue":""},{"abExpCode":"211","configValue":""}],"ab_test005":[],"ab_test006":[{"abExpCode":"310","configValue":""},{"abExpCode":"321","configValue":""},{"abExpCode":"331","configValue":""},{"abExpCode":"356","configValue":"{\\"playProgress\\": 10, \\"delayHide\\": 8}"},{"abExpCode":"371","configValue":"{\\"playIcon\\": \\"http://weapppiccdn.yishihui.com/wxicon/common/icon_play_btn_font.png?v=2\\", \\"width\\": 125}"}],"ab_test001":[{"abExpCode":"223","configValue":""},{"abExpCode":"201","configValue":""},{"abExpCode":"410","configValue":"{\\"layerStyle\\": 1, \\"oneDayShowCount\\": 3, \\"everyRecommendVideo\\": 5, \\"playProgress\\": 90, \\"closePosition\\": \\"top\\", \\"guideDialogText\\": [\\"您可以点击‘关注票圈公众号’\\", \\"每日最新资讯不错过\\"], \\"guideButtonText\\": \\"关注票圈公众号\\", \\"topImage\\": \\"https://weapppiccdn.yishihui.com/wxicon/common/img_cgi_image3.png\\", \\"jumpUrl\\": \\"https://mp.weixin.qq.com/s?__biz=MzIxMjg2MzE2Mg==&mid=2247483675&idx=1&sn=0338228015ba7a5b0a1937b14e610efc&chksm=97bed0cea0c959d81d90a5d9ce82502ca24fa418df70d6e619a88d4e007a8b14b2b3b3e62386#rd\\", \\"gzhId\\": 105}"}]}',
			
 
				             "extParams": '{"eventIds":"22040202,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100","eventInfos":{"ab_test001":"ab100","ab_test002":"ab100","ab_test003":"ab100","ab_test004":"ab100","ab_test005":"ab100","ab_test006":"ab100","ab_test007":"ab100","ab_test008":"ab100","ab_test009":"ab100","ab_test010":"ab100"}}'
			
 
				         }
			
 
				-        basic_response = requests.request("POST", self.url, headers=self.headers, data=payload, proxies=tunnel_proxies()).json()
			
 
				+        basic_response = requests.request("POST", self.url, headers=self.headers, data=payload,
			
 
				+                                          proxies=tunnel_proxies()).json()
			
 
				         self.process_video_list(basic_response['data'])
			
 
				 
			
 
				     def process_video_list(self, video_list):
			
@@ -96,6 +98,7 @@ class PiaoQuanVlog(object):
 
				                 "shareCount": video_obj['shareCountFriend'],
			
 
				                 "favorCount": video_obj['favoriteds']
			
 
				             }
			
 
				+            time.sleep(12)
			
 
				             # print(json.dumps(video_item, ensure_ascii=False, indent=4))
			
 
				             self.aliyun_log.logging(
			
 
				                 code="7001",
			
@@ -105,22 +108,30 @@ class PiaoQuanVlog(object):
 
				 
			
 
				     def run(self):
			
 
				         """
			
 
				-        执行函数
			
 
				+        一天抓取 24h， 每个小时的 0-15min 不抓取，每一个小时抓取条数为 110，每天抓取条数为 24 * 110 = 2640  条
			
 
				+        110 / 4 =～ 28， 每一小时大抓取 28 页
			
 
				         :return: None
			
 
				         """
			
 
				         while True:
			
 
				-            for index in range(1, 51):
			
 
				-                try:
			
 
				-                    self.send_request(index)
			
 
				-                    # 随机休息 1 - 50  秒
			
 
				-                    time.sleep(random.randint(1, 50))
			
 
				-                except Exception as e:
			
 
				-                    self.aliyun_log.logging(
			
 
				-                        code="3000",
			
 
				-                        message="扫描第{}页失败， 原因是{}".format(index, e)
			
 
				-                    )
			
 
				-            # 抓完 50 页后休息 10 分钟
			
 
				-            time.sleep(60 * 10)
			
 
				+            # 每一小时执行一次
			
 
				+            current_time = datetime.datetime.now()
			
 
				+            if 0 <= current_time.minute < 15:
			
 
				+                # 计算需要等待的秒数，直到15分钟过去
			
 
				+                wait_time = (15 - current_time.minute) * 60 - current_time.second
			
 
				+                time.sleep(wait_time)
			
 
				+            else:
			
 
				+                # 平均 96 秒抓一页，即 96秒抓 4 条，每条视频之间等待时间是 24s
			
 
				+                # 一共抓取 28 页
			
 
				+                for index in range(1, 29):
			
 
				+                    try:
			
 
				+                        self.send_request(index)
			
 
				+                        # 随机休息 1 - 50  秒
			
 
				+                        time.sleep(random.randint(4 * 10, 4 * 14))
			
 
				+                    except Exception as e:
			
 
				+                        self.aliyun_log.logging(
			
 
				+                            code="3000",
			
 
				+                            message="扫描第{}页失败， 原因是{}".format(index, e)
			
 
				+                        )
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
--- a/spider/crawler_author/__init__.py
+++ b/spider/crawler_author/__init__.py
@@ -0,0 +1,3 @@
 
				+"""
			
 
				+piaoquan账号爬虫
			
 
				+"""
			
--- a/spider/crawler_author/gongzhonghao_author.py
+++ b/spider/crawler_author/gongzhonghao_author.py
--- a/spider/crawler_author/xiaoniangao.py
+++ b/spider/crawler_author/xiaoniangao.py
--- a/spider/crawler_author/xigua_author.py
+++ b/spider/crawler_author/xigua_author.py
--- a/spider/crawler_online/jiajiezhufuxishiduoduo.py
+++ b/spider/crawler_online/jiajiezhufuxishiduoduo.py
@@ -0,0 +1,5 @@
 
				+"""
			
 
				+佳节祝福喜事多多——推荐爬虫
			
 
				+@author: LuoJunhui
			
 
				+"""
			
 
				+
			
--- a/spider/crawler_search/__init__.py
+++ b/spider/crawler_search/__init__.py
@@ -0,0 +1,74 @@
 
				+import requests
			
 
				+
			
 
				+
			
 
				+def search_user(keyword):
			
 
				+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/search/user?'
			
 
				+    data = {
			
 
				+        "keyword": keyword,
			
 
				+        "pcursor": "",
			
 
				+        "ussid": ""
			
 
				+    }
			
 
				+    return requests.post(url, headers=headers, json=data).text
			
 
				+
			
 
				+
			
 
				+def search_video(keyword):
			
 
				+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/search/feed?'
			
 
				+    data = {
			
 
				+        "keyword": keyword,
			
 
				+        "pcursor": "",
			
 
				+        "ussid": "",
			
 
				+        "pageSource": 1
			
 
				+
			
 
				+    }
			
 
				+    return requests.post(url, headers=headers, json=data).text
			
 
				+
			
 
				+
			
 
				+def video_info():
			
 
				+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/photo/info?'
			
 
				+    data = {
			
 
				+        "kpn": "WECHAT_SMALL_APP",
			
 
				+        "photoId": "5254293468891588895",
			
 
				+        "authorId": "1346454001",
			
 
				+        "usePrefetch": True,
			
 
				+        "pageType": 1,
			
 
				+        "pageSource": 3
			
 
				+    }
			
 
				+    return requests.post(url, headers=headers, json=data).text
			
 
				+
			
 
				+
			
 
				+def video_comment():
			
 
				+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/photo/comment/list?'
			
 
				+    data = {
			
 
				+        "photoId": "5254293468891588895",
			
 
				+        "count": 20
			
 
				+    }
			
 
				+    return requests.post(url, headers=headers, json=data).text
			
 
				+
			
 
				+
			
 
				+def user_profile():
			
 
				+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/user/profile?'
			
 
				+    data = {
			
 
				+        "eid": "1084678836"
			
 
				+    }
			
 
				+    return requests.post(url, headers=headers, json=data).text
			
 
				+
			
 
				+
			
 
				+user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat'
			
 
				+referer = 'https://servicewechat.com/wx79a83b1a1e8a7978/591/page-frame.html'
			
 
				+cookie = 'did=填入你的'
			
 
				+
			
 
				+headers = {
			
 
				+    'Host': 'wxmini-api.uyouqu.com',
			
 
				+    # 'referer':referer,
			
 
				+    'User-Agent': user_agent,
			
 
				+    'cookie': cookie
			
 
				+}
			
 
				+data = {
			
 
				+    "keyword": "河南到底有多热",
			
 
				+    "pcursor": "",
			
 
				+    "ussid": "",
			
 
				+    "pageSource": 1
			
 
				+
			
 
				+}
			
 
				+
			
 
				+print(search_video('lx'))
			
--- a/spider/crawler_search/kuaishou_search.py
+++ b/spider/crawler_search/kuaishou_search.py
@@ -0,0 +1,228 @@
 
				+"""
			
 
				+快手搜索爬虫
			
 
				+@Author: luojunhui
			
 
				+"""
			
 
				+import os
			
 
				+import sys
			
 
				+import json
			
 
				+import time
			
 
				+import uuid
			
 
				+import random
			
 
				+import datetime
			
 
				+
			
 
				+import requests
			
 
				+from lxml import etree
			
 
				+
			
 
				+sys.path.append(os.getcwd())
			
 
				+
			
 
				+from application.items import VideoItem
			
 
				+from application.pipeline import PiaoQuanPipeline
			
 
				+from application.common.messageQueue import MQ
			
 
				+from application.common.proxies import tunnel_proxies
			
 
				+from application.common.log import AliyunLogger
			
 
				+
			
 
				+
			
 
				+class KuaiShouSearch(object):
			
 
				+    """
			
 
				+    快手 Search
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
			
 
				+        self.platform = platform
			
 
				+        self.mode = mode
			
 
				+        self.rule_dict = rule_dict
			
 
				+        self.user_list = user_list
			
 
				+        self.env = env
			
 
				+        self.download_cnt = 0
			
 
				+        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
			
 
				+        self.expire_flag = False
			
 
				+        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
			
 
				+
			
 
				+    def search_videos(self, keyword):
			
 
				+        """
			
 
				+        search, 一次搜索只抓 20 条视频
			
 
				+        :param keyword: 关键词
			
 
				+        :return: video_list
			
 
				+        """
			
 
				+        url = 'https://www.kuaishou.com/graphql'
			
 
				+        headers = {
			
 
				+            'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
			
 
				+            'Connection': 'keep-alive',
			
 
				+            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
			
 
				+            'Origin': 'https://www.kuaishou.com',
			
 
				+            'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
			
 
				+            'Sec-Fetch-Dest': 'empty',
			
 
				+            'Sec-Fetch-Mode': 'cors',
			
 
				+            'Sec-Fetch-Site': 'same-origin',
			
 
				+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
			
 
				+            'accept': '*/*',
			
 
				+            'content-type': 'application/json',
			
 
				+            'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
			
 
				+            'sec-ch-ua-mobile': '?0',
			
 
				+            'sec-ch-ua-platform': '"macOS"',
			
 
				+        }
			
 
				+        data = {
			
 
				+            "operationName": "visionSearchPhoto",
			
 
				+            "variables": {
			
 
				+                "keyword": keyword,
			
 
				+                "pcursor": "",
			
 
				+                "page": "search"
			
 
				+            },
			
 
				+            "query": """
			
 
				+            fragment photoContent on PhotoEntity {
			
 
				+              __typename
			
 
				+              id
			
 
				+              duration
			
 
				+              caption
			
 
				+              originCaption
			
 
				+              likeCount
			
 
				+              viewCount
			
 
				+              commentCount
			
 
				+              realLikeCount
			
 
				+              coverUrl
			
 
				+              photoUrl
			
 
				+              photoH265Url
			
 
				+              manifest
			
 
				+              manifestH265
			
 
				+              videoResource
			
 
				+              coverUrls {
			
 
				+                url
			
 
				+                __typename
			
 
				+              }
			
 
				+              timestamp
			
 
				+              expTag
			
 
				+              animatedCoverUrl
			
 
				+              distance
			
 
				+              videoRatio
			
 
				+              liked
			
 
				+              stereoType
			
 
				+              profileUserTopPhoto
			
 
				+              musicBlocked
			
 
				+              riskTagContent
			
 
				+              riskTagUrl
			
 
				+            }
			
 
				+
			
 
				+            fragment recoPhotoFragment on recoPhotoEntity {
			
 
				+              __typename
			
 
				+              id
			
 
				+              duration
			
 
				+              caption
			
 
				+              originCaption
			
 
				+              likeCount
			
 
				+              viewCount
			
 
				+              commentCount
			
 
				+              realLikeCount
			
 
				+              coverUrl
			
 
				+              photoUrl
			
 
				+              photoH265Url
			
 
				+              manifest
			
 
				+              manifestH265
			
 
				+              videoResource
			
 
				+              coverUrls {
			
 
				+                url
			
 
				+                __typename
			
 
				+              }
			
 
				+              timestamp
			
 
				+              expTag
			
 
				+              animatedCoverUrl
			
 
				+              distance
			
 
				+              videoRatio
			
 
				+              liked
			
 
				+              stereoType
			
 
				+              profileUserTopPhoto
			
 
				+              musicBlocked
			
 
				+              riskTagContent
			
 
				+              riskTagUrl
			
 
				+            }
			
 
				+
			
 
				+            fragment feedContent on Feed {
			
 
				+              type
			
 
				+              author {
			
 
				+                id
			
 
				+                name
			
 
				+                headerUrl
			
 
				+                following
			
 
				+                headerUrls {
			
 
				+                  url
			
 
				+                  __typename
			
 
				+                }
			
 
				+                __typename
			
 
				+              }
			
 
				+              photo {
			
 
				+                ...photoContent
			
 
				+                ...recoPhotoFragment
			
 
				+                __typename
			
 
				+              }
			
 
				+              canAddComment
			
 
				+              llsid
			
 
				+              status
			
 
				+              currentPcursor
			
 
				+              tags {
			
 
				+                type
			
 
				+                name
			
 
				+                __typename
			
 
				+              }
			
 
				+              __typename
			
 
				+            }
			
 
				+
			
 
				+            query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
			
 
				+              visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
			
 
				+                result
			
 
				+                llsid
			
 
				+                webPageArea
			
 
				+                feeds {
			
 
				+                  ...feedContent
			
 
				+                  __typename
			
 
				+                }
			
 
				+                searchSessionId
			
 
				+                pcursor
			
 
				+                aladdinBanner {
			
 
				+                  imgUrl
			
 
				+                  link
			
 
				+                  __typename
			
 
				+                }
			
 
				+                __typename
			
 
				+              }
			
 
				+            }
			
 
				+            """
			
 
				+        }
			
 
				+        response = requests.post(url, headers=headers, json=data).json()
			
 
				+        video_list = response['data']['visionSearchPhoto']['feeds']
			
 
				+        return video_list
			
 
				+
			
 
				+    def process_video_obj(self, video_obj):
			
 
				+        """
			
 
				+        处理视频信息
			
 
				+        :return:
			
 
				+        """
			
 
				+        # print(json.dumps(video_obj, ensure_ascii=False, indent=4))
			
 
				+        trace_id = self.platform + str(uuid.uuid1())
			
 
				+        our_user = random.choice(self.user_list)
			
 
				+        publish_time_stamp = int(video_obj["photo"]["timestamp"] / 1000)
			
 
				+
			
 
				+        item = VideoItem()
			
 
				+        item.add_video_info("user_id", our_user["uid"])
			
 
				+        item.add_video_info("user_name", our_user["nick_name"])
			
 
				+        item.add_video_info("video_id", video_obj["photo"]["manifest"]["videoId"])
			
 
				+        item.add_video_info("video_title", video_obj["photo"]['caption'])
			
 
				+        # item.add_video_info("publish_time_str", video_obj["photo"]['timestamp'])
			
 
				+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
			
 
				+        item.add_video_info("video_url", video_obj["photo"]['manifest']['adaptationSet'][0]['representation'][0]['url'])
			
 
				+        item.add_video_info(
			
 
				+            "cover_url", video_obj["photo"]["coverUrl"]
			
 
				+        )
			
 
				+        item.add_video_info("like_cnt", video_obj["photo"]["realLikeCount"])
			
 
				+        item.add_video_info("play_cnt", video_obj["photo"]["viewCount"])
			
 
				+        item.add_video_info("out_video_id", video_obj["photo"]["manifest"]["videoId"])
			
 
				+        item.add_video_info("platform", self.platform)
			
 
				+        item.add_video_info("strategy", self.mode)
			
 
				+        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
			
 
				+        mq_obj = item.produce_item()
			
 
				+        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    KS = KuaiShouSearch(platform="kuaishou", mode="search", rule_dict={}, user_list=[{"uid": 1, "nick_name": "ljh"}])
			
 
				+    video_list = KS.search_videos("王者荣耀")
			
 
				+    for i in video_list:
			
 
				+        KS.process_video_obj(i)
			
--- a/spider/crawler_search/xigua_search.py
+++ b/spider/crawler_search/xigua_search.py