소스 검색

监测,搜索代码

罗俊辉 1 년 전
부모
커밋
edbd2f7651

+ 25 - 14
spider/ad_click/piaoquan_vlog.py

@@ -7,6 +7,7 @@ import json
 import random
 import time
 import requests
+import datetime
 
 sys.path.append(os.getcwd())
 
@@ -76,7 +77,8 @@ class PiaoQuanVlog(object):
             "abExpInfo": '{"ab_test004":[{"abExpCode":"126","configValue":""},{"abExpCode":"211","configValue":""}],"ab_test005":[],"ab_test006":[{"abExpCode":"310","configValue":""},{"abExpCode":"321","configValue":""},{"abExpCode":"331","configValue":""},{"abExpCode":"356","configValue":"{\\"playProgress\\": 10, \\"delayHide\\": 8}"},{"abExpCode":"371","configValue":"{\\"playIcon\\": \\"http://weapppiccdn.yishihui.com/wxicon/common/icon_play_btn_font.png?v=2\\", \\"width\\": 125}"}],"ab_test001":[{"abExpCode":"223","configValue":""},{"abExpCode":"201","configValue":""},{"abExpCode":"410","configValue":"{\\"layerStyle\\": 1, \\"oneDayShowCount\\": 3, \\"everyRecommendVideo\\": 5, \\"playProgress\\": 90, \\"closePosition\\": \\"top\\", \\"guideDialogText\\": [\\"您可以点击‘关注票圈公众号’\\", \\"每日最新资讯不错过\\"], \\"guideButtonText\\": \\"关注票圈公众号\\", \\"topImage\\": \\"https://weapppiccdn.yishihui.com/wxicon/common/img_cgi_image3.png\\", \\"jumpUrl\\": \\"https://mp.weixin.qq.com/s?__biz=MzIxMjg2MzE2Mg==&mid=2247483675&idx=1&sn=0338228015ba7a5b0a1937b14e610efc&chksm=97bed0cea0c959d81d90a5d9ce82502ca24fa418df70d6e619a88d4e007a8b14b2b3b3e62386#rd\\", \\"gzhId\\": 105}"}]}',
             "extParams": '{"eventIds":"22040202,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100,ab100","eventInfos":{"ab_test001":"ab100","ab_test002":"ab100","ab_test003":"ab100","ab_test004":"ab100","ab_test005":"ab100","ab_test006":"ab100","ab_test007":"ab100","ab_test008":"ab100","ab_test009":"ab100","ab_test010":"ab100"}}'
         }
-        basic_response = requests.request("POST", self.url, headers=self.headers, data=payload, proxies=tunnel_proxies()).json()
+        basic_response = requests.request("POST", self.url, headers=self.headers, data=payload,
+                                          proxies=tunnel_proxies()).json()
         self.process_video_list(basic_response['data'])
 
     def process_video_list(self, video_list):
@@ -96,6 +98,7 @@ class PiaoQuanVlog(object):
                 "shareCount": video_obj['shareCountFriend'],
                 "favorCount": video_obj['favoriteds']
             }
+            time.sleep(12)
             # print(json.dumps(video_item, ensure_ascii=False, indent=4))
             self.aliyun_log.logging(
                 code="7001",
@@ -105,22 +108,30 @@ class PiaoQuanVlog(object):
 
     def run(self):
         """
-        执行函数
+        Crawl in an endless loop; no new pass starts during minutes 0-14 of an hour. Author's stated target: 110 items per hour, i.e. 24 * 110 = 2640 items per day.
+        110 / 4 =~ 28, so each pass requests at most 28 pages (indices 1-28).
         :return: None
         """
         while True:
-            for index in range(1, 51):
-                try:
-                    self.send_request(index)
-                    # 随机休息 1 - 50  秒
-                    time.sleep(random.randint(1, 50))
-                except Exception as e:
-                    self.aliyun_log.logging(
-                        code="3000",
-                        message="扫描第{}页失败, 原因是{}".format(index, e)
-                    )
-            # 抓完 50 页后休息 10 分钟
-            time.sleep(60 * 10)
+            # Decide from the current wall-clock minute whether to wait or to crawl
+            current_time = datetime.datetime.now()
+            if 0 <= current_time.minute < 15:
+                # Seconds remaining until minute 15 of the current hour
+                wait_time = (15 - current_time.minute) * 60 - current_time.second
+                time.sleep(wait_time)
+            else:
+                # Author's intended pacing: about 96 s per page on average, i.e. 4 items per 96 s (24 s per item)
+                # 28 pages are requested per pass
+                for index in range(1, 29):
+                    try:
+                        self.send_request(index)
+                        # Sleep a random 40-56 seconds (bounds inclusive) before the next page
+                        time.sleep(random.randint(4 * 10, 4 * 14))
+                    except Exception as e:
+                        self.aliyun_log.logging(
+                            code="3000",
+                            message="扫描第{}页失败, 原因是{}".format(index, e)
+                        )
 
 
 if __name__ == '__main__':

+ 3 - 0
spider/crawler_author/__init__.py

@@ -0,0 +1,3 @@
+"""
+piaoquan账号爬虫
+"""

+ 0 - 0
spider/crawler_online/gongzhonghao_author.py → spider/crawler_author/gongzhonghao_author.py


+ 0 - 0
spider/crawler_online/xiaoniangao.py → spider/crawler_author/xiaoniangao.py


+ 0 - 0
spider/crawler_online/xigua_author.py → spider/crawler_author/xigua_author.py


+ 5 - 0
spider/crawler_online/jiajiezhufuxishiduoduo.py

@@ -0,0 +1,5 @@
+"""
+佳节祝福喜事多多——推荐爬虫
+@author: LuoJunhui
+"""
+

+ 74 - 0
spider/crawler_search/__init__.py

@@ -0,0 +1,74 @@
+import requests
+
+
+def search_user(keyword):
+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/search/user?'
+    data = {
+        "keyword": keyword,
+        "pcursor": "",
+        "ussid": ""
+    }
+    return requests.post(url, headers=headers, json=data).text
+
+
+def search_video(keyword):
+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/search/feed?'
+    data = {
+        "keyword": keyword,
+        "pcursor": "",
+        "ussid": "",
+        "pageSource": 1
+
+    }
+    return requests.post(url, headers=headers, json=data).text
+
+
+def video_info():
+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/photo/info?'
+    data = {
+        "kpn": "WECHAT_SMALL_APP",
+        "photoId": "5254293468891588895",
+        "authorId": "1346454001",
+        "usePrefetch": True,
+        "pageType": 1,
+        "pageSource": 3
+    }
+    return requests.post(url, headers=headers, json=data).text
+
+
+def video_comment():
+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/photo/comment/list?'
+    data = {
+        "photoId": "5254293468891588895",
+        "count": 20
+    }
+    return requests.post(url, headers=headers, json=data).text
+
+
+def user_profile():
+    url = 'https://wxmini-api.uyouqu.com/rest/wd/wechatApp/user/profile?'
+    data = {
+        "eid": "1084678836"
+    }
+    return requests.post(url, headers=headers, json=data).text
+
+
+user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat'
+referer = 'https://servicewechat.com/wx79a83b1a1e8a7978/591/page-frame.html'
+cookie = 'did=填入你的'
+
+headers = {
+    'Host': 'wxmini-api.uyouqu.com',
+    # 'referer':referer,
+    'User-Agent': user_agent,
+    'cookie': cookie
+}
+data = {
+    "keyword": "河南到底有多热",
+    "pcursor": "",
+    "ussid": "",
+    "pageSource": 1
+
+}
+
+print(search_video('lx'))

+ 228 - 0
spider/crawler_search/kuaishou_search.py

@@ -0,0 +1,228 @@
+"""
+快手搜索爬虫
+@Author: luojunhui
+"""
+import os
+import sys
+import json
+import time
+import uuid
+import random
+import datetime
+
+import requests
+from lxml import etree
+
+sys.path.append(os.getcwd())
+
+from application.items import VideoItem
+from application.pipeline import PiaoQuanPipeline
+from application.common.messageQueue import MQ
+from application.common.proxies import tunnel_proxies
+from application.common.log import AliyunLogger
+
+
+class KuaiShouSearch(object):
+    """
+    快手 Search
+    """
+
+    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
+        self.platform = platform
+        self.mode = mode
+        self.rule_dict = rule_dict
+        self.user_list = user_list
+        self.env = env
+        self.download_cnt = 0
+        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
+        self.expire_flag = False
+        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
+
+    def search_videos(self, keyword):
+        """
+        search, 一次搜索只抓 20 条视频
+        :param keyword: 关键词
+        :return: video_list
+        """
+        url = 'https://www.kuaishou.com/graphql'
+        headers = {
+            'Accept-Language': 'zh,en;q=0.9,zh-CN;q=0.8',
+            'Connection': 'keep-alive',
+            'Cookie': 'kpf=PC_WEB; clientid=3; did=web_5db53a9e49dca57728b58cecb7863868; didv=1698736264000; kpn=KUAISHOU_VISION',
+            'Origin': 'https://www.kuaishou.com',
+            'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E8%80%81%E5%B9%B4%E5%A4%A7%E5%AD%A6',
+            'Sec-Fetch-Dest': 'empty',
+            'Sec-Fetch-Mode': 'cors',
+            'Sec-Fetch-Site': 'same-origin',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
+            'accept': '*/*',
+            'content-type': 'application/json',
+            'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+        }
+        data = {
+            "operationName": "visionSearchPhoto",
+            "variables": {
+                "keyword": keyword,
+                "pcursor": "",
+                "page": "search"
+            },
+            "query": """
+            fragment photoContent on PhotoEntity {
+              __typename
+              id
+              duration
+              caption
+              originCaption
+              likeCount
+              viewCount
+              commentCount
+              realLikeCount
+              coverUrl
+              photoUrl
+              photoH265Url
+              manifest
+              manifestH265
+              videoResource
+              coverUrls {
+                url
+                __typename
+              }
+              timestamp
+              expTag
+              animatedCoverUrl
+              distance
+              videoRatio
+              liked
+              stereoType
+              profileUserTopPhoto
+              musicBlocked
+              riskTagContent
+              riskTagUrl
+            }
+
+            fragment recoPhotoFragment on recoPhotoEntity {
+              __typename
+              id
+              duration
+              caption
+              originCaption
+              likeCount
+              viewCount
+              commentCount
+              realLikeCount
+              coverUrl
+              photoUrl
+              photoH265Url
+              manifest
+              manifestH265
+              videoResource
+              coverUrls {
+                url
+                __typename
+              }
+              timestamp
+              expTag
+              animatedCoverUrl
+              distance
+              videoRatio
+              liked
+              stereoType
+              profileUserTopPhoto
+              musicBlocked
+              riskTagContent
+              riskTagUrl
+            }
+
+            fragment feedContent on Feed {
+              type
+              author {
+                id
+                name
+                headerUrl
+                following
+                headerUrls {
+                  url
+                  __typename
+                }
+                __typename
+              }
+              photo {
+                ...photoContent
+                ...recoPhotoFragment
+                __typename
+              }
+              canAddComment
+              llsid
+              status
+              currentPcursor
+              tags {
+                type
+                name
+                __typename
+              }
+              __typename
+            }
+
+            query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {
+              visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {
+                result
+                llsid
+                webPageArea
+                feeds {
+                  ...feedContent
+                  __typename
+                }
+                searchSessionId
+                pcursor
+                aladdinBanner {
+                  imgUrl
+                  link
+                  __typename
+                }
+                __typename
+              }
+            }
+            """
+        }
+        response = requests.post(url, headers=headers, json=data).json()
+        video_list = response['data']['visionSearchPhoto']['feeds']
+        return video_list
+
+    def process_video_obj(self, video_obj):
+        """
+        处理视频信息
+        :return:
+        """
+        # print(json.dumps(video_obj, ensure_ascii=False, indent=4))
+        trace_id = self.platform + str(uuid.uuid1())
+        our_user = random.choice(self.user_list)
+        publish_time_stamp = int(video_obj["photo"]["timestamp"] / 1000)
+
+        item = VideoItem()
+        item.add_video_info("user_id", our_user["uid"])
+        item.add_video_info("user_name", our_user["nick_name"])
+        item.add_video_info("video_id", video_obj["photo"]["manifest"]["videoId"])
+        item.add_video_info("video_title", video_obj["photo"]['caption'])
+        # item.add_video_info("publish_time_str", video_obj["photo"]['timestamp'])
+        item.add_video_info("publish_time_stamp", int(publish_time_stamp))
+        item.add_video_info("video_url", video_obj["photo"]['manifest']['adaptationSet'][0]['representation'][0]['url'])
+        item.add_video_info(
+            "cover_url", video_obj["photo"]["coverUrl"]
+        )
+        item.add_video_info("like_cnt", video_obj["photo"]["realLikeCount"])
+        item.add_video_info("play_cnt", video_obj["photo"]["viewCount"])
+        item.add_video_info("out_video_id", video_obj["photo"]["manifest"]["videoId"])
+        item.add_video_info("platform", self.platform)
+        item.add_video_info("strategy", self.mode)
+        item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
+        mq_obj = item.produce_item()
+        print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
+
+
+if __name__ == '__main__':
+    KS = KuaiShouSearch(platform="kuaishou", mode="search", rule_dict={}, user_list=[{"uid": 1, "nick_name": "ljh"}])
+    video_list = KS.search_videos("王者荣耀")
+    for i in video_list:
+        KS.process_video_obj(i)

+ 0 - 0
spider/crawler_online/xigua_search.py → spider/crawler_search/xigua_search.py