wangkun 2 years ago
parent
commit
7a636d7b26

BIN
.DS_Store


+ 1 - 0
README.MD

@@ -20,6 +20,7 @@ ${nohup_dir}:       nohup日志存储路径,如: ./youtube/nohup.log
 ```
 youtube定向榜运行命令: 
 sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
+sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="dev" --machine="aliyun_hk" youtube/nohup.log
 youtube定向榜杀进程命令: 
 ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep run_youtube | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9

+ 5 - 3
common/common.py

@@ -72,7 +72,7 @@ class Common:
         else:
             for file in all_logs[:len(all_logs) - 10]:
                 os.remove(log_dir + file)
-        cls.logger(log_type, crawler).info("清除日志成功")
+        cls.logger(log_type, crawler).info("清除日志成功\n")
 
     # 删除 charles 缓存文件,只保留最近的两个文件
     @classmethod
@@ -81,7 +81,7 @@ class Common:
         all_file = sorted(os.listdir(f"./{crawler}/{crawler}_chlsfiles/"))
         for file in all_file[0:-3]:
             os.remove(f"./{crawler}/{crawler}_chlsfiles/{file}")
-        cls.logger(log_type, crawler).info("删除 charles 缓存文件成功")
+        cls.logger(log_type, crawler).info("删除 charles 缓存文件成功\n")
 
     # 保存视频信息至 "./videos/{video_dict['video_title}/info.txt"
     @classmethod
@@ -145,7 +145,9 @@ class Common:
             # 视频名
             video_name = "video.mp4"
             try:
-                download_cmd = f"yt-dlp -f 'bv[height=720][ext=mp4]+ba[ext=m4a]' --merge-output-format mp4 {video_url} -o {video_name}"
+                download_cmd = f'yt-dlp -f "bv[height=720][ext=mp4]+ba[ext=m4a]" --merge-output-format mp4 {video_url} -o {video_name}'
+                # e.g. 'yt-dlp -f "bv[height=720][ext=mp4]+ba[ext=m4a]" --merge-output-format mp4 https://www.youtube.com/watch?v=Q4MtXQY0aHM -o video.mp4'
+                # yt-dlp upgrades are handled by main.sh (`pip3 install yt-dlp -U`); fusing -U onto the URL corrupts the link
+                Common.logger(log_type, crawler).info(f"download_cmd:{download_cmd}")
                 os.system(download_cmd)
                 move_cmd = f"mv {video_name} {video_dir}"
                 os.system(move_cmd)

+ 3 - 1
common/publish.py

@@ -173,7 +173,7 @@ class Publish:
         if env == 'dev':
             uids_dev = [6267140, 6267141]
             return random.choice(uids_dev)
-        elif crawler == 'kanyikan' and env == 'prod' and strategy == 'kanyikan_moment':
+        elif crawler == 'kanyikan':
             uids_prod_kanyikan_moment = [20631208, 20631209, 20631210, 20631211, 20631212,
                                           20631213, 20631214, 20631215, 20631216, 20631217,
                                           20631223, 20631224, 20631225, 20631226, 20631227]
@@ -232,6 +232,8 @@ class Publish:
             return 'GONGZHONGHAO_XINXIN'
         elif crawler == 'weixinzhishu':
             return 'WEIXINZHISHU'
+        else:
+            return "CRAWLER"
 
     @classmethod
     def local_file_path(cls, crawler):

+ 10 - 4
kanyikan/kanyikan_main/run_kanyikan_moment.py

@@ -15,13 +15,13 @@ from common.feishu import Feishu
 from kanyikan.kanyikan_moment.kanyikan_moment import Moment
 
 
-def main(log_type, crawler, strategy, our_uid, env, oss_endpoint):
+def main(log_type, crawler, strategy, oss_endpoint, env, machine):
     """
     主函数入口
     :param log_type: 日志命名: moment
     :param crawler: 哪款爬虫: kanyikan
     :param strategy: 爬虫策略: kanyikan_moment
-    :param our_uid: 站内 UID: kanyikan_moment
+    :param machine: 爬虫运行机器,阿里云服务器: aliyun_hk / aliyun / macpro / macair / local
     :param env: 正式环境: prod;测试环境: dev
     :param oss_endpoint: 阿里云102服务器: inner ;其它: out
     :return: None
@@ -33,7 +33,7 @@ def main(log_type, crawler, strategy, our_uid, env, oss_endpoint):
             moment_video_list = Feishu.get_sheet_content(log_type, crawler, 'iK58HX')
             for moment_video_id in moment_video_list:
                 Common.logger(log_type, crawler).info(f"开始抓取{moment_video_id}朋友圈推荐视频\n")
-                Moment.get_videos(log_type, crawler, strategy, our_uid, env, oss_endpoint, moment_video_id)
+                Moment.get_videos(log_type, crawler, strategy, oss_endpoint, env, machine, moment_video_id)
 
             Common.del_logs(log_type, crawler)
             Common.logger(log_type, crawler).info("抓取完一轮,休眠 10 秒\n")
@@ -48,6 +48,12 @@ if __name__ == "__main__":
     parser.add_argument('--our_uid')  ## 添加参数
     parser.add_argument('--oss_endpoint')  ## 添加参数
     parser.add_argument('--env')  ## 添加参数
+    parser.add_argument('--machine')  ## 添加参数
     args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
     # print(args)
-    main(args.log_type, args.crawler, args.strategy, args.our_uid, args.env, args.oss_endpoint)
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         strategy=args.strategy,
+         oss_endpoint=args.oss_endpoint,
+         env=args.env,
+         machine=args.machine)

+ 9 - 4
kanyikan/kanyikan_moment/kanyikan_moment.py

@@ -37,7 +37,7 @@ class Moment:
 
     # 获取推荐视频列表
     @classmethod
-    def get_videos(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint, moment_video_id):
+    def get_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine, moment_video_id):
         url = "https://search.weixin.qq.com/cgi-bin/recwxa/snsgetvideoinfo?"
         headers = {
             "content-type": "application/json",
@@ -207,13 +207,13 @@ class Moment:
                     elif video_id in [j for m in Feishu.get_values_batch(log_type, crawler, "20ce0c") for j in m]:
                         Common.logger(log_type, crawler).info("视频已下载\n")
                     else:
-                        cls.download_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint, video_dict)
+                        cls.download_publish(log_type, crawler, strategy, oss_endpoint, env, video_dict)
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
 
     # 下载/上传视频
     @classmethod
-    def download_publish(cls, log_type, crawler, strategy, our_uid, env, oss_endpoint, video_dict):
+    def download_publish(cls, log_type, crawler, strategy, oss_endpoint, env, video_dict):
         try:
             # 过滤空行及空标题视频
             if video_dict['video_id'] == 0 \
@@ -241,7 +241,12 @@ class Moment:
 
                 # 上传视频
                 Common.logger(log_type, crawler).info(f"开始上传视频:{video_dict['video_title']}")
-                our_video_id = Publish.upload_and_publish(log_type, crawler, strategy, our_uid, env, oss_endpoint)
+                our_video_id = Publish.upload_and_publish(log_type=log_type,
+                                                          crawler=crawler,
+                                                          strategy=strategy,
+                                                          oss_endpoint=oss_endpoint,
+                                                          our_uid="kanyikan_moment",
+                                                          env=env)
                 if env == 'dev':
                     our_video_link = f"https://testadmin.piaoquantv.com/cms/post-detail/{our_video_id}/info"
                 else:

+ 2 - 1
main/main.sh

@@ -48,7 +48,8 @@ ps aux | grep ${grep_str} | grep Python | grep -v grep | awk '{print $2}' | xarg
 echo "$(date "+%Y-%m-%d %H:%M:%S") 进程已杀死!"
 
 if [ ${machine} = "--machine=aliyun_hk" ];then
-  echo "无需更新代码"
+  echo "升级yt-dlp"
+  pip3 install yt-dlp -U
 else
   echo "$(date "+%Y-%m-%d %H:%M:%S") 正在更新代码..."
   cd ${piaoquan_crawler_dir} && git pull origin master --force && rm -f ${piaoquan_crawler_dir}main/nohup.log && rm -f ${piaoquan_crawler_dir}${nohup_dir}

+ 2 - 1
requirements.txt

@@ -8,4 +8,5 @@ requests==2.27.1
 selenium~=4.2.0
 urllib3==1.26.9
 emoji~=2.2.0
-Appium-Python-Client~=2.8.1
+Appium-Python-Client~=2.8.1
+atomac~=1.2.0

BIN
weixinzhishu/.DS_Store


BIN
weixinzhishu/logs/.DS_Store


+ 250 - 0
weixinzhishu/weixinzhishu_main/get_weixinzhishu.py

@@ -0,0 +1,250 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/10
+import os
+import sys
+import time
+from datetime import date, timedelta
+import requests
+import json
+sys.path.append(os.getcwd())
+from common.feishu import Feishu
+from common.common import Common
+
+
+class Weixinzhishu:
+    pageNum = 1
+
+    # 获取微信 key / openid
+    @classmethod
+    def get_wechat_key(cls, log_type, crawler):
+        """
+        获取微信 key / openid
+        https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k
+        :param log_type: 日志名
+        :param crawler: 哪款爬虫,填写:weixinzhishu
+        :return: search_key, openid
+        """
+        try:
+            sheet = Feishu.get_values_batch(log_type, crawler, 'sVL74k')
+            # the latest search_key / openid live in the sheet's second row;
+            # the original for-loop returned on its first pass anyway
+            search_key = sheet[1][1]
+            openid = sheet[1][2]
+            return search_key, openid
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"wechat_key:{e}\n")
+
+    # 获取热词
+    @classmethod
+    def get_word(cls, log_type, crawler, host):
+        try:
+            url = '/hot/word/getAllWords'
+            params = {
+                'pageNum': cls.pageNum,  # 第几页,默认1,int
+                'pageSize': 100  # 请求条目数,默认为100,int
+            }
+            response = requests.post(url=host+url, json=params)
+            cls.pageNum += 1
+            if response.status_code != 200:
+                Common.logger(log_type, crawler).warning(f"get_word_response:{response.text}\n")
+            elif response.json()['message'] != "success":
+                Common.logger(log_type, crawler).warning(f"get_word_response:{response.json()}\n")
+            else:
+                word_list = response.json()['data']['words']
+                return word_list
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_word:{e}\n")
+
+    # 获取热词分数
+    @classmethod
+    def get_word_score(cls, log_type, crawler, word_id, word):
+        """
+        获取热词分数
+        :param log_type: 日志名
+        :param crawler: 哪款爬虫,填写:weixinzhishu
+        :param word_id: 热词 ID
+        :param word: 热词
+        :return: 热词 7 天指数,例如:
+        {'id': 1,
+        'word': '消息',
+        'wechatScores': [
+        {'score': 95521022, 'scoreDate': '2023-02-07'},
+        {'score': 97315283, 'scoreDate': '2023-02-08'},
+        {'score': 109845849, 'scoreDate': '2023-02-09'},
+        {'score': 107089560, 'scoreDate': '2023-02-10'},
+        {'score': 102658391, 'scoreDate': '2023-02-11'},
+        {'score': 93843701, 'scoreDate': '2023-02-12'},
+        {'score': 100211894, 'scoreDate': '2023-02-13'}]}
+        """
+        try:
+            while True:
+                wechat_key = cls.get_wechat_key(log_type, crawler)
+                search_key = wechat_key[0]
+                openid = wechat_key[-1]
+                start_ymd = (date.today() + timedelta(days=-7)).strftime("%Y%m%d")
+                end_ymd = (date.today() + timedelta(days=0)).strftime("%Y%m%d")
+                url = "https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex"
+                payload = json.dumps({
+                    "openid": openid,
+                    "search_key": search_key,
+                    "cgi_name": "GetDefaultIndex",
+                    "start_ymd": start_ymd,
+                    "end_ymd": end_ymd,
+                    "query": word
+                })
+                headers = {
+                    'Host': 'search.weixin.qq.com',
+                    'content-type': 'application/json',
+                    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.32(0x1800202a) NetType/WIFI Language/zh_CN',
+                    'Referer': 'https://servicewechat.com/wxc026e7662ec26a3a/42/page-frame.html'
+                }
+                response = requests.request("POST", url=url, headers=headers, data=payload)
+                wechat_score_list = []
+                word_wechat_score_dict = {
+                    "id": word_id,
+                    "word": word,
+                    "wechatScores": wechat_score_list,
+                }
+                if response.json()['code'] == -10000:
+                    Common.logger(log_type, crawler).warning(f"response:{response.json()['msg']} 休眠 10 秒,重新获取")
+                    time.sleep(10)
+                elif response.json()['code'] == -10002:
+                    Common.logger(log_type, crawler).info(f'{word}:该词暂未收录')
+                    # # 数据写入飞书
+                    # now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+                    # values = [[now, word, "该词暂未收录"]]
+                    # Feishu.insert_columns(log_type, crawler, "5011a2", "ROWS", 1, 2)
+                    # time.sleep(0.5)
+                    # Feishu.update_values(log_type, crawler, "5011a2", "F2:Z2", values)
+                    # Common.logger(log_type, crawler).info(f'热词"{word}"微信指数数据写入飞书成功\n')
+                    return word_wechat_score_dict
+                elif response.json()['code'] != 0:
+                    Common.logger(log_type, crawler).info(f'response:{response.text}\n')
+                    return word_wechat_score_dict
+                else:
+                    time_index = response.json()['content']['resp_list'][0]['indexes'][0]['time_indexes']
+                    for i in range(len(time_index)):
+                        score_time = time_index[i]['time']
+                        score_time_str = f"{str(score_time)[:4]}-{str(score_time)[4:6]}-{str(score_time)[6:]}"
+                        score = time_index[i]['score']
+                        wechat_score_dict = {"score": score, "scoreDate": score_time_str}
+                        wechat_score_list.append(wechat_score_dict)
+                        # # 数据写入飞书
+                        # now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
+                        # values = [[now, word, score_time_str, score]]
+                        # Feishu.insert_columns(log_type, crawler, "5011a2", "ROWS", 1, 2)
+                        # time.sleep(0.5)
+                        # Feishu.update_values(log_type, crawler, "5011a2", "F2:Z2", values)
+                        # Common.logger(log_type, crawler).info(f'热词"{word}"微信指数数据写入飞书成功\n')
+                    return word_wechat_score_dict
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_word_score异常:{e}\n")
+
+    # 获取微信指数
+    @classmethod
+    def get_wechat_score(cls, log_type, crawler, host):
+        """
+        获取微信指数
+        :param log_type: 日志名
+        :param crawler: 哪款爬虫
+        :param host: 域名
+        :return: 热词指数列表
+        """
+        while True:
+            word_list = cls.get_word(log_type, crawler, host)
+            if word_list is None or len(word_list) == 0:  # get_word returns None on error
+                Common.logger(log_type, crawler).info(f"热词更新完毕\n")
+                cls.pageNum = 1
+                return
+            else:
+                wechat_score_data = []
+                Common.logger(log_type, crawler).info(f"len(word_list):{len(word_list)}")
+                for i in range(len(word_list)):
+                    word_id = word_list[i]['id']
+                    word = word_list[i]['word']
+                    Common.logger(log_type, crawler).info(f"word_id:{word_id}")
+                    Common.logger(log_type, crawler).info(f"word:{word}")
+                    word_score_dict = cls.get_word_score(log_type, crawler, word_id, word)
+                    Common.logger(log_type, crawler).info(f"word_score_dict:{word_score_dict}\n")
+                    wechat_score_data.append(word_score_dict)
+                Common.logger(log_type, crawler).info(f"wechat_score_data:{wechat_score_data}\n")
+                cls.update_wechat_score(log_type, crawler, wechat_score_data, host)
+
+    # 更新微信指数
+    @classmethod
+    def update_wechat_score(cls, log_type, crawler, data, host):
+        """
+        更新热词微信指数
+        :param log_type: 日志名
+        :param crawler: 哪款爬虫
+        :param data: 热词微信指数
+        :param host: 域名
+        :return: {"code":200, "message":"success"}
+        """
+        try:
+            url = '/hot/word/updateWechatScore'
+            params = {'data': data}
+            response = requests.post(url=host+url, json=params)
+            if response.status_code != 200:
+                Common.logger(log_type, crawler).warning(f"update_wechat_score_response:{response.text}\n")
+            elif response.json()["message"] != "success":
+                Common.logger(log_type, crawler).warning(f"update_wechat_score_response:{response.json()}\n")
+            else:
+                Common.logger(log_type, crawler).info(f"更新热词微信指数:{response.json()['message']}\n")
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"update_wechat_score:{e}\n")
+
+    @classmethod
+    def get_score_test(cls, log_type, crawler, word_id, word):
+        wechat_key = cls.get_wechat_key(log_type, crawler)
+        search_key = wechat_key[0]
+        openid = wechat_key[-1]
+        end_ymd = (date.today() + timedelta(days=0)).strftime("%Y%m%d")
+        start_ymd = (date.today() + timedelta(days=-7)).strftime("%Y%m%d")
+        url = "https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex"
+        payload = json.dumps({
+            "openid": openid,
+            "search_key": search_key,
+            "cgi_name": "GetDefaultIndex",
+            "start_ymd": start_ymd,
+            "end_ymd": end_ymd,
+            "query": word
+        })
+        headers = {
+            'Host': 'search.weixin.qq.com',
+            'content-type': 'application/json',
+            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.32(0x1800202a) NetType/WIFI Language/zh_CN',
+            'Referer': 'https://servicewechat.com/wxc026e7662ec26a3a/42/page-frame.html'
+        }
+        response = requests.request("POST", url, headers=headers, data=payload)
+        wechat_score_list = []
+        word_wechat_score_dict = {
+            "id": word_id,
+            "word": word,
+            "wechatScores": wechat_score_list,
+        }
+        if response.json()['code'] == -10000:
+            print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))} response:{response.json()['msg']} 休眠 10 秒,重新获取")
+            time.sleep(10)
+            cls.get_score_test(log_type, crawler, word_id, word)
+        elif response.json()['code'] == -10002:
+            print("该词暂未收录")
+            print(f"{word_wechat_score_dict}")
+        elif response.json()['code'] != 0:
+            print(f"{word_wechat_score_dict}")
+        else:
+            time_index = response.json()['content']['resp_list'][0]['indexes'][0]['time_indexes']
+            for i in range(len(time_index)):
+                score_time = time_index[i]['time']
+                score_time_str = f"{str(score_time)[:4]}-{str(score_time)[4:6]}-{str(score_time)[6:]}"
+                score = time_index[i]['score']
+                wechat_score_dict = {"score": score, "scoreDate": score_time_str}
+                wechat_score_list.append(wechat_score_dict)
+                print(f"wechat_score_dict:{wechat_score_dict}")
+            print(word_wechat_score_dict)
+
+
+if __name__ == "__main__":
+    Weixinzhishu.get_score_test('weixin', 'weixinzhishu', 1, "社保")
+
+    pass

+ 10 - 6
weixinzhishu/weixinzhishu_main/run_weixinzhishu.py

@@ -1,19 +1,24 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/2/13
-import argparse
+# import argparse
 import os
 import sys
 sys.path.append(os.getcwd())
 from common.common import Common
-from weixinzhishu.weixinzhishu_main.weixinzhishu import Weixinzhishu
+from weixinzhishu.weixinzhishu_main.get_weixinzhishu import Weixinzhishu
 
 
 class Main:
     @classmethod
-    def main(cls, log_type, crawler):
+    def main(cls, log_type, crawler, env):
+        if env == "dev":
+            host = 'http://testhot-words-internal.piaoquantv.com'
+        else:
+            host = 'http://hot-words-internal.piaoquantv.com'
         Common.logger(log_type, crawler).info("开始抓取微信指数\n")
-        Weixinzhishu.update_wechat_score(log_type, crawler)
+        Weixinzhishu.get_wechat_score(log_type, crawler, host)
+        Common.del_logs(log_type, crawler)
 
 
 if __name__ == "__main__":
@@ -28,5 +33,4 @@ if __name__ == "__main__":
     # args = parser.parse_args()  ### 参数赋值,也可以通过终端赋值
     # # print(args)
     # Main.main(log_type=args.log_type, crawler=args.crawler)
-
-    Main.main("weixin", "weixinzhishu")
+    Main.main("weixin", "weixinzhishu", "prod")

+ 4 - 3
weixinzhishu/weixinzhishu_main/search_key.py

@@ -2,6 +2,7 @@
 # @Author: wangkun
 # @Time: 2023/2/10
 """
+部署机器: Windows 笔记本
 获取微信指数小程序请求参数:search_key
     1. 启动 WinAppDriver.exe
     2. 启动 Charles.exe:
@@ -10,7 +11,7 @@
     3. 启动 Python 脚本:
         3.1 cd D:\piaoquan_crawler
         3.2 python .\weixinzhishu\weixinzhishu_main\search_key.py
-每分钟获取最新search_key,写入飞书: https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k
+每 10 秒获取最新search_key,写入飞书: https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k
 """
 import json
 import os
@@ -181,7 +182,7 @@ class Searchkey:
 if __name__ == '__main__':
     while True:
         Searchkey.write_search_key_to_feishu('searchkey', 'weixinzhishu')
-        Common.logger('searchkey', 'weixinzhishu').info('休眠 1 分钟')
-        time.sleep(60)
+        Common.logger('searchkey', 'weixinzhishu').info('休眠 10 秒')
+        time.sleep(10)
 
     # Searchkey.start_wechat('searchkey', 'weixinzhishu')

+ 26 - 0
weixinzhishu/weixinzhishu_main/search_key_mac.py

@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/20
+import time
+
+import atomac
+
+
+class SearchKey:
+    @classmethod
+    def start_wechat(cls):
+        bundle_id = "com.tencent.xinWeChat"
+        atomac.launchAppByBundleId(bundle_id)
+        automator = atomac.getAppRefByBundleId(bundle_id)
+        time.sleep(3)
+
+        window = automator.windows()[0]
+
+        msg_box = window.findFirstR(AXRole="AXCell", AXIdentifier="MMChatsTableCellView_0")
+        print(msg_box.getAttributes())
+
+
+if __name__ == "__main__":
+    SearchKey.start_wechat()

+ 0 - 277
weixinzhishu/weixinzhishu_main/weixinzhishu.py

@@ -1,277 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/10
-import os
-import sys
-import time
-from datetime import date, timedelta
-import requests
-import json
-sys.path.append(os.getcwd())
-from common.feishu import Feishu
-from common.common import Common
-
-
-class Weixinzhishu:
-    pageNum = 1
-
-    # 获取微信 key / openid
-    @classmethod
-    def get_wechat_key(cls, log_type, crawler):
-        """
-        获取微信 key / openid
-        https://w42nne6hzg.feishu.cn/sheets/shtcnqhMRUGunIfGnGXMOBYiy4K?sheet=sVL74k
-        :param log_type: 日志名
-        :param crawler: 哪款爬虫,填写:weixinzhishu
-        :return: search_key, openid
-        """
-        try:
-            sheet = Feishu.get_values_batch(log_type, crawler, 'sVL74k')
-            for i in range(len(sheet)):
-                search_key = sheet[1][1]
-                openid = sheet[1][2]
-                return search_key, openid
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"wechat_key:{e}\n")
-
-    # 获取热词
-    @classmethod
-    def get_word(cls):
-        url = '/hot/word/getAllWords'
-        params = {
-            'pageNum': cls.pageNum,  # 第几页,默认1,int
-            'pageSize': 100  # 请求条目数,默认为100,int
-        }
-        response = requests.post(url=url, json=params)
-        cls.pageNum += 1
-        word_list = []
-        print(response.text)
-        return word_list
-
-    # 获取热词分数
-    @classmethod
-    def get_word_score(cls, log_type, crawler, word_id, word):
-        """
-        获取热词分数
-        :param log_type: 日志名
-        :param crawler: 哪款爬虫,填写:weixinzhishu
-        :param word_id: 热词 ID
-        :param word: 热词
-        :return: 热词 7 天指数,例如:
-        {'id': 1,
-        'word': '消息',
-        'wechatScores': [
-        {'score': 95521022, 'scoreDate': '2023-02-07'},
-        {'score': 97315283, 'scoreDate': '2023-02-08'},
-        {'score': 109845849, 'scoreDate': '2023-02-09'},
-        {'score': 107089560, 'scoreDate': '2023-02-10'},
-        {'score': 102658391, 'scoreDate': '2023-02-11'},
-        {'score': 93843701, 'scoreDate': '2023-02-12'},
-        {'score': 100211894, 'scoreDate': '2023-02-13'}]}
-        """
-        try:
-            wechat_key = cls.get_wechat_key(log_type, crawler)
-            search_key = wechat_key[0]
-            openid = wechat_key[-1]
-            end_ymd = (date.today() + timedelta(days=0)).strftime("%Y%m%d")
-            start_ymd = (date.today() + timedelta(days=-7)).strftime("%Y%m%d")
-            url = "https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex"
-            payload = json.dumps({
-                "openid": openid,
-                "search_key": search_key,
-                "cgi_name": "GetDefaultIndex",
-                "start_ymd": start_ymd,
-                "end_ymd": end_ymd,
-                "query": word
-            })
-            headers = {
-                'Host': 'search.weixin.qq.com',
-                'content-type': 'application/json',
-                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.32(0x1800202a) NetType/WIFI Language/zh_CN',
-                'Referer': 'https://servicewechat.com/wxc026e7662ec26a3a/42/page-frame.html'
-            }
-            response = requests.request("POST", url, headers=headers, data=payload)
-            wechat_score_list = []
-            word_wechat_score_dict = {
-                "id": word_id,
-                "word": word,
-                "wechatScores": wechat_score_list,
-            }
-            if response.json()['code'] == -10000:
-                # Common.logger(log_type, crawler).warning(f"response:{response.json()['msg']} 休眠 10 秒,重新获取\n")
-                # time.sleep(10)
-                # cls.get_word_score(log_type, crawler, word_id, word)
-                return None
-            elif response.json()['code'] == -10002:
-                # Common.logger(log_type, crawler).info(f'{word}:该词暂未收录')
-                # # 数据写入飞书
-                # now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
-                # values = [[now, word, "该词暂未收录"]]
-                # Feishu.insert_columns(log_type, crawler, "5011a2", "ROWS", 1, 2)
-                # time.sleep(0.5)
-                # Feishu.update_values(log_type, crawler, "5011a2", "F2:Z2", values)
-                # Common.logger(log_type, crawler).info(f'热词"{word}"微信指数数据写入飞书成功\n')
-                return word_wechat_score_dict
-            elif response.json()['code'] != 0:
-                Common.logger(log_type, crawler).info(f'response:{response.text}\n')
-                return word_wechat_score_dict
-            else:
-                time_index = response.json()['content']['resp_list'][0]['indexes'][0]['time_indexes']
-                for i in range(len(time_index)):
-                    score_time = time_index[i]['time']
-                    score_time_str = f"{str(score_time)[:4]}-{str(score_time)[4:6]}-{str(score_time)[6:]}"
-                    score = time_index[i]['score']
-                    wechat_score_dict = {"score": score, "scoreDate": score_time_str}
-                    wechat_score_list.append(wechat_score_dict)
-
-                    # # 数据写入飞书
-                    # now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
-                    # values = [[now, word, score_time_str, score]]
-                    # Feishu.insert_columns(log_type, crawler, "5011a2", "ROWS", 1, 2)
-                    # time.sleep(0.5)
-                    # Feishu.update_values(log_type, crawler, "5011a2", "F2:Z2", values)
-                    # Common.logger(log_type, crawler).info(f'热词"{word}"微信指数数据写入飞书成功\n')
-
-                return word_wechat_score_dict
-        except Exception as e:
-            Common.logger(log_type, crawler).error(f"weixinzhishu异常:{e}\n")
-
-    # 获取微信指数
-    @classmethod
-    def get_wechat_score(cls, log_type, crawler):
-        """
-        获取微信指数
-        :param log_type: 日志名
-        :param crawler: 哪款爬虫
-        :return: 热词指数列表
-        """
-        while True:
-            word_list = cls.get_word()
-            if len(word_list) == 0:
-                Common.logger(log_type, crawler).info(f"热词更新完毕\n")
-                cls.pageNum = 1
-                return []
-            else:
-                wechat_score_data = []
-                for i in range(len(word_list)):
-                    word_id = word_list[i]['Id']
-                    word = word_list[i]['word']
-                    word_score_dict = cls.get_word_score(log_type, crawler, word_id, word)
-                    wechat_score_data.append(word_score_dict)
-                    return wechat_score_data
-
-    # 更新微信指数
-    @classmethod
-    def update_wechat_score(cls, log_type, crawler):
-        """
-        更新热词微信指数
-        :param log_type: 日志名
-        :param crawler: 哪款爬虫
-        :return: {"code":200, "message":"success"}
-        """
-        data = {
-            'data': cls.get_wechat_score(log_type, crawler)
-        }
-
-        url = '/hot/word/updateWechatScore'
-        params = {
-            'data': data
-        }
-        response = requests.post(url=url, json=params)
-        print(response.text)
-
-    @classmethod
-    def update_wechat_score_test(cls, log_type, crawler):
-        our_word_list = []
-        out_word_list = []
-        our_word_sheet = Feishu.get_values_batch(log_type, 'weixinzhishu_search_word', "nCudsM")
-        out_word_sheet = Feishu.get_values_batch(log_type, 'weixinzhishu_search_word', "D80uEf")
-        for x in our_word_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    our_word_list.append(y)
-        for x in out_word_sheet:
-            for y in x:
-                if y is None:
-                    pass
-                else:
-                    out_word_list.append(y)
-        word_list = our_word_list+out_word_list
-        word_score_list = []
-        # for i in range(len(word_list)):
-        for i in range(100):
-            while True:
-                Common.logger(log_type, crawler).info(f"word_id:{i + 1}, word:{word_list[i]}")
-                word_score = cls.get_word_score(log_type, crawler, int(i + 1), word_list[i])
-                if word_score is None:
-                    Common.logger(log_type, crawler).info("微信key过期,10秒钟后重试")
-                    time.sleep(10)
-                else:
-                    word_score_list.append(word_score)
-                    Common.logger(log_type, crawler).info(f'微信指数:{word_score}\n')
-                    break
-
-        word_dict = {
-            "data": word_score_list
-        }
-        return word_dict
-
-    @classmethod
-    def get_score_test(cls, log_type, crawler, word_id, word):
-        wechat_key = cls.get_wechat_key(log_type, crawler)
-        search_key = wechat_key[0]
-        openid = wechat_key[-1]
-        end_ymd = (date.today() + timedelta(days=0)).strftime("%Y%m%d")
-        start_ymd = (date.today() + timedelta(days=-7)).strftime("%Y%m%d")
-        url = "https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex"
-        payload = json.dumps({
-            "openid": openid,
-            "search_key": search_key,
-            "cgi_name": "GetDefaultIndex",
-            "start_ymd": start_ymd,
-            "end_ymd": end_ymd,
-            "query": word
-        })
-        headers = {
-            'Host': 'search.weixin.qq.com',
-            'content-type': 'application/json',
-            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.32(0x1800202a) NetType/WIFI Language/zh_CN',
-            'Referer': 'https://servicewechat.com/wxc026e7662ec26a3a/42/page-frame.html'
-        }
-        response = requests.request("POST", url, headers=headers, data=payload)
-        wechat_score_list = []
-        word_wechat_score_dict = {
-            "id": word_id,
-            "word": word,
-            "wechatScores": wechat_score_list,
-        }
-        if response.json()['code'] == -10000:
-            print(f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(time.time())))} response:{response.json()['msg']} 休眠 10 秒,重新获取\n")
-            time.sleep(10)
-            cls.get_score_test(log_type, crawler, word_id, word)
-        elif response.json()['code'] == -10002:
-            print("该词暂未收录")
-            print(f"{word_wechat_score_dict}")
-        elif response.json()['code'] != 0:
-            print(f"{word_wechat_score_dict}")
-        else:
-            time_index = response.json()['content']['resp_list'][0]['indexes'][0]['time_indexes']
-            for i in range(len(time_index)):
-                score_time = time_index[i]['time']
-                score_time_str = f"{str(score_time)[:4]}-{str(score_time)[4:6]}-{str(score_time)[6:]}"
-                score = time_index[i]['score']
-                wechat_score_dict = {"score": score, "scoreDate": score_time_str}
-                wechat_score_list.append(wechat_score_dict)
-                print(f"wechat_score_dict:{wechat_score_dict}")
-            print(word_wechat_score_dict)
-
-
-if __name__ == "__main__":
-    Weixinzhishu.get_score_test('weixin', 'weixinzhishu', 1 , "春晚")
-    #
-    # word_dict_demo = Weixinzhishu.update_wechat_score_test('weixin', 'weixinzhishu')
-    # print(word_dict_demo)
-
-    pass

+ 3 - 0
xigua/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17

+ 3 - 0
xigua/xigua_follow/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17

+ 3 - 0
xigua/xigua_follow/xigua_demo.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17

+ 431 - 0
xigua/xigua_follow/xigua_follow.py

@@ -0,0 +1,431 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17
+import base64
+import json
+import os
+import sys
+import time
+
+import requests
+import urllib3
+from selenium.webdriver import DesiredCapabilities
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from seleniumwire import webdriver
+
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.feishu import Feishu
+from common.publish import Publish
+proxies = {"http": None, "https": None}
+
+
+class Follow:
+    # 个人主页视频翻页参数
+    offset = 0
+
+    # 下载规则
+    @staticmethod
+    def download_rule(duration, width, height):
+        if int(duration) >= 60:
+            if int(width) >= 720 or int(height) >= 720:
+                return True
+            else:
+                return False
+        else:
+            return False
+
+    # 过滤词库
+    @classmethod
+    def filter_words(cls, log_type, crawler):
+        try:
+            filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
+            filter_words_list = []
+            for x in filter_words_sheet:
+                for y in x:
+                    if y is None:
+                        pass
+                    else:
+                        filter_words_list.append(y)
+            return filter_words_list
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
+
+    # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
+    @classmethod
+    def get_user_info_from_feishu(cls, log_type, crawler):
+        try:
+            user_sheet = Feishu.get_values_batch(log_type, crawler, '5tlTYB')
+            user_dict = {}
+            for i in range(1, len(user_sheet)):
+                user_name = user_sheet[i][0]
+                out_id = user_sheet[i][1]
+                our_id = user_sheet[i][3]
+                if user_name is None or out_id is None or our_id is None:
+                    pass
+                else:
+                    user_dict[user_name] = str(out_id) + ',' + str(our_id)
+            return user_dict
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
+
+    @classmethod
+    def get_signature(cls, log_type, crawler, out_uid, machine):
+        try:
+            # 打印请求配置
+            ca = DesiredCapabilities.CHROME
+            ca["goog:loggingPrefs"] = {"performance": "ALL"}
+
+            # 不打开浏览器运行
+            chrome_options = webdriver.ChromeOptions()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
+            chrome_options.add_argument("--no-sandbox")
+
+            # driver初始化
+            if machine == 'aliyun' or machine == 'aliyun_hk':
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
+            elif machine == 'macpro':
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
+                                          service=Service('/Users/lieyunye/Downloads/chromedriver_v86/chromedriver'))
+            elif machine == 'macair':
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
+                                          service=Service('/Users/piaoquan/Downloads/chromedriver'))
+            else:
+                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service('/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'))
+            driver.implicitly_wait(10)
+            driver.get(f'https://www.ixigua.com/home/{out_uid}/')
+            time.sleep(3)
+            data_src = driver.find_elements(By.XPATH, '//img[@class="tt-img BU-MagicImage tt-img-loaded"]')[1].get_attribute("data-src")
+            signature = data_src.split("x-signature=")[-1]
+            # print(f"data_src:{data_src}")
+            # print(f"signature:{signature}")
+            return signature
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
+
+    # 获取视频详情
+    @classmethod
+    def get_video_url(cls, log_type, crawler, gid):
+        # try:
+        url = 'https://www.ixigua.com/api/mixVideo/information?'
+        headers = {
+            "accept-encoding": "gzip, deflate",
+            "accept-language": "zh-CN,zh-Hans;q=0.9",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
+            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
+        }
+        params = {
+            'mixId': gid,
+            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
+                       'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
+            'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
+            '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
+                          'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
+        }
+        cookies = {
+            'ixigua-a-s': '1',
+            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
+                       'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
+            'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
+                     '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
+            'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
+            'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
+            '__ac_nonce': '06304878000964fdad287',
+            '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
+                              'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
+            'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
+            '_tea_utm_cache_1300': 'undefined',
+            'support_avif': 'false',
+            'support_webp': 'false',
+            'xiguavideopcwebid': '7134967546256016900',
+            'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
+        }
+        urllib3.disable_warnings()
+        response = requests.get(url=url, headers=headers, params=params, cookies=cookies, verify=False)
+        if 'data' not in response.json() or response.json()['data'] == '':
+            Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
+        else:
+            video_info = response.json()['data']['gidInformation']['packerData']['video']
+            video_url_dict = {}
+            # video_url
+            if 'videoResource' not in video_info:
+                video_url_dict["video_url"] = ''
+                video_url_dict["audio_url"] = ''
+                video_url_dict["video_width"] = 0
+                video_url_dict["video_height"] = 0
+
+            elif 'dash_120fps' in video_info['videoResource']:
+                if "video_list" in video_info['videoResource']['dash_120fps'] and len(video_info['videoResource']['dash_120fps']['video_list']) != 0:
+                    video_url = video_info['videoResource']['dash_120fps']['video_list'][-1]['backup_url_1']
+                    audio_url = video_info['videoResource']['dash_120fps']['video_list'][-1]['backup_url_1']
+                    if len(video_url) % 3 == 1:
+                        video_url += '=='
+                    elif len(video_url) % 3 == 2:
+                        video_url += '='
+                    # pad the audio URL in its own chain; hanging it off the video
+                    # elif meant audio was only padded when video needed no padding
+                    if len(audio_url) % 3 == 1:
+                        audio_url += '=='
+                    elif len(audio_url) % 3 == 2:
+                        audio_url += '='
+                    video_url = base64.b64decode(video_url).decode('utf8')
+                    audio_url = base64.b64decode(audio_url).decode('utf8')
+                    video_width = video_info['videoResource']['dash_120fps']['video_list'][-1]['vwidth']
+                    video_height = video_info['videoResource']['dash_120fps']['video_list'][-1]['vheight']
+                    video_url_dict["video_url"] = video_url
+                    video_url_dict["audio_url"] = audio_url
+                    video_url_dict["video_width"] = video_width
+                    video_url_dict["video_height"] = video_height
+                elif 'dynamic_video' in video_info['videoResource']['dash_120fps'] \
+                        and 'dynamic_video_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
+                        and 'dynamic_audio_list' in video_info['videoResource']['dash_120fps']['dynamic_video'] \
+                        and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list']) != 0 \
+                        and len(video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list']) != 0:
+
+                    video_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
+                    audio_url = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
+                    if len(video_url) % 3 == 1:
+                        video_url += '=='
+                    elif len(video_url) % 3 == 2:
+                        video_url += '='
+                    if len(audio_url) % 3 == 1:
+                        audio_url += '=='
+                    elif len(audio_url) % 3 == 2:
+                        audio_url += '='
+                    video_url = base64.b64decode(video_url).decode('utf8')
+                    audio_url = base64.b64decode(audio_url).decode('utf8')
+                    video_width = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
+                    video_height = video_info['videoResource']['dash_120fps']['dynamic_video']['dynamic_video_list'][-1]['vheight']
+                    video_url_dict["video_url"] = video_url
+                    video_url_dict["audio_url"] = audio_url
+                    video_url_dict["video_width"] = video_width
+                    video_url_dict["video_height"] = video_height
+
+
+            elif 'dash' in video_info['videoResource'] \
+                    and 'dynamic_video' in video_info['videoResource']['dash'] \
+                    and 'dynamic_video_list' in video_info['videoResource']['dash']['dynamic_video']:
+                video_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['backup_url_1']
+                audio_url = video_info['videoResource']['dash']['dynamic_video']['dynamic_audio_list'][-1]['backup_url_1']
+                if len(video_url) % 3 == 1:
+                    video_url += '=='
+                elif len(video_url) % 3 == 2:
+                    video_url += '='
+                if len(audio_url) % 3 == 1:
+                    audio_url += '=='
+                elif len(audio_url) % 3 == 2:
+                    audio_url += '='
+                video_url = base64.b64decode(video_url).decode('utf8')
+                audio_url = base64.b64decode(audio_url).decode('utf8')
+                video_width = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vwidth']
+                video_height = video_info['videoResource']['dash']['dynamic_video']['dynamic_video_list'][-1]['vheight']
+                video_url_dict["video_url"] = video_url
+                video_url_dict["audio_url"] = audio_url
+                video_url_dict["video_width"] = video_width
+                video_url_dict["video_height"] = video_height
+
+            elif 'normal' in video_info['videoResource']:
+                video_url = video_info['videoResource']['normal']['video_list'][-1]['backup_url_1']
+                audio_url = video_info['videoResource']['normal']['video_list'][-1]['backup_url_1']
+                if len(video_url) % 3 == 1:
+                    video_url += '=='
+                elif len(video_url) % 3 == 2:
+                    video_url += '='
+                if len(audio_url) % 3 == 1:
+                    audio_url += '=='
+                elif len(audio_url) % 3 == 2:
+                    audio_url += '='
+                video_url = base64.b64decode(video_url).decode('utf8')
+                audio_url = base64.b64decode(audio_url).decode('utf8')
+                video_width = video_info['videoResource']['normal']['video_list'][-1]['vwidth']
+                video_height = video_info['videoResource']['normal']['video_list'][-1]['vheight']
+                video_url_dict["video_url"] = video_url
+                video_url_dict["audio_url"] = audio_url
+                video_url_dict["video_width"] = video_width
+                video_url_dict["video_height"] = video_height
+            else:
+                # unrecognised resource layout: return empty values instead of
+                # leaving video_url_dict unpopulated
+                video_url_dict["video_url"] = ''
+                video_url_dict["audio_url"] = ''
+                video_url_dict["video_width"] = 0
+                video_url_dict["video_height"] = 0
+
+            return video_url_dict
+
+
+        # except Exception as e:
+        #     Common.logger(log_type).error(f'get_video_info异常:{e}\n')
+
+    @classmethod
+    def get_videolist(cls, log_type, crawler, out_uid, machine):
+        signature = cls.get_signature(log_type, crawler, out_uid, machine)
+        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
+        params = {
+            'to_user_id': str(out_uid),
+            'offset': str(cls.offset),
+            'limit': '30',
+            'maxBehotTime': '0',
+            'order': 'new',
+            'isHome': '0',
+            'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
+            'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
+            '_signature': signature,
+        }
+        headers = {
+            'authority': 'www.ixigua.com',
+            'accept': 'application/json, text/plain, */*',
+            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+            'cache-control': 'no-cache',
+            'cookie': f'MONITOR_WEB_ID=7168304743566296612; __ac_signature={signature}; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; msToken=G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==; tt_scid=o4agqz7u9SKPwfBoPt6S82Cw0q.9KDtqmNe0JHxMqmpxNHQWq1BmrQdgVU6jEoX7ed99; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1676618894%7Cee5ad95378275f282f230a7ffa9947ae7eff40d0829c5a2568672a6dc90a1c96; ixigua-a-s=1',
+            'pragma': 'no-cache',
+            'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
+            'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"macOS"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
+            'x-secsdk-csrf-token': '00010000000119e3f9454d1dcbb288704cda1960f241e2d19bd21f2fd283520c3615a990ac5a17448bfbb902a249'
+        }
+        urllib3.disable_warnings()
+        response = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
+        cls.offset += 30
+        if response.status_code != 200:
+            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+        elif 'data' not in response.text:
+            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+        elif 'videoList' not in response.json()["data"]:
+            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+        else:
+            videoList = response.json()['data']['videoList']
+            for i in range(len(videoList)):
+                # video_title
+                if 'title' not in videoList[i]:
+                    video_title = 0
+                else:
+                    video_title = videoList[i]['title'].strip().replace('手游', '') \
+                        .replace('/', '').replace('\n', '')
+
+                # video_id
+                if 'video_id' not in videoList[i]:
+                    video_id = 0
+                else:
+                    video_id = videoList[i]['video_id']
+
+                # gid
+                if 'gid' not in videoList[i]:
+                    gid = 0
+                else:
+                    gid = videoList[i]['gid']
+
+                # play_cnt
+                if 'video_detail_info' not in videoList[i]:
+                    play_cnt = 0
+                elif 'video_watch_count' not in videoList[i]['video_detail_info']:
+                    play_cnt = 0
+                else:
+                    play_cnt = videoList[i]['video_detail_info']['video_watch_count']
+
+                # comment_cnt
+                if 'comment_count' not in videoList[i]:
+                    comment_cnt = 0
+                else:
+                    comment_cnt = videoList[i]['comment_count']
+
+                # like_cnt
+                if 'digg_count' not in videoList[i]:
+                    like_cnt = 0
+                else:
+                    like_cnt = videoList[i]['digg_count']
+
+                # share_cnt
+                share_cnt = 0
+
+                # video_duration
+                if 'video_duration' not in videoList[i]:
+                    video_duration = 0
+                else:
+                    video_duration = videoList[i]['video_duration']
+
+                # send_time
+                if 'publish_time' not in videoList[i]:
+                    publish_time = 0
+                else:
+                    publish_time = videoList[i]['publish_time']
+
+                # is_top
+                if 'is_top' not in videoList[i]:
+                    is_top = 0
+                else:
+                    is_top = videoList[i]['is_top']
+
+                # user_name
+                if 'user_info' not in videoList[i]:
+                    user_name = 0
+                elif 'name' not in videoList[i]['user_info']:
+                    user_name = 0
+                else:
+                    user_name = videoList[i]['user_info']['name']
+
+                # user_id
+                if 'user_info' not in videoList[i]:
+                    user_id = 0
+                elif 'user_id' not in videoList[i]['user_info']:
+                    user_id = 0
+                else:
+                    user_id = videoList[i]['user_info']['user_id']
+
+                # avatar_url
+                if 'user_info' not in videoList[i]:
+                    avatar_url = 0
+                elif 'avatar_url' not in videoList[i]['user_info']:
+                    avatar_url = 0
+                else:
+                    avatar_url = videoList[i]['user_info']['avatar_url']
+
+                # cover_url
+                if 'video_detail_info' not in videoList[i]:
+                    cover_url = 0
+                elif 'detail_video_large_image' not in videoList[i]['video_detail_info']:
+                    cover_url = 0
+                elif 'url' in videoList[i]['video_detail_info']['detail_video_large_image']:
+                    cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url']
+                else:
+                    cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
+
+                Common.logger(log_type, crawler).info(
+                    f'send_time:{time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time))}')
+
+                video_url_dict = cls.get_video_url(log_type, crawler, gid)
+                video_url = video_url_dict["video_url"]
+                audio_url = video_url_dict["audio_url"]
+                video_width = video_url_dict["video_width"]
+                video_height = video_url_dict["video_height"]
+
+                video_dict = {'video_title': video_title,
+                              'video_id': video_id,
+                              'gid': gid,
+                              'play_cnt': play_cnt,
+                              'comment_cnt': comment_cnt,
+                              'like_cnt': like_cnt,
+                              'share_cnt': share_cnt,
+                              'video_width': video_width,
+                              'video_height': video_height,
+                              'video_duration': video_duration,
+                              'publish_time': publish_time,
+                              'is_top': is_top,
+                              'user_name': user_name,
+                              'user_id': user_id,
+                              'avatar_url': avatar_url,
+                              'cover_url': cover_url,
+                              'audio_url': audio_url,
+                              'video_url': video_url}
+                for k, v in video_dict.items():
+                    print(f"{k}:{v}")
+                print("\n")
+
+
+
+
+if __name__ == '__main__':
+    # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
+    Follow.get_videolist("follow", "xigua", "95420624045", "local")
+
+
+    pass

+ 3 - 0
xigua/xigua_main/__init__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17

+ 3 - 0
xigua/xigua_main/run_xigua_follow.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/17

+ 22 - 15
youtube/youtube_follow/youtube_follow.py

@@ -337,6 +337,8 @@ class Follow:
                                 out_fans = header['subscriberCountText']['accessibility']['accessibilityData']['label']
                                 if '万' in out_fans:
                                     out_fans = int(float(out_fans.split('万')[0])*10000)
+                                elif "位" in out_fans:
+                                    out_fans = int(out_fans.split('位')[0].replace(",", ""))
                                 else:
                                     pass
 
@@ -421,14 +423,14 @@ class Follow:
                         }
                         our_uid = Users.create_user(log_type, crawler, create_user_dict, env)
                         Common.logger(log_type, crawler).info(f'新创建的站内UID:{our_uid}')
-                        if env == 'prod':
-                            our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
-                        else:
+                        if env == 'dev':
                             our_user_link = f'https://testadmin.piaoquantv.com/ums/user/{our_uid}/post'
+                        else:
+                            our_user_link = f'https://admin.piaoquantv.com/ums/user/{our_uid}/post'
                         Common.logger(log_type, crawler).info(f'站内用户主页链接:{our_user_link}')
                         Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
                         Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!')
-                        Common.logger(log_type, crawler).info(f'sql:{sql}')
+
                         sql = f""" insert into crawler_user(user_id, 
                                             out_user_id, 
                                             out_user_name, 
@@ -447,6 +449,7 @@ class Follow:
                                             {out_fans}, 
                                             "{cls.platform}",
                                             "{tag}") """
+                        Common.logger(log_type, crawler).info(f'sql:{sql}')
                         MysqlHelper.update_values(log_type, crawler, sql, env, machine)
                         Common.logger(log_type, crawler).info('用户信息插入数据库成功!\n')
                     # 数据库中(youtube + out_user_id)返回数量 != 0,则直接把数据库中的站内 UID 写入飞书
@@ -699,10 +702,10 @@ class Follow:
                         video_dict = cls.get_video_info(log_type, crawler, out_uid, video_id, machine)
                        # 发布时间<=180天
                         publish_time = int(time.mktime(time.strptime(video_dict['publish_time'], "%Y-%m-%d")))
-                        if int(time.time()) - publish_time <= 3600*24*30:
+                        if int(time.time()) - publish_time <= 3600*24*180:
                             cls.download_publish(log_type, crawler, video_dict, strategy, our_uid, env, oss_endpoint, machine)
                         else:
-                            Common.logger(log_type, crawler).info('发布时间超过30天\n')
+                            Common.logger(log_type, crawler).info('发布时间超过180天\n')
                             return
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_videos异常:{e}\n")
@@ -912,7 +915,9 @@ class Follow:
                 if 'title' not in  videoDetails:
                     video_title = ''
                 else:
-                    video_title = videoDetails['title']
+                    video_title = videoDetails['title'].replace("&", "").strip().replace("\n", "") \
+                            .replace("/", "").replace("\r", "").replace("#", "") \
+                            .replace(".", "。").replace("\\", "").replace("&NBSP", "")
                 video_title = cls.filter_emoji(video_title)
                 # if Translate.is_contains_chinese(video_title) is False:
                 video_title = Translate.google_translate(video_title, machine)  # 自动翻译标题为中文
@@ -1008,7 +1013,7 @@ class Follow:
             # repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
             if video_dict['video_title'] == '' or  video_dict['video_url'] == '':
                 Common.logger(log_type, crawler).info('无效视频\n')
-            elif video_dict['duration'] > 600 or video_dict['duration'] < 60:
+            elif video_dict['duration'] > 1200 or video_dict['duration'] < 60:
                 Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足规则\n")
             # elif repeat_video is not None and len(repeat_video) != 0:
             elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
@@ -1020,16 +1025,18 @@ class Follow:
                 Common.logger(log_type, crawler).info('开始下载视频...')
                 # Common.download_method(log_type, crawler, 'video', video_dict['video_title'], video_dict['video_url'])
                 Common.download_method(log_type, crawler, 'youtube_video', video_dict['video_title'], video_dict['video_url'])
-                ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
-                video_width = int(ffmpeg_dict['width'])
-                video_height = int(ffmpeg_dict['height'])
-                duration = int(ffmpeg_dict['duration'])
-                video_size = int(ffmpeg_dict['size'])
+                # ffmpeg_dict = Common.ffmpeg(log_type, crawler, f"./{crawler}/videos/{video_dict['video_title']}/video.mp4")
+                # video_width = int(ffmpeg_dict['width'])
+                video_width = 1280
+                # video_height = int(ffmpeg_dict['height'])
+                video_height = 720
+                duration = int(video_dict['duration'])
+                # video_size = int(ffmpeg_dict['size'])
 
                 Common.logger(log_type, crawler).info(f'video_width:{video_width}')
                 Common.logger(log_type, crawler).info(f'video_height:{video_height}')
                 Common.logger(log_type, crawler).info(f'duration:{duration}')
-                Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
+                # Common.logger(log_type, crawler).info(f'video_size:{video_size}\n')
 
                 video_dict['video_width'] = video_width
                 video_dict['video_height'] = video_height
@@ -1045,7 +1052,7 @@ class Follow:
                 #     shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
                 #     Common.logger(log_type, crawler).info(f"时长:{video_dict['duration']}不满足抓取规则,删除成功\n")
                 #     return
-                if video_size == 0 or duration == 0 or video_size is None or duration is None:
+                if duration == 0 or duration is None:
                     # 删除视频文件夹
                     shutil.rmtree(f"./{crawler}/videos/{video_dict['video_title']}/")
                     Common.logger(log_type, crawler).info(f"视频下载出错,删除成功\n")