wangkun · 2 years ago · commit 88b75775bd

BIN
.DS_Store


+ 13 - 2
README.MD

@@ -18,9 +18,20 @@ ${nohup_dir}:       nohup log path, e.g.: ./youtube/nohup.log
 
 ### Example run commands for crawlers already in production
 ```
+xigua (Xigua Video) run command: 
+On Aliyun server 102:
+sh ./main/main.sh ./xigua/xigua_main/run_xigua_follow.py --log_type="follow" --crawler="xigua" --strategy="定向爬虫策略" --oss_endpoint="inner" --env="prod" --machine="aliyun" xigua/nohup.log
+On the local machine:
+sh ./main/main.sh ./xigua/xigua_main/run_xigua_follow.py --log_type="follow" --crawler="xigua" --strategy="定向爬虫策略" --oss_endpoint="out" --env="dev" --machine="local" xigua/nohup.log
+Kill-process command:
+ps aux | grep run_xigua | grep -v grep | awk '{print $2}' | xargs kill -9
+
 youtube follow-list run command: 
 sh ./main/main.sh ./youtube/youtube_main/run_youtube_follow.py --log_type="follow" --crawler="youtube" --strategy="定向爬虫策略" --oss_endpoint="hk" --env="prod" --machine="aliyun_hk" youtube/nohup.log
-youtube定向榜杀进程命令: 
+youtube kill-process command: 
 ps aux | grep run_youtube | grep -v grep | awk '{print $2}' | xargs kill -9
-ps aux | grep run_youtube | grep Python | grep -v grep | awk '{print $2}' | xargs kill -9
+
+weixinzhishu (WeChat Index) kill-process command:
+ps aux | grep run_weixinzhishu | grep -v grep | awk '{print $2}' | xargs kill -9
+
 ```
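
All of the run/kill commands above follow the same wrapper pattern. As a quick reference, a generalized sketch — the angle-bracket placeholders are illustrative, not actual paths, and the flag values are the ones that appear in the examples above:
```
# launch a crawler via the main.sh wrapper
sh ./main/main.sh <path/to/run_script.py> --log_type="follow" --crawler="<crawler>" --strategy="定向爬虫策略" --oss_endpoint="inner|out|hk" --env="prod|dev" --machine="aliyun|local|aliyun_hk" <crawler>/nohup.log

# stop it again by killing the matching process
ps aux | grep <run_script_name> | grep -v grep | awk '{print $2}' | xargs kill -9
```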

+ 1 - 0
common/common.py

@@ -12,6 +12,7 @@ import time
 import requests
 import ffmpeg
 import urllib3
+import subprocess
 proxies = {"http": None, "https": None}
 
 

+ 0 - 16
common/demo.py

@@ -1,16 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/10
-import time
-
-
-class Demo:
-    @classmethod
-    def test_time(cls):
-        time_str = '2023-02-07'
-        time_stamp = int(time.mktime(time.strptime(time_str, "%Y-%m-%d")))
-        print(time_stamp)
-
-
-if __name__ == "__main__":
-    Demo.test_time()

+ 312 - 145
common/feishu.py

@@ -69,12 +69,6 @@ class Feishu:
     # 微信指数_搜索词
     weixinzhishu_search_word = 'https://w42nne6hzg.feishu.cn/sheets/shtcnHxCj6dZBYMuK1Q3tIJVlqg?'
 
-    # 手机号
-    wangkun = "13426262515"
-    gaonannan = "18501180073"
-    xinxin = "15546206651"
-    huxinxue = "18832292015"
-
     # 飞书路径token
     @classmethod
     def spreadsheettoken(cls, crawler):
@@ -158,18 +152,18 @@ class Feishu:
         获取表格元数据
         :return:
         """
-        get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                           + cls.spreadsheettoken(crawler) + "/metainfo"
-
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        params = {
-            "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
-            "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
-        }
         try:
+            get_metainfo_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                               + cls.spreadsheettoken(crawler) + "/metainfo"
+
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                "extFields": "protectedRange",  # 额外返回的字段,extFields=protectedRange时返回保护行列信息
+                "user_id_type": "open_id"  # 返回的用户id类型,可选open_id,union_id
+            }
             urllib3.disable_warnings()
             r = requests.get(url=get_metainfo_url, headers=headers, params=params, proxies=proxies, verify=False)
             response = json.loads(r.content.decode("utf8"))
@@ -187,29 +181,29 @@ class Feishu:
         :param sheetid: 哪张表
         :return: 所有数据
         """
-        get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                               + cls.spreadsheettoken(crawler) + "/values_batch_get"
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        params = {
-            # 多个查询范围 如 url?ranges=range1,range2 ,其中 range 包含 sheetId 与单元格范围两部分
-            "ranges": sheetid,
-
-            # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
-            # valueRenderOption=FormattedValue 计算并格式化单元格;
-            # valueRenderOption=Formula单元格中含有公式时返回公式本身;
-            # valueRenderOption=UnformattedValue计算但不对单元格进行格式化
-            "valueRenderOption": "ToString",
-
-            # dateTimeRenderOption=FormattedString 计算并将时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
-            "dateTimeRenderOption": "",
-
-            # 返回的用户id类型,可选open_id,union_id
-            "user_id_type": "open_id"
-        }
         try:
+            get_values_batch_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                   + cls.spreadsheettoken(crawler) + "/values_batch_get"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # 多个查询范围 如 url?ranges=range1,range2 ,其中 range 包含 sheetId 与单元格范围两部分
+                "ranges": sheetid,
+
+                # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+                # valueRenderOption=FormattedValue 计算并格式化单元格;
+                # valueRenderOption=Formula单元格中含有公式时返回公式本身;
+                # valueRenderOption=UnformattedValue计算但不对单元格进行格式化
+                "valueRenderOption": "ToString",
+
+                # dateTimeRenderOption=FormattedString 计算并将时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+                "dateTimeRenderOption": "",
+
+                # 返回的用户id类型,可选open_id,union_id
+                "user_id_type": "open_id"
+            }
             urllib3.disable_warnings()
             r = requests.get(url=get_values_batch_url, headers=headers, params=params, proxies=proxies, verify=False)
             # print(r.text)
@@ -231,22 +225,23 @@ class Feishu:
         :param startindex:开始位置
         :param endindex:结束位置
         """
-        insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                             + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        body = {
-            "dimension": {
-                "sheetId": sheetid,
-                "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
-                "startIndex": startindex,  # 开始的位置
-                "endIndex": endindex  # 结束的位置
-            },
-            "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
-        }
         try:
+            insert_columns_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                 + cls.spreadsheettoken(crawler) + "/insert_dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": majordimension,  # 默认 ROWS ,可选 ROWS、COLUMNS
+                    "startIndex": startindex,  # 开始的位置
+                    "endIndex": endindex  # 结束的位置
+                },
+                "inheritStyle": "AFTER"  # BEFORE 或 AFTER,不填为不继承 style
+            }
+
             urllib3.disable_warnings()
             r = requests.post(url=insert_columns_url, headers=headers, json=body, proxies=proxies, verify=False)
             Common.logger(log_type, crawler).info("插入行或列:{}", r.json()["msg"])
@@ -264,22 +259,21 @@ class Feishu:
         :param ranges:单元格范围
         :param values:写入的具体数据,list
         """
-        update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                            + cls.spreadsheettoken(crawler) + "/values_batch_update"
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        body = {
-            "valueRanges": [
-                {
-                    "range": sheetid + "!" + ranges,
-                    "values": values
-                },
-            ],
-        }
-
         try:
+            update_values_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                + cls.spreadsheettoken(crawler) + "/values_batch_update"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "valueRanges": [
+                    {
+                        "range": sheetid + "!" + ranges,
+                        "values": values
+                    },
+                ],
+            }
             urllib3.disable_warnings()
             r = requests.post(url=update_values_url, headers=headers, json=body, proxies=proxies, verify=False)
             Common.logger(log_type, crawler).info("写入数据:{}", r.json()["msg"])
@@ -296,19 +290,18 @@ class Feishu:
         :param sheetid:哪张工作表
         :param ranges:需要合并的单元格范围
         """
-        merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                          + cls.spreadsheettoken(crawler) + "/merge_cells"
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-
-        body = {
-            "range": sheetid + "!" + ranges,
-            "mergeType": "MERGE_ROWS"
-        }
-
         try:
+            merge_cells_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                              + cls.spreadsheettoken(crawler) + "/merge_cells"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+
+            body = {
+                "range": sheetid + "!" + ranges,
+                "mergeType": "MERGE_ROWS"
+            }
             urllib3.disable_warnings()
             r = requests.post(url=merge_cells_url, headers=headers, json=body, proxies=proxies, verify=False)
             Common.logger(log_type, crawler).info("合并单元格:{}", r.json()["msg"])
@@ -326,26 +319,26 @@ class Feishu:
         :param cell: 哪个单元格
         :return: 单元格内容
         """
-        get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                              + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        params = {
-            # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
-            # valueRenderOption=FormattedValue 计算并格式化单元格;
-            # valueRenderOption=Formula 单元格中含有公式时返回公式本身;
-            # valueRenderOption=UnformattedValue 计算但不对单元格进行格式化。
-            "valueRenderOption": "FormattedValue",
-
-            # dateTimeRenderOption=FormattedString 计算并对时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
-            "dateTimeRenderOption": "",
-
-            # 返回的用户id类型,可选open_id,union_id
-            "user_id_type": "open_id"
-        }
         try:
+            get_range_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/values/" + sheetid + "!" + cell
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            params = {
+                # valueRenderOption=ToString 可返回纯文本的值(数值类型除外);
+                # valueRenderOption=FormattedValue 计算并格式化单元格;
+                # valueRenderOption=Formula 单元格中含有公式时返回公式本身;
+                # valueRenderOption=UnformattedValue 计算但不对单元格进行格式化。
+                "valueRenderOption": "FormattedValue",
+
+                # dateTimeRenderOption=FormattedString 计算并对时间日期按照其格式进行格式化,但不会对数字进行格式化,返回格式化后的字符串。
+                "dateTimeRenderOption": "",
+
+                # 返回的用户id类型,可选open_id,union_id
+                "user_id_type": "open_id"
+            }
             urllib3.disable_warnings()
             r = requests.get(url=get_range_value_url, headers=headers, params=params, proxies=proxies, verify=False)
             # print(r.text)
@@ -382,21 +375,21 @@ class Feishu:
         :param endindex:结束的位置
         :return:
         """
-        dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
-                              + cls.spreadsheettoken(crawler) + "/dimension_range"
-        headers = {
-            "Authorization": "Bearer " + cls.get_token(log_type, crawler),
-            "Content-Type": "application/json; charset=utf-8"
-        }
-        body = {
-            "dimension": {
-                "sheetId": sheetid,
-                "majorDimension": major_dimension,
-                "startIndex": startindex,
-                "endIndex": endindex
-            }
-        }
         try:
+            dimension_range_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/" \
+                                  + cls.spreadsheettoken(crawler) + "/dimension_range"
+            headers = {
+                "Authorization": "Bearer " + cls.get_token(log_type, crawler),
+                "Content-Type": "application/json; charset=utf-8"
+            }
+            body = {
+                "dimension": {
+                    "sheetId": sheetid,
+                    "majorDimension": major_dimension,
+                    "startIndex": startindex,
+                    "endIndex": endindex
+                }
+            }
             urllib3.disable_warnings()
             r = requests.delete(url=dimension_range_url, headers=headers, json=body, proxies=proxies, verify=False)
             Common.logger(log_type, crawler).info("删除视频数据:{}", r.json()["msg"])
@@ -412,32 +405,219 @@ class Feishu:
                 "Authorization": "Bearer " + cls.get_token(log_type, crawler),
                 "Content-Type": "application/json; charset=utf-8"
             }
+            # 手机号
+            wangkun = "13426262515"
+            gaonannan = "18501180073"
+            xinxin = "15546206651"
+            huxinxue = "18832292015"
+            wuchaoyue = "15712941385"
+            lijinchao = '18524120540'
+
             if username == "wangkun":
-                username = cls.wangkun
+                username = wangkun
             elif username == "gaonannan":
-                username = cls.gaonannan
+                username = gaonannan
             elif username == "xinxin":
-                username = cls.xinxin
+                username = xinxin
             elif username == "huxinxue":
-                username = cls.huxinxue
+                username = huxinxue
+            elif username == "wuchaoyue":
+                username = wuchaoyue
+            elif username == "lijinchao":
+                username = lijinchao
+
             data = {"mobiles": [username]}
             urllib3.disable_warnings()
             r = requests.get(url=url, headers=headers, params=data, verify=False, proxies=proxies)
             open_id = r.json()["data"]["mobile_users"][username][0]["open_id"]
-            Common.logger(log_type, crawler).info("{}:{}", username, open_id)
+            Common.logger(log_type, crawler).info(f"{username}:{open_id}")
             # print(f"{username}:{open_id}")
             return open_id
         except Exception as e:
-            Common.logger(log_type, crawler).error("get_userid异常:{}", e)
+            Common.logger(log_type, crawler).error(f"get_userid异常:{e}\n")
 
     # 飞书机器人
     @classmethod
-    def bot(cls, log_type, crawler, content):
+    def bot(cls, log_type, crawler, text):
         try:
             url = "https://open.feishu.cn/open-apis/bot/v2/hook/96989577-50e7-4653-9ec2-308fe3f2c5fe"
-            headers = {
-                'Content-Type': 'application/json'
-            }
+            headers = {'Content-Type': 'application/json'}
+            if crawler == "kanyikan":
+                content = "看一看爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcngRPoDYAi24x52j2nDuHMih"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "xiaoniangao_hour":
+                content = "小年糕_小时级_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=yatRv2"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == "xiaoniangao_person":
+                content = "小年糕_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=Wu0CeL"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == "xiaoniangao_play":
+                content = "小年糕_播放量_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh?sheet=c85k1C"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == 'xigua':
+                content = '西瓜视频_用户主页_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=e075e9'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'xigua_little_video':
+                content = '西瓜视频_小视频_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnvOpx2P8vBXiV91Ot1MKIw8?sheet=hDSDnv'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == 'zhihu_hot':
+                content = '知乎_热门_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=8871e3'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+            elif crawler == 'zhihu_follow':
+                content = '知乎_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnkGPBmGsjaqapgzouuj8MXe?sheet=4MGuux'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == 'haokan_hot':
+                content = '好看_热榜_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=5pWipX'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_channel':
+                content = '好看_频道_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=7f05d8'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == 'haokan_follow':
+                content = '好看_定向_已下载表'
+                sheet_url = 'https://w42nne6hzg.feishu.cn/sheets/shtcnaYz8Nhv8q6DbWtlL6rMEBd?sheet=kVaSjf'
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "music_album":
+                content = "音乐相册爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnT6zvmfsYe1g0iv4pt7855g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ssyy":
+                content = "胜胜影音爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnz1ymxHL1u8WHblfqfys7qe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "ggdc":
+                content = "刚刚都传爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnTuJgeZU2bc7VaesAqk3QJx"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "bszf":
+                content = "本山祝福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnGh2rrsPYM4iVNEBO7OqWrb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "jxxf":
+                content = "吉祥幸福爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnSx4nafMbLTq7xl7RHBwHBf"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "zmyx":
+                content = "众妙音信爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnbZIxstPeM0xshW07b26sve"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "zhufumao":
+                content = "祝福猫视频爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnXfIJthvkjhI5zlEJq84i6g"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "gaonannan")) + "></at>\n"
+
+            elif crawler == "kuaishou_follow":
+                content = "快手_用户主页_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=fYdA8F"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+            elif crawler == "kuaishou_recommend":
+                content = "快手_推荐榜_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf?sheet=3cd128"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "ssnnyfq":
+                content = "岁岁年年迎福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnyJmJSJynHDLLbLTkySfvZe?sheet=290bae"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "kdjsfq":
+                content = "看到就是福气_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnEokBkIjOUPAk8vbbPKnXgb?sheet=ad3b6d"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "gzh":
+                content = "公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnexNXnpDLHhARw0QdiwbYuA"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            elif crawler == "gongzhonghao_xinxin":
+                content = "公众号_信欣_爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcna98M2mX7TbivTj9Sb7WKBN?"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "weiqun":
+                content = "微群爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnoKThNquYRweaylMFVyo9Hc"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "weishi":
+                content = "微视爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn5YSWg91JfVGzj0SFZIRRPh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "xinxin")) + "></at>\n"
+
+            elif crawler == "shipinhao_recommend":
+                content = "视频号_推荐_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=c77cf9"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "shipinhao_follow":
+                content = "视频号_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn9rOdZRAGFbRkWpn7hqEHGc?sheet=KsVtLe"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+            elif crawler == "youtube":
+                content = "youtube_定向_已下载表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnrLyr1zbYbhhZyqpN7Xrd5f?sheet=GVxlYk"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "wuchaoyue")) + "></at>\n"
+
+            elif crawler == "zongjiao":
+                content = "宗教公众号爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcn73NW0CyoOeF21HWO15KBsb"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at> <at id=" + str(
+                    cls.get_userid(log_type, crawler, "huxinxue")) + "></at>\n"
+
+            else:
+                content = "小年糕爬虫表"
+                sheet_url = "https://w42nne6hzg.feishu.cn/sheets/shtcnYxiyQ1wLklo1W5Kdqc9cGh"
+                users = "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at>\n"
+
             data = json.dumps({
                 "msg_type": "interactive",
                 "card": {
@@ -448,38 +628,25 @@ class Feishu:
                     "elements": [{
                         "tag": "div",
                         "text": {
-                            "content": "\n<at id=" + str(cls.get_userid(log_type, crawler, "wangkun")) + "></at>\n" + content,
+                            "content": users + text,
                             "tag": "lark_md"
                         }
                     }, {
                         "actions": [{
                             "tag": "button",
                             "text": {
-                                "content": "快手爬虫表",
+                                "content": content,
                                 "tag": "lark_md"
                             },
-                            "url": "https://w42nne6hzg.feishu.cn/sheets/shtcnICEfaw9llDNQkKgdymM1xf",
+                            "url": sheet_url,
                             "type": "default",
                             "value": {}
-                        },
-                            {
-                                "tag": "button",
-                                "text": {
-                                    "content": "快手Jenkins",
-                                    "tag": "lark_md"
-                                },
-                                "url": "https://jenkins-on.yishihui.com/view/%E7%88%AC%E8%99%AB-Spider/job/%E5%BF%"
-                                       "AB%E6%89%8B%E5%B0%8F%E7%A8%8B%E5%BA%8F-%E8%A7%86%E9%A2%91%E7%88%AC%E5%8F%96/",
-                                "type": "default",
-                                "value": {}
-                            }
-
-                        ],
+                        }],
                         "tag": "action"
                     }],
                     "header": {
                         "title": {
-                            "content": "📣有新的报警,请注意查处",
+                            "content": "📣您有新的报警,请注意查收",
                             "tag": "plain_text"
                         }
                     }
@@ -487,10 +654,10 @@ class Feishu:
             })
             urllib3.disable_warnings()
             r = requests.post(url, headers=headers, data=data, verify=False, proxies=proxies)
-            Common.logger(log_type, crawler).info("触发机器人消息:{}, {}", r, r.json()["StatusMessage"])
+            Common.logger(log_type, crawler).info(f'触发机器人消息:{r}, {r.json()["StatusMessage"]}')
         except Exception as e:
-            Common.logger(log_type, crawler).error("bot异常:{}", e)
+            Common.logger(log_type, crawler).error(f"bot异常:{e}\n")
 
 
 if __name__ == "__main__":
-    pass
+    Feishu.bot('follow', 'xigua', '测试一下,请忽略 ~')

BIN
kanyikan/.DS_Store


+ 3 - 3
weixinzhishu/weixinzhishu_main/get_weixinzhishu.py

@@ -194,8 +194,8 @@ class Weixinzhishu:
         wechat_key = cls.get_wechat_key(log_type, crawler)
         search_key = wechat_key[0]
         openid = wechat_key[-1]
-        end_ymd = (date.today() + timedelta(days=0)).strftime("%Y%m%d")
-        start_ymd = (date.today() + timedelta(days=-7)).strftime("%Y%m%d")
+        end_ymd = (date.today() + timedelta(days=-5)).strftime("%Y%m%d")
+        start_ymd = (date.today() + timedelta(days=-9)).strftime("%Y%m%d")
         url = "https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex"
         payload = json.dumps({
             "openid": openid,
@@ -240,6 +240,6 @@ class Weixinzhishu:
 
 
 if __name__ == "__main__":
-    Weixinzhishu.get_score_test('weixin', 'weixinzhishu', 1 , "社保")
+    Weixinzhishu.get_score_test('weixin', 'weixinzhishu', 1 , "俞仁波")
 
     pass

BIN
xigua/.DS_Store


BIN
xigua/logs/.DS_Store


BIN
xigua/videos/.DS_Store


+ 119 - 0
xigua/xigua_follow/insert_videos.py

@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/2/23
+import json
+import os
+import sys
+
+sys.path.append(os.getcwd())
+from common.common import Common
+from common.db import MysqlHelper
+from common.feishu import Feishu
+
+
+class Insert:
+    @classmethod
+    def insert_video_from_feishu_to_mysql(cls, log_type, crawler, env, machine):
+        xigua_sheetid_list = ["QOWqMo", "3Ul6wZ", "e075e9"]
+        for sheetid in xigua_sheetid_list:
+            xigua_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
+            for i in range(1, len(xigua_sheet)):
+            # for i in range(1, 3):
+                if xigua_sheet[i][5] is None:
+                    continue
+                video_id = xigua_sheet[i][9].replace("https://admin.piaoquantv.com/cms/post-detail/", "").replace("/info", "")
+                if video_id == "None":
+                    continue
+                video_id = int(video_id)
+                user_id = 0
+                out_user_id = str(xigua_sheet[i][19])
+                platform = "西瓜视频"
+                strategy = "定向爬虫策略"
+                out_video_id = str(xigua_sheet[i][8])
+                video_title = str(xigua_sheet[i][7])
+                cover_url = str(xigua_sheet[i][21])
+                video_url = str(xigua_sheet[i][22])
+                duration = int(xigua_sheet[i][15])
+                publish_time = str(xigua_sheet[i][17].replace("/", "-"))
+                play_cnt = int(xigua_sheet[i][11])
+                like_cnt = int(xigua_sheet[i][13])
+                share_cnt = int(xigua_sheet[i][14])
+                # collection_cnt = 0
+                comment_cnt = int(xigua_sheet[i][12])
+                crawler_rule = json.dumps({"play_cnt": 0, "comment_cnt": 0, "like_cnt": 0, "duration": 60, "publish_time": 10, "video_width": 720, "video_height": 720})
+                width = int(xigua_sheet[i][16].split("*")[0])
+                height = int(xigua_sheet[i][16].split("*")[1])
+
+                # print(f"video_id:{video_id}, type:{type(video_id)}")
+                # print(f"user_id:{user_id}, type:{type(user_id)}")
+                # print(f"out_user_id:{out_user_id}, type:{type(out_user_id)}")
+                # print(f"platform:{platform}, type:{type(platform)}")
+                # print(f"strategy:{strategy}, type:{type(strategy)}")
+                # print(f"out_video_id:{out_video_id}, type:{type(out_video_id)}")
+                # print(f"video_title:{video_title}, type:{type(video_title)}")
+                # print(f"cover_url:{cover_url}, type:{type(cover_url)}")
+                # print(f"video_url:{video_url}, type:{type(video_url)}")
+                # print(f"duration:{duration}, type:{type(duration)}")
+                # print(f"publish_time:{publish_time}, type:{type(publish_time)}")
+                # print(f"play_cnt:{play_cnt}, type:{type(play_cnt)}")
+                # print(f"like_cnt:{like_cnt}, type:{type(like_cnt)}")
+                # print(f"share_cnt:{share_cnt}, type:{type(share_cnt)}")
+                # print(f"collection_cnt:{collection_cnt}, type:{type(collection_cnt)}")
+                # print(f"comment_cnt:{comment_cnt}, type:{type(comment_cnt)}")
+                # print(f"crawler_rule:{crawler_rule}, type:{type(crawler_rule)}")
+                # print(f"width:{width}, type:{type(width)}")
+                # print(f"height:{height}, type:{type(height)}\n")
+
+                select_sql = f""" select * from crawler_video where platform="{platform}" and out_video_id="{out_video_id}" """
+                Common.logger(log_type, crawler).info(f"select_sql:{select_sql}")
+                repeat_video = MysqlHelper.get_values(log_type, crawler, select_sql, env, machine)
+                Common.logger(log_type, crawler).info(f"repeat_video:{repeat_video}")
+
+                if repeat_video is not None and len(repeat_video) != 0:
+                    Common.logger(log_type, crawler).info(f"{video_title} 已存在数据库中\n")
+                else:
+                    # 视频信息保存数据库
+                    insert_sql = f""" insert into crawler_video(video_id,
+                                    user_id,
+                                    out_user_id,
+                                    platform,
+                                    strategy,
+                                    out_video_id,
+                                    video_title,
+                                    cover_url,
+                                    video_url,
+                                    duration,
+                                    publish_time,
+                                    play_cnt,
+                                    like_cnt,
+                                    share_cnt,
+                                    comment_cnt,
+                                    crawler_rule,
+                                    width,
+                                    height)
+                                    values({video_id},
+                                    {user_id},
+                                    "{out_user_id}",
+                                    "{platform}",
+                                    "{strategy}",
+                                    "{out_video_id}",
+                                    "{video_title}",
+                                    "{cover_url}",
+                                    "{video_url}",
+                                    {duration},
+                                    "{publish_time}",
+                                    {play_cnt},
+                                    {like_cnt},
+                                    {share_cnt},
+                                    {comment_cnt},
+                                    '{crawler_rule}',
+                                    {width},
+                                    {height}) """
+                    Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+                    MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
+                    Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
+
+
+
+if __name__ == "__main__":
+    Insert.insert_video_from_feishu_to_mysql("insert", "xigua", "dev", "local")

+ 0 - 3
xigua/xigua_follow/xigua_demo.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/17

+ 164 - 89
xigua/xigua_follow/xigua_follow.py

@@ -1,10 +1,8 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/2/17
-import json
-
-from lxml import etree
 import base64
+import json
 import os
 import random
 import shutil
@@ -18,6 +16,7 @@ from selenium.webdriver import DesiredCapabilities
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from seleniumwire import webdriver
+from lxml import etree
 
 sys.path.append(os.getcwd())
 from common.db import MysqlHelper
@@ -35,12 +34,44 @@ class Follow:
     platform = "西瓜视频"
     tag = "西瓜视频爬虫,定向爬虫策略"
 
+    @classmethod
+    def get_rule(cls, log_type, crawler):
+        try:
+            while True:
+                rule_sheet = Feishu.get_values_batch(log_type, crawler, "4kxd31")
+                if rule_sheet is None:
+                    Common.logger(log_type, crawler).warning("rule_sheet is None! 10秒后重新获取")
+                    time.sleep(10)
+                    continue
+                rule_dict = {
+                    "play_cnt": int(rule_sheet[1][2]),
+                    "comment_cnt": int(rule_sheet[2][2]),
+                    "like_cnt": int(rule_sheet[3][2]),
+                    "duration": int(rule_sheet[4][2]),
+                    "publish_time": int(rule_sheet[5][2]),
+                    "video_width": int(rule_sheet[6][2]),
+                    "video_height": int(rule_sheet[7][2]),
+                }
+                return rule_dict
+        except Exception as e:
+            Common.logger(log_type, crawler).error(f"get_rule:{e}\n")
+
     # 下载规则
-    @staticmethod
-    def download_rule(duration, width, height):
-        if int(duration) >= 60:
-            if int(width) >= 720 or int(height) >= 720:
-                return True
+    @classmethod
+    def download_rule(cls, video_info_dict, rule_dict):
+        if video_info_dict['play_cnt'] >= rule_dict['play_cnt']:
+            if video_info_dict['comment_cnt'] >= rule_dict['comment_cnt']:
+                if video_info_dict['like_cnt'] >= rule_dict['like_cnt']:
+                    if video_info_dict['duration'] >= rule_dict['duration']:
+                        if video_info_dict['video_width'] >= rule_dict['video_width'] \
+                                or video_info_dict['video_height'] >= rule_dict['video_height']:
+                            return True
+                        else:
+                            return False
+                    else:
+                        return False
+                else:
+                    return False
             else:
                 return False
         else:
@@ -50,15 +81,19 @@ class Follow:
     @classmethod
     def filter_words(cls, log_type, crawler):
         try:
-            filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
-            filter_words_list = []
-            for x in filter_words_sheet:
-                for y in x:
-                    if y is None:
-                        pass
-                    else:
-                        filter_words_list.append(y)
-            return filter_words_list
+            while True:
+                filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
+                if filter_words_sheet is None:
+                    Common.logger(log_type, crawler).warning(f"filter_words_sheet:{filter_words_sheet} 10秒钟后重试")
+                    time.sleep(10)
+                    continue
+                filter_words_list = []
+                for x in filter_words_sheet:
+                    for y in x:
+                        if y is None:
+                            pass
+                        else:
+                            filter_words_list.append(y)
+                return filter_words_list
         except Exception as e:
             Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')
 
@@ -100,53 +135,56 @@ class Follow:
         except Exception as e:
             Common.logger(log_type, crawler).error(f"get_out_user_info:{e}\n")
 
-
     # 获取用户信息(字典格式). 注意:部分 user_id 字符类型是 int / str
     @classmethod
     def get_user_list(cls, log_type, crawler, sheetid, env, machine):
         try:
-            user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
-            our_user_list = []
-            for i in range(1, len(user_sheet)):
-                out_uid = user_sheet[i][2]
-                user_name = user_sheet[i][3]
-                our_uid = user_sheet[i][6]
-                our_user_link = user_sheet[i][7]
-                if out_uid is None or user_name is None:
-                    Common.logger(log_type, crawler).info("空行\n")
-                else:
-                    Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
-                    if our_uid is None:
-                        out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
-                        out_user_dict = {
-                            "out_uid": out_uid,
-                            "user_name": user_name,
-                            "out_avatar_url": out_user_info["out_avatar_url"],
-                            "out_create_time": '',
-                            "out_tag": '',
-                            "out_play_cnt": 0,
-                            "out_fans": out_user_info["out_fans"],
-                            "out_follow": out_user_info["out_follow"],
-                            "out_friend": 0,
-                            "out_like": out_user_info["out_like"],
-                            "platform": cls.platform,
-                            "tag": cls.tag,
-                        }
-                        our_user_dict = Users.create_user(log_type=log_type, crawler=crawler, out_user_dict=out_user_dict, env=env, machine=machine)
-                        our_uid = our_user_dict['our_uid']
-                        our_user_link = our_user_dict['our_user_link']
-                        Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
-                        Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
-                        our_user_list.append(our_user_dict)
+            while True:
+                user_sheet = Feishu.get_values_batch(log_type, crawler, sheetid)
+                if user_sheet is None:
+                    Common.logger(log_type, crawler).warning(f"user_sheet:{user_sheet} 10秒钟后重试")
+                    time.sleep(10)
+                    continue
+                our_user_list = []
+                for i in range(1, len(user_sheet)):
+                    out_uid = user_sheet[i][2]
+                    user_name = user_sheet[i][3]
+                    our_uid = user_sheet[i][6]
+                    our_user_link = user_sheet[i][7]
+                    if out_uid is None or user_name is None:
+                        Common.logger(log_type, crawler).info("空行\n")
                     else:
-                        our_user_dict = {
-                            'out_uid': out_uid,
-                            'user_name': user_name,
-                            'our_uid': our_uid,
-                            'our_user_link': our_user_link,
-                        }
-                        our_user_list.append(our_user_dict)
-            return our_user_list
+                        Common.logger(log_type, crawler).info(f"正在更新 {user_name} 用户信息\n")
+                        if our_uid is None:
+                            out_user_info = cls.get_out_user_info(log_type, crawler, out_uid)
+                            out_user_dict = {
+                                "out_uid": out_uid,
+                                "user_name": user_name,
+                                "out_avatar_url": out_user_info["out_avatar_url"],
+                                "out_create_time": '',
+                                "out_tag": '',
+                                "out_play_cnt": 0,
+                                "out_fans": out_user_info["out_fans"],
+                                "out_follow": out_user_info["out_follow"],
+                                "out_friend": 0,
+                                "out_like": out_user_info["out_like"],
+                                "platform": cls.platform,
+                                "tag": cls.tag,
+                            }
+                            our_user_dict = Users.create_user(log_type=log_type, crawler=crawler, out_user_dict=out_user_dict, env=env, machine=machine)
+                            our_uid = our_user_dict['our_uid']
+                            our_user_link = our_user_dict['our_user_link']
+                            Feishu.update_values(log_type, crawler, sheetid, f'G{i + 1}:H{i + 1}', [[our_uid, our_user_link]])
+                            Common.logger(log_type, crawler).info(f'站内用户信息写入飞书成功!\n')
+                            our_user_list.append(our_user_dict)
+                        else:
+                            our_user_dict = {
+                                'out_uid': out_uid,
+                                'user_name': user_name,
+                                'our_uid': our_uid,
+                                'our_user_link': our_user_link,
+                            }
+                            our_user_list.append(our_user_dict)
+                return our_user_list
         except Exception as e:
             Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')
 
@@ -695,7 +733,7 @@ class Follow:
                     if 'video_duration' not in videoList[i]:
                         video_duration = 0
                     else:
-                        video_duration = videoList[i]['video_duration']
+                        video_duration = int(videoList[i]['video_duration'])
 
                     # send_time
                     if 'publish_time' not in videoList[i]:
@@ -745,12 +783,20 @@ class Follow:
                     else:
                         cover_url = videoList[i]['video_detail_info']['detail_video_large_image']['url_list'][0]['url']
 
+                    while True:
+                        rule_dict = cls.get_rule(log_type, crawler)
+                        if rule_dict is None:
+                            Common.logger(log_type, crawler).warning(f"rule_dict:{rule_dict}, 10秒后重试")
+                            time.sleep(10)
+                        else:
+                            break
+
                     if gid == 0 or video_id == 0 or cover_url == 0:
                         Common.logger(log_type, crawler).info('无效视频\n')
-                    elif is_top is True and int(time.time()) - int(publish_time) > 3600 * 24 * 10:
-                        Common.logger(log_type, crawler).info(f'置顶视频,且发布时间超过10天:{publish_time_str}\n')
-                    elif int(time.time()) - int(publish_time) > 3600 * 24 * 10:
-                        Common.logger(log_type, crawler).info(f'发布时间超过10天:{publish_time_str}\n')
+                    elif is_top is True and int(time.time()) - int(publish_time) > 3600 * 24 * rule_dict['publish_time']:
+                        Common.logger(log_type, crawler).info(f'置顶视频,且发布时间:{publish_time_str}超过{rule_dict["publish_time"]}天\n')
+                    elif int(time.time()) - int(publish_time) > 3600 * 24 * rule_dict['publish_time']:
+                        Common.logger(log_type, crawler).info(f'发布时间:{publish_time_str}超过{rule_dict["publish_time"]}天\n')
                         cls.offset = 0
                         return
                     else:
@@ -782,31 +828,40 @@ class Follow:
                                       'session': signature}
                         for k, v in video_dict.items():
                             Common.logger(log_type, crawler).info(f"{k}:{v}")
-                        # cls.download_publish(log_type=log_type,
-                        #                      crawler=crawler,
-                        #                      video_dict=video_dict,
-                        #                      strategy=strategy,
-                        #                      our_uid=our_uid,
-                        #                      oss_endpoint=oss_endpoint,
-                        #                      env=env,
-                        #                      machine=machine)
+                        cls.download_publish(log_type=log_type,
+                                             crawler=crawler,
+                                             video_dict=video_dict,
+                                             rule_dict=rule_dict,
+                                             strategy=strategy,
+                                             our_uid=our_uid,
+                                             oss_endpoint=oss_endpoint,
+                                             env=env,
+                                             machine=machine)
+
+    @classmethod
+    def repeat_video(cls, log_type, crawler, video_id, env, machine):
+        sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
+        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env, machine)
+        return len(repeat_video)
 
     # 下载 / 上传
     @classmethod
-    def download_publish(cls, log_type, crawler, strategy, video_dict, our_uid, oss_endpoint, env, machine):
+    def download_publish(cls, log_type, crawler, strategy, video_dict, rule_dict, our_uid, oss_endpoint, env, machine):
         # try:
-        if cls.download_rule(video_dict['duration'], video_dict['video_width'], video_dict['video_height']) is False:
+        if cls.download_rule(video_dict, rule_dict) is False:
             Common.logger(log_type, crawler).info('不满足抓取规则\n')
         elif any(word if word in video_dict['video_title'] else False for word in cls.filter_words(log_type, crawler)) is True:
             Common.logger(log_type, crawler).info('标题已中过滤词:{}\n', video_dict['video_title'])
-        elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'e075e9') for x in y]:
+        elif cls.repeat_video(log_type, crawler, video_dict['video_id'], env, machine) != 0:
             Common.logger(log_type, crawler).info('视频已下载\n')
-        elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', '3Ul6wZ') for x in y]:
-            Common.logger(log_type, crawler).info('视频已下载\n')
-        elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'QOWqMo') for x in y]:
-            Common.logger(log_type, crawler).info('视频已下载\n')
-        elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'wjhpDs') for x in y]:
-            Common.logger(log_type, crawler).info('视频已存在\n')
+        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'e075e9') for x in y]:
+        #     Common.logger(log_type, crawler).info('视频已下载\n')
+        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', '3Ul6wZ') for x in y]:
+        #     Common.logger(log_type, crawler).info('视频已下载\n')
+        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'QOWqMo') for x in y]:
+        #     Common.logger(log_type, crawler).info('视频已下载\n')
+        # elif str(video_dict['video_id']) in [x for y in Feishu.get_values_batch(log_type, 'xigua', 'wjhpDs') for x in y]:
+        #     Common.logger(log_type, crawler).info('视频已存在\n')
         else:
             # 下载封面
             Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
@@ -865,7 +920,7 @@ class Follow:
             Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
 
             # 视频信息保存数据库
-            sql = f""" insert into crawler_video(video_id,
+            insert_sql = f""" insert into crawler_video(video_id,
                             user_id,
                             out_user_id,
                             platform,
@@ -881,7 +936,7 @@ class Follow:
                             width,
                             height)
                             values({our_video_id},
-                            "{our_uid}",
+                            {our_uid},
                             "{video_dict['user_id']}",
                             "{cls.platform}",
                             "定向爬虫策略",
@@ -892,14 +947,34 @@ class Follow:
                             {int(video_dict['duration'])},
                             "{video_dict['publish_time_str']}",
                             {int(video_dict['play_cnt'])},
-                            "4,5,6",
+                            '{json.dumps(rule_dict)}',
                             {int(video_dict['video_width'])},
                             {int(video_dict['video_height'])}) """
-            MysqlHelper.update_values(log_type, crawler, sql, env, machine)
+            Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+            MysqlHelper.update_values(log_type, crawler, insert_sql, env, machine)
             Common.logger(log_type, crawler).info('视频信息插入数据库成功!\n')
         # except Exception as e:
         #     Common.logger(log_type, crawler).error(f'download_publish异常:{e}\n')
 
+    @classmethod
+    def get_follow_videos(cls, log_type, crawler, strategy, oss_endpoint, env, machine):
+        user_list = cls.get_user_list(log_type=log_type, crawler=crawler, sheetid="5tlTYB", env=env, machine=machine)
+        for user in user_list:
+            out_uid = user["out_uid"]
+            user_name = user["user_name"]
+            our_uid = user["our_uid"]
+            Common.logger(log_type, crawler).info(f"开始抓取 {user_name} 用户主页视频\n")
+            cls.get_videolist(log_type=log_type,
+                              crawler=crawler,
+                              strategy=strategy,
+                              our_uid=our_uid,
+                              out_uid=out_uid,
+                              oss_endpoint=oss_endpoint,
+                              env=env,
+                              machine=machine)
+            cls.offset = 0
+            time.sleep(3)
+
 
 if __name__ == '__main__':
     # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
@@ -912,8 +987,8 @@ if __name__ == '__main__':
     #                      env="dev",
     #                      machine="local")
     # print(Follow.random_signature())
-    user_list = Follow.get_user_list(log_type="follow", crawler="xigua", sheetid="5tlTYB", env="dev", machine="local")
-    print(len(user_list))
-    for user in user_list:
-        print(user)
+    rule = Follow.get_rule("follow", "xigua")
+    print(type(rule))
+    print(type(json.dumps(rule)))
+    print(json.dumps(rule))
     pass

+ 36 - 0
xigua/xigua_main/run_xigua_follow.py

@@ -1,3 +1,39 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2023/2/17
+import argparse
+import os
+import sys
+import time
+
+sys.path.append(os.getcwd())
+from common.common import Common
+from xigua.xigua_follow.xigua_follow import Follow
+
+
+def main(log_type, crawler, strategy, oss_endpoint, env, machine):
+    while True:
+        Common.logger(log_type, crawler).info('开始抓取 西瓜视频 定向榜\n')
+        Follow.get_follow_videos(log_type, crawler, strategy, oss_endpoint, env, machine)
+        Common.del_logs(log_type, crawler)
+        Common.logger(log_type, crawler).info('抓取完一轮,休眠 1 分钟\n')
+        time.sleep(60)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()  # create the argument parser
+    parser.add_argument('--log_type', type=str)  # add each expected argument (type noted where needed)
+    parser.add_argument('--crawler')
+    parser.add_argument('--strategy')
+    parser.add_argument('--our_uid')
+    parser.add_argument('--oss_endpoint')
+    parser.add_argument('--env')
+    parser.add_argument('--machine')
+    args = parser.parse_args()  # parse the values passed on the command line
+    # print(args)
+    main(log_type=args.log_type,
+         crawler=args.crawler,
+         strategy=args.strategy,
+         oss_endpoint=args.oss_endpoint,
+         env=args.env,
+         machine=args.machine)
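
For local debugging, the new entry point can presumably also be started without the main.sh wrapper — a sketch assuming python3 is on PATH and the repository root is the working directory (the script adds os.getcwd() to sys.path), with the same flags the README shows for the local xigua run:
```
python3 ./xigua/xigua_main/run_xigua_follow.py --log_type="follow" --crawler="xigua" --strategy="定向爬虫策略" --oss_endpoint="out" --env="dev" --machine="local"
```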

BIN
youtube/.DS_Store


+ 0 - 3
youtube/youtube_main/run_youtube_search.py

@@ -1,3 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Author: wangkun
-# @Time: 2023/2/3