wangkun 1 year ago
parent
commit
08f0f609a9

+ 40 - 0
README.MD

@@ -161,4 +161,44 @@ ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
+#### Generate requirements.txt
+```commandline
+cd ./piaoquan_crawler && pipreqs ./ --force
+
+# pip3 install Appium-Python-Client
+Appium_Python_Client==2.10.1
+# Requires a proxy (GFW circumvention), pip3 install git+https://github.com/pyatom/pyatom/
+atomac==1.2.0
+# pip3 install ffmpeg-python
+ffmpeg==1.4
+# pip3 install loguru
+loguru==0.6.0
+# pip3 install lxml
+lxml==4.9.1
+# pip3 install mq_http_sdk. If you are using SDK v1.0.0, you need Python >= 2.5 and < 3.0; if you are using an SDK version later than v1.0.0, you need Python 2.5 or later.
+mq_http_sdk==1.0.3
+# sudo pip3 install oss2
+oss2==2.15.0
+# pip3 install psutil
+psutil==5.9.2
+# pip3 install PyExecJS
+PyExecJS==1.5.1
+# pip3 install PyMysql
+PyMySQL==1.0.2
+# pip3 install redis
+redis==4.5.1
+# pip3 install requests
+requests==2.27.1
+# pip3 install selenium
+selenium==4.9.1
+# pip3 install urllib3
+urllib3==1.26.9
+# pip3 install jieba
+jieba==0.42.1
+# pip3 install workalendar
+workalendar==17.0.0
+# pip3 install aliyun_python_sdk
+aliyun_python_sdk==2.2.0
 ```
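With the pinned list regenerated, the dependencies can be installed in one step from the repository root (standard pip usage, shown as a minimal sketch):

```commandline
cd ./piaoquan_crawler && pip3 install -r requirements.txt
```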

+ 58 - 1
common/common.py

@@ -4,6 +4,7 @@
 """
 公共方法,包含:生成log / 删除log / 下载方法 / 删除 weixinzhishu_chlsfiles / 过滤词库 / 保存视频信息至本地 txt / 翻译 / ffmpeg
 """
+from aliyun.log import LogClient, PutLogsRequest, LogItem
 from datetime import date, timedelta
 from loguru import logger
 from hashlib import md5
@@ -55,6 +56,63 @@ class Common:
 
         return logger
 
+    # Write logs to Aliyun Log Service
+    @staticmethod
+    def logging(log_type, crawler, env, message):
+        """
+        Write logs to Aliyun Log Service
+        Dev project: https://sls.console.aliyun.com/lognext/project/crawler-log-dev/logsearch/crawler-log-dev
+        Prod project: https://sls.console.aliyun.com/lognext/project/crawler-log-prod/logsearch/crawler-log-prod
+        :param log_type: crawler strategy
+        :param crawler: which crawler
+        :param env: environment (dev or prod)
+        :param message: log content
+        :return: None
+        """
+        # Aliyun Log Service access settings
+        accessKeyId = 'LTAIWYUujJAm7CbH'
+        accessKey = 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P'
+        if env == "dev":
+            project = 'crawler-log-dev'
+            logstore = 'crawler-log-dev'
+            endpoint = 'cn-hangzhou.log.aliyuncs.com'
+        else:
+            project = 'crawler-log-prod'
+            logstore = 'crawler-log-prod'
+            endpoint = 'cn-hangzhou-intranet.log.aliyuncs.com'
+
+        # Create a LogClient instance
+        # print("创建 LogClient 实例")
+        client = LogClient(endpoint, accessKeyId, accessKey)
+
+
+        if '\r' in message:
+            message = message.replace('\r', ' ')
+        if '\n' in message:
+            message = message.replace('\n', ' ')
+        # print(f"message:{message}")
+        log_group = []
+        log_item = LogItem()
+        # print(f"log_item:{type(log_item), log_item}")
+        contents = [(f"{crawler}-{log_type}", message)]
+        # print(f"contents:{type(contents), contents}")
+        log_item.set_contents(contents)
+        log_group.append(log_item)
+        # print(f"log_group:{type(log_group), log_group}")
+
+        # Write the log entry
+        # print("开始PutLogsRequest")
+        request = PutLogsRequest(project=project,
+                                 logstore=logstore,
+                                 topic="",
+                                 source="",
+                                 logitems=log_group,
+                                 compress=False)
+        # print(f"request:{request}")
+        # print("put_logs...")
+        client.put_logs(request)
+        # print("put_logs...done")
+
     # 清除日志,保留最近 10 个文件
     @classmethod
     def del_logs(cls, log_type, crawler):
@@ -342,4 +400,3 @@ class Common:
 if __name__ == "__main__":
     Common.tunnel_proxies()
     pass
-
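A minimal usage sketch of the new Common.logging helper next to the existing Common.logger, mirroring the call sites added in the files below (the argument values are illustrative):

```python
from common.common import Common

# Existing behaviour: local file logging via loguru
Common.logger("recommend", "xigua").info("开始抓取 西瓜推荐")
# New: mirror the same message to Aliyun Log Service; env="dev" selects the crawler-log-dev project
Common.logging("recommend", "xigua", "dev", "开始抓取 西瓜推荐")
```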

+ 4 - 19
requirements.txt

@@ -1,32 +1,17 @@
-# pip3 install Appium-Python-Client
-Appium_Python_Client==2.10.1
-# 翻墙, pip3 install git+https://github.com/pyatom/pyatom/
+aliyun_python_sdk==2.2.0
+Appium_Python_Client==2.11.0
 atomac==1.2.0
-# pip3 install ffmpeg-python
 ffmpeg==1.4
-# pip3 install loguru
+jieba==0.42.1
 loguru==0.6.0
-# pip3 install lxml
 lxml==4.9.1
-# pip3 install mq_http_sdk, 若您使用的SDK版本为v1.0.0,您需要安装大于等于2.5且小于3.0版本的Python。若您使用的SDK版本大于v1.0.0,您需要安装2.5及以上版本的Python。
 mq_http_sdk==1.0.3
-# sudo pip3 install oss2
 oss2==2.15.0
-# pip3 install psutil
 psutil==5.9.2
-# pip3 install PyExecJS
 PyExecJS==1.5.1
-# pip3 install PyMysql
 PyMySQL==1.0.2
-# pip3 install redis
 redis==4.5.1
-# pip3 install requests
 requests==2.27.1
-# pip3 install selenium
-selenium==4.9.1
-# pip3 install urllib3
+selenium==4.10.0
 urllib3==1.26.9
-# pip3 install jieba
-jieba==0.42.1
-# pip3 install workalendar
 workalendar==17.0.0

+ 19 - 0
xigua/xigua_author/xigua_author_scheduling.py

@@ -574,12 +574,15 @@ class XiguaauthorScheduling:
             offset += 30
             if response.status_code != 200:
                 Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                 return
             elif 'data' not in response.text:
                 Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                 return
             elif not response.json()["data"]['videoList']:
                 Common.logger(log_type, crawler).warning(f"没有更多数据啦~:{response.json()}\n")
+                Common.logging(log_type, crawler, env, f"没有更多数据啦~:{response.json()}\n")
                 return
             feeds = response.json()['data']['videoList']
             for i in range(len(feeds)):
@@ -587,19 +590,24 @@ class XiguaauthorScheduling:
                     item_id = feeds[i].get("item_id", "")
                     if item_id == "":
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
 
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                         Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
+                        Common.logging(log_type, crawler, env, f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                         return
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     elif any(str(word) if str(word) in video_dict["video_title"] else False
                              for word in get_config_from_mysql(log_type=log_type,
                                                                source=crawler,
@@ -607,8 +615,10 @@ class XiguaauthorScheduling:
                                                                text="filter",
                                                                action="")) is True:
                         Common.logger(log_type, crawler).info('已中过滤词\n')
+                        Common.logging(log_type, crawler, env, "已中过滤词\n")
                     elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, "视频已下载\n")
                     else:
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
@@ -618,6 +628,7 @@ class XiguaauthorScheduling:
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
         sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
@@ -639,11 +650,13 @@ class XiguaauthorScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
@@ -652,6 +665,7 @@ class XiguaauthorScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -712,8 +726,10 @@ class XiguaauthorScheduling:
                         {int(video_dict['video_width'])},
                         {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env)
         Common.logger(log_type, crawler).info('视频信息写入数据库成功')
+        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
 
         # 视频写入飞书
         Feishu.insert_columns(log_type, crawler, "e075e9", "ROWS", 1, 2)
@@ -740,12 +756,14 @@ class XiguaauthorScheduling:
         time.sleep(0.5)
         Feishu.update_values(log_type, crawler, "e075e9", "F2:Z2", values)
         Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
+        Common.logging(log_type, crawler, env, f"视频已保存至云文档\n")
 
     @classmethod
     def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
         for user_dict in user_list:
             try:
                 Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 用户主页视频\n")
+                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['nick_name']} 用户主页视频\n")
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,
                                   user_dict=user_dict,
@@ -753,6 +771,7 @@ class XiguaauthorScheduling:
                                   env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user_dict['nick_name']}视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user_dict['nick_name']}视频时异常:{e}\n")
 
 
 if __name__ == '__main__':

+ 20 - 0
xigua/xigua_main/run_xg_author.py

@@ -23,6 +23,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                           f'WaitSeconds:{wait_seconds}\n'
+                                           f'TopicName:{topic_name}\n'
+                                           f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -38,6 +42,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                       f"MessageId:{msg.message_id}\n"
+                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                       f"MessageTag:{msg.message_tag}\n"
+                                                       f"ConsumedTimes:{msg.consumed_times}\n"
+                                                       f"PublishTime:{msg.publish_time}\n"
+                                                       f"Body:{msg.message_body}\n"
+                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                       f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -48,9 +62,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                 select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
                 user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguaauthorScheduling.get_author_videos(log_type=log_type,
                                                         crawler=crawler,
                                                         rule_dict=rule_dict,
@@ -58,14 +75,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                         env=env)
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 20 - 0
xigua/xigua_main/run_xg_recommend.py

@@ -24,6 +24,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -39,6 +43,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -53,9 +67,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                     our_uid_list.append(user["uid"])
                 our_uid = random.choice(our_uid_list)
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguarecommendScheduling.get_videoList(log_type=log_type,
                                                        crawler=crawler,
                                                        rule_dict=rule_dict,
@@ -63,14 +80,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                        env=env)
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 24 - 0
xigua/xigua_main/run_xg_recommend_dev.py

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/12
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from xigua.xigua_recommend.xigua_recommend_scheduling import XiguarecommendScheduling
+
+
+def main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info(f'开始抓取 西瓜推荐\n')
+    Common.logging(log_type, crawler, env, "开始抓取 西瓜推荐\n")
+    XiguarecommendScheduling.get_videoList(log_type=log_type,
+                                           crawler=crawler,
+                                           rule_dict={},
+                                           our_uid=6267140,
+                                           env=env)
+    Common.logger(log_type, crawler).info("抓取一轮结束\n")
+    Common.logging(log_type, crawler, env, "抓取一轮结束\n")
+
+
+if __name__ == "__main__":
+    main("recommend", "xigua", "dev")
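Since the new runner appends os.getcwd() to sys.path, it is presumably meant to be launched from the repository root, e.g.:

```commandline
cd ./piaoquan_crawler && python3 xigua/xigua_main/run_xg_recommend_dev.py
```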

+ 20 - 0
xigua/xigua_main/run_xg_search.py

@@ -23,6 +23,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                           f'WaitSeconds:{wait_seconds}\n'
+                                           f'TopicName:{topic_name}\n'
+                                           f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -38,6 +42,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                       f"MessageId:{msg.message_id}\n"
+                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                       f"MessageTag:{msg.message_tag}\n"
+                                                       f"ConsumedTimes:{msg.consumed_times}\n"
+                                                       f"PublishTime:{msg.publish_time}\n"
+                                                       f"Body:{msg.message_body}\n"
+                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                       f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -48,9 +62,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                 select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
                 user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguasearchScheduling.get_search_videos(log_type=log_type,
                                                         crawler=crawler,
                                                         rule_dict=rule_dict,
@@ -60,14 +77,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                 os.system("ps aux | grep chromedriver | grep -v grep | awk '{print $2}' | xargs kill -9")
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 21 - 0
xigua/xigua_recommend/xigua_recommend_scheduling.py

@@ -605,10 +605,12 @@ class XiguarecommendScheduling:
         queryCount = 1
         while True:
             Common.logger(log_type, crawler).info(f"正在抓取第{queryCount}页视频")
+            Common.logging(log_type, crawler, env, f"正在抓取第{queryCount}页视频")
             try:
                 signature = cls.get_signature(env)
                 if signature is None:
                     Common.logger(log_type, crawler).warning(f"signature:{signature}")
+                    Common.logging(log_type, crawler, env, f"signature:{signature}")
                     time.sleep(1)
                     continue
                 url = "https://www.ixigua.com/api/feedv2/feedById?"
@@ -654,18 +656,23 @@ class XiguarecommendScheduling:
                 queryCount += 1
                 if response.status_code != 200:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                     return
                 elif 'data' not in response.text:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                     return
                 elif 'channelFeed' not in response.json()['data']:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.json()}\n")
                     return
                 elif 'Data' not in response.json()['data']['channelFeed']:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.json()}\n")
                     return
                 elif len(response.json()['data']['channelFeed']['Data']) == 0:
                     Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"没有更多数据啦 ~ :{response.json()}\n")
                     return
                 else:
                     feeds = response.json()['data']['channelFeed']['Data']
@@ -674,15 +681,19 @@ class XiguarecommendScheduling:
                             item_id = feeds[i].get("data", {}).get("item_id", "")
                             if item_id == "":
                                 Common.logger(log_type, crawler).info("无效视频\n")
+                                Common.logging(log_type, crawler, env, "无效视频\n")
                                 continue
                             video_dict = cls.get_video_info(log_type, crawler, item_id)
                             if video_dict is None:
                                 Common.logger(log_type, crawler).info("无效视频\n")
+                                Common.logging(log_type, crawler, env, "无效视频\n")
                                 continue
                             for k, v in video_dict.items():
                                 Common.logger(log_type, crawler).info(f"{k}:{v}")
+                            Common.logging(log_type, crawler, env, f"{video_dict}")
                             if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                                 Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                                Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                             elif any(str(word) if str(word) in video_dict["video_title"] else False
                                      for word in get_config_from_mysql(log_type=log_type,
                                                                        source=crawler,
@@ -690,8 +701,10 @@ class XiguarecommendScheduling:
                                                                        text="filter",
                                                                        action="")) is True:
                                 Common.logger(log_type, crawler).info('已中过滤词\n')
+                                Common.logging(log_type, crawler, env, "已中过滤词\n")
                             elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                                 Common.logger(log_type, crawler).info('视频已下载\n')
+                                Common.logging(log_type, crawler, env, "视频已下载\n")
                             else:
                                 cls.download_publish(log_type=log_type,
                                                      crawler=crawler,
@@ -701,8 +714,10 @@ class XiguarecommendScheduling:
                                                      env=env)
                         except Exception as e:
                             Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
+                            Common.logging(log_type, crawler, env, f"抓取单条视频时异常:{e}\n")
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取第{queryCount}页时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取第{queryCount}页时异常:{e}\n")
 
     @classmethod
     def download_publish(cls, log_type, crawler, our_uid, video_dict, rule_dict, env):
@@ -718,11 +733,13 @@ class XiguarecommendScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
@@ -731,6 +748,7 @@ class XiguarecommendScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -791,8 +809,10 @@ class XiguarecommendScheduling:
                                         {int(video_dict['video_width'])},
                                         {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
         Common.logger(log_type, crawler).info('视频信息写入数据库成功')
+        Common.logging(log_type, crawler, env, f"视频信息写入数据库成功")
 
         # 视频写入飞书
         Feishu.insert_columns(log_type, crawler, "1iKGF1", "ROWS", 1, 2)
@@ -819,6 +839,7 @@ class XiguarecommendScheduling:
         time.sleep(1)
         Feishu.update_values(log_type, 'xigua', "1iKGF1", "F2:Z2", values)
         Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
+        Common.logging(log_type, crawler, env, f"视频已保存至云文档\n")
 
 
 if __name__ == "__main__":

+ 20 - 0
xigua/xigua_search/xigua_search_scheduling.py

@@ -565,6 +565,7 @@ class XiguasearchScheduling:
         driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info(f"打开搜索页:{user_dict['link']}")
+        Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
         driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
         time.sleep(3)
         # driver.get_screenshot_as_file(f"./{crawler}/logs/打开搜索页.jpg")
@@ -594,20 +595,24 @@ class XiguasearchScheduling:
             video_element_temp = video_elements[index:]
             if len(video_element_temp) == 0:
                 Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
+                Common.logging(log_type, crawler, env, '到底啦~~~~~~~~~~~~~\n')
                 driver.quit()
                 return
             for i, video_element in enumerate(video_element_temp):
                 try:
                     if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
                         Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
+                        Common.logging(log_type, crawler, env, f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
                         driver.quit()
                         return
                     if video_element is None:
                         Common.logger(log_type, crawler).info('到底啦~\n')
+                        Common.logging(log_type, crawler, env, '到底啦~\n')
                         driver.quit()
                         return
                     num += 1
                     Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
+                    Common.logging(log_type, crawler, env, f'拖动"视频"列表第{num}个至屏幕中间')
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
                     time.sleep(3)
                     # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
@@ -616,15 +621,18 @@ class XiguasearchScheduling:
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                     #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                     #     driver.quit()
                     #     return
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     elif any(str(word) if str(word) in video_dict["video_title"] else False
                              for word in get_config_from_mysql(log_type=log_type,
                                                                source=crawler,
@@ -632,8 +640,10 @@ class XiguasearchScheduling:
                                                                text="filter",
                                                                action="")) is True:
                         Common.logger(log_type, crawler).info('已中过滤词\n')
+                        Common.logging(log_type, crawler, env, '已中过滤词\n')
                     elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
@@ -643,8 +653,10 @@ class XiguasearchScheduling:
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
 
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠10秒\n')
+            Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠10秒\n')
             time.sleep(10)
             index = index + len(video_element_temp)
 
@@ -672,11 +684,13 @@ class XiguasearchScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover',
@@ -686,6 +700,7 @@ class XiguasearchScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -746,9 +761,11 @@ class XiguasearchScheduling:
                                 {int(video_dict['video_width'])},
                                 {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
         cls.download_cnt += 1
         Common.logger(log_type, crawler).info("视频信息写入数据库完成")
+        Common.logging(log_type, crawler, env, "视频信息写入数据库完成")
 
         # 视频信息写入飞书
         Feishu.insert_columns(log_type, crawler, "BUNvGC", "ROWS", 1, 2)
@@ -775,6 +792,7 @@ class XiguasearchScheduling:
         time.sleep(0.5)
         Feishu.update_values(log_type, crawler, "BUNvGC", "E2:Z2", values)
         Common.logger(log_type, crawler).info('视频信息写入飞书完成\n')
+        Common.logging(log_type, crawler, env, '视频信息写入飞书完成\n')
 
     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
@@ -782,6 +800,7 @@ class XiguasearchScheduling:
             try:
                 cls.download_cnt = 0
                 Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
+                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['link']} 视频\n")
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,
                                   user_dict=user_dict,
@@ -789,6 +808,7 @@ class XiguasearchScheduling:
                                   env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user_dict['link']}视频时异常:{e}\n")
 
 
 if __name__ == '__main__':