wangkun 1 year ago
parent
commit
08f0f609a9

+ 40 - 0
README.MD

@@ -161,4 +161,44 @@ ps aux | grep xigua | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep kuaishou | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep douyin | grep -v grep | awk '{print $2}' | xargs kill -9
 ps aux | grep xiaoniangao | grep -v grep | awk '{print $2}' | xargs kill -9
+```
+
+#### Generate requirements.txt
+```commandline
+cd ./piaoquan_crawler && pipreqs ./ --force
+
+# pip3 install Appium-Python-Client
+Appium_Python_Client==2.10.1
+# Requires a proxy (GFW circumvention), pip3 install git+https://github.com/pyatom/pyatom/
+atomac==1.2.0
+# pip3 install ffmpeg-python
+ffmpeg==1.4
+# pip3 install loguru
+loguru==0.6.0
+# pip3 install lxml
+lxml==4.9.1
+# pip3 install mq_http_sdk. If you are using SDK v1.0.0, you need Python >= 2.5 and < 3.0; if you are using an SDK version later than v1.0.0, you need Python 2.5 or later.
+mq_http_sdk==1.0.3
+# sudo pip3 install oss2
+oss2==2.15.0
+# pip3 install psutil
+psutil==5.9.2
+# pip3 install PyExecJS
+PyExecJS==1.5.1
+# pip3 install PyMysql
+PyMySQL==1.0.2
+# pip3 install redis
+redis==4.5.1
+# pip3 install requests
+requests==2.27.1
+# pip3 install selenium
+selenium==4.9.1
+# pip3 install urllib3
+urllib3==1.26.9
+# pip3 install jieba
+jieba==0.42.1
+# pip3 install workalendar
+workalendar==17.0.0
+# pip3 install aliyun_python_sdk
+aliyun_python_sdk==2.2.0
 ```
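With the pinned list regenerated, the dependencies can be installed in one step from the repository root (standard pip usage, shown as a minimal sketch):

```commandline
cd ./piaoquan_crawler && pip3 install -r requirements.txt
```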

+ 58 - 1
common/common.py

@@ -4,6 +4,7 @@
 """
 公共方法,包含:生成log / 删除log / 下载方法 / 删除 weixinzhishu_chlsfiles / 过滤词库 / 保存视频信息至本地 txt / 翻译 / ffmpeg
 """
+from aliyun.log import LogClient, PutLogsRequest, LogItem
 from datetime import date, timedelta
 from loguru import logger
 from hashlib import md5
@@ -55,6 +56,63 @@ class Common:
 
         return logger
 
+    # Write logs to Aliyun Log Service
+    @staticmethod
+    def logging(log_type, crawler, env, message):
+        """
+        Write logs to Aliyun Log Service
+        Dev project: https://sls.console.aliyun.com/lognext/project/crawler-log-dev/logsearch/crawler-log-dev
+        Prod project: https://sls.console.aliyun.com/lognext/project/crawler-log-prod/logsearch/crawler-log-prod
+        :param log_type: crawler strategy
+        :param crawler: which crawler
+        :param env: environment (dev or prod)
+        :param message: log content
+        :return: None
+        """
+        # Aliyun Log Service access settings
+        accessKeyId = 'LTAIWYUujJAm7CbH'
+        accessKey = 'RfSjdiWwED1sGFlsjXv0DlfTnZTG1P'
+        if env == "dev":
+            project = 'crawler-log-dev'
+            logstore = 'crawler-log-dev'
+            endpoint = 'cn-hangzhou.log.aliyuncs.com'
+        else:
+            project = 'crawler-log-prod'
+            logstore = 'crawler-log-prod'
+            endpoint = 'cn-hangzhou-intranet.log.aliyuncs.com'
+
+        # Create a LogClient instance
+        # print("创建 LogClient 实例")
+        client = LogClient(endpoint, accessKeyId, accessKey)
+
+
+        if '\r' in message:
+            message = message.replace('\r', ' ')
+        if '\n' in message:
+            message = message.replace('\n', ' ')
+        # print(f"message:{message}")
+        log_group = []
+        log_item = LogItem()
+        # print(f"log_item:{type(log_item), log_item}")
+        contents = [(f"{crawler}-{log_type}", message)]
+        # print(f"contents:{type(contents), contents}")
+        log_item.set_contents(contents)
+        log_group.append(log_item)
+        # print(f"log_group:{type(log_group), log_group}")
+
+        # Write the log entry
+        # print("开始PutLogsRequest")
+        request = PutLogsRequest(project=project,
+                                 logstore=logstore,
+                                 topic="",
+                                 source="",
+                                 logitems=log_group,
+                                 compress=False)
+        # print(f"request:{request}")
+        # print("put_logs...")
+        client.put_logs(request)
+        # print("put_logs...done")
+
     # 清除日志,保留最近 10 个文件
     @classmethod
     def del_logs(cls, log_type, crawler):
@@ -342,4 +400,3 @@ class Common:
 if __name__ == "__main__":
     Common.tunnel_proxies()
     pass
-
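A minimal usage sketch of the new Common.logging helper next to the existing Common.logger, mirroring the call sites added in the files below (the argument values are illustrative):

```python
from common.common import Common

# Existing behaviour: local file logging via loguru
Common.logger("recommend", "xigua").info("开始抓取 西瓜推荐")
# New: mirror the same message to Aliyun Log Service; env="dev" selects the crawler-log-dev project
Common.logging("recommend", "xigua", "dev", "开始抓取 西瓜推荐")
```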

+ 4 - 19
requirements.txt

@@ -1,32 +1,17 @@
-# pip3 install Appium-Python-Client
-Appium_Python_Client==2.10.1
-# 翻墙, pip3 install git+https://github.com/pyatom/pyatom/
+aliyun_python_sdk==2.2.0
+Appium_Python_Client==2.11.0
 atomac==1.2.0
-# pip3 install ffmpeg-python
 ffmpeg==1.4
-# pip3 install loguru
+jieba==0.42.1
 loguru==0.6.0
-# pip3 install lxml
 lxml==4.9.1
-# pip3 install mq_http_sdk, 若您使用的SDK版本为v1.0.0,您需要安装大于等于2.5且小于3.0版本的Python。若您使用的SDK版本大于v1.0.0,您需要安装2.5及以上版本的Python。
 mq_http_sdk==1.0.3
-# sudo pip3 install oss2
 oss2==2.15.0
-# pip3 install psutil
 psutil==5.9.2
-# pip3 install PyExecJS
 PyExecJS==1.5.1
-# pip3 install PyMysql
 PyMySQL==1.0.2
-# pip3 install redis
 redis==4.5.1
-# pip3 install requests
 requests==2.27.1
-# pip3 install selenium
-selenium==4.9.1
-# pip3 install urllib3
+selenium==4.10.0
 urllib3==1.26.9
-# pip3 install jieba
-jieba==0.42.1
-# pip3 install workalendar
 workalendar==17.0.0

+ 19 - 0
xigua/xigua_author/xigua_author_scheduling.py

@@ -574,12 +574,15 @@ class XiguaauthorScheduling:
             offset += 30
             if response.status_code != 200:
                 Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                 return
             elif 'data' not in response.text:
                 Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                 return
             elif not response.json()["data"]['videoList']:
                 Common.logger(log_type, crawler).warning(f"没有更多数据啦~:{response.json()}\n")
+                Common.logging(log_type, crawler, env, f"没有更多数据啦~:{response.json()}\n")
                 return
             feeds = response.json()['data']['videoList']
             for i in range(len(feeds)):
@@ -587,19 +590,24 @@ class XiguaauthorScheduling:
                     item_id = feeds[i].get("item_id", "")
                     if item_id == "":
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
 
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                         Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
+                        Common.logging(log_type, crawler, env, f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                         return
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     elif any(str(word) if str(word) in video_dict["video_title"] else False
                              for word in get_config_from_mysql(log_type=log_type,
                                                                source=crawler,
@@ -607,8 +615,10 @@ class XiguaauthorScheduling:
                                                                text="filter",
                                                                action="")) is True:
                         Common.logger(log_type, crawler).info('已中过滤词\n')
+                        Common.logging(log_type, crawler, env, "已中过滤词\n")
                     elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, "视频已下载\n")
                     else:
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
@@ -618,6 +628,7 @@ class XiguaauthorScheduling:
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).error(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
     @classmethod
     def repeat_video(cls, log_type, crawler, video_id, env):
         sql = f""" select * from crawler_video where platform="{cls.platform}" and out_video_id="{video_id}"; """
@@ -639,11 +650,13 @@ class XiguaauthorScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
@@ -652,6 +665,7 @@ class XiguaauthorScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -712,8 +726,10 @@ class XiguaauthorScheduling:
                         {int(video_dict['video_width'])},
                         {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env)
         Common.logger(log_type, crawler).info('视频信息写入数据库成功')
+        Common.logging(log_type, crawler, env, '视频信息写入数据库成功')
 
         # 视频写入飞书
         Feishu.insert_columns(log_type, crawler, "e075e9", "ROWS", 1, 2)
@@ -740,12 +756,14 @@ class XiguaauthorScheduling:
         time.sleep(0.5)
         Feishu.update_values(log_type, crawler, "e075e9", "F2:Z2", values)
         Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
+        Common.logging(log_type, crawler, env, f"视频已保存至云文档\n")
 
     @classmethod
     def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
         for user_dict in user_list:
             try:
                 Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['nick_name']} 用户主页视频\n")
+                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['nick_name']} 用户主页视频\n")
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,
                                   user_dict=user_dict,
@@ -753,6 +771,7 @@ class XiguaauthorScheduling:
                                   env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user_dict['nick_name']}视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user_dict['nick_name']}视频时异常:{e}\n")
 
 
 if __name__ == '__main__':

+ 20 - 0
xigua/xigua_main/run_xg_author.py

@@ -23,6 +23,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                           f'WaitSeconds:{wait_seconds}\n'
+                                           f'TopicName:{topic_name}\n'
+                                           f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -38,6 +42,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                       f"MessageId:{msg.message_id}\n"
+                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                       f"MessageTag:{msg.message_tag}\n"
+                                                       f"ConsumedTimes:{msg.consumed_times}\n"
+                                                       f"PublishTime:{msg.publish_time}\n"
+                                                       f"Body:{msg.message_body}\n"
+                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                       f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -48,9 +62,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                 select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
                 user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguaauthorScheduling.get_author_videos(log_type=log_type,
                                                         crawler=crawler,
                                                         rule_dict=rule_dict,
@@ -58,14 +75,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                         env=env)
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 20 - 0
xigua/xigua_main/run_xg_recommend.py

@@ -24,6 +24,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                          f'WaitSeconds:{wait_seconds}\n'
+                                          f'TopicName:{topic_name}\n'
+                                          f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -39,6 +43,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                      f"MessageId:{msg.message_id}\n"
+                                                      f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                      f"MessageTag:{msg.message_tag}\n"
+                                                      f"ConsumedTimes:{msg.consumed_times}\n"
+                                                      f"PublishTime:{msg.publish_time}\n"
+                                                      f"Body:{msg.message_body}\n"
+                                                      f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                      f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                      f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -53,9 +67,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                     our_uid_list.append(user["uid"])
                 our_uid = random.choice(our_uid_list)
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguarecommendScheduling.get_videoList(log_type=log_type,
                                                        crawler=crawler,
                                                        rule_dict=rule_dict,
@@ -63,14 +80,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                        env=env)
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 24 - 0
xigua/xigua_main/run_xg_recommend_dev.py

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2023/6/12
+import os
+import sys
+sys.path.append(os.getcwd())
+from common.common import Common
+from xigua.xigua_recommend.xigua_recommend_scheduling import XiguarecommendScheduling
+
+
+def main(log_type, crawler, env):
+    Common.logger(log_type, crawler).info(f'开始抓取 西瓜推荐\n')
+    Common.logging(log_type, crawler, env, "开始抓取 西瓜推荐\n")
+    XiguarecommendScheduling.get_videoList(log_type=log_type,
+                                           crawler=crawler,
+                                           rule_dict={},
+                                           our_uid=6267140,
+                                           env=env)
+    Common.logger(log_type, crawler).info("抓取一轮结束\n")
+    Common.logging(log_type, crawler, env, "抓取一轮结束\n")
+
+
+if __name__ == "__main__":
+    main("recommend", "xigua", "dev")
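Since the new runner appends os.getcwd() to sys.path, it is presumably meant to be launched from the repository root, e.g.:

```commandline
cd ./piaoquan_crawler && python3 xigua/xigua_main/run_xg_recommend_dev.py
```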

+ 20 - 0
xigua/xigua_main/run_xg_search.py

@@ -23,6 +23,10 @@ def main(log_type, crawler, topic_name, group_id, env):
                                           f'WaitSeconds:{wait_seconds}\n'
                                           f'TopicName:{topic_name}\n'
                                           f'MQConsumer:{group_id}')
+    Common.logging(log_type, crawler, env, f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
+                                           f'WaitSeconds:{wait_seconds}\n'
+                                           f'TopicName:{topic_name}\n'
+                                           f'MQConsumer:{group_id}')
     while True:
         try:
             # 长轮询消费消息。
@@ -38,6 +42,16 @@ def main(log_type, crawler, topic_name, group_id, env):
                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
                                                       f"Properties:{msg.properties}")
+                Common.logging(log_type, crawler, env, f"Receive\n"
+                                                       f"MessageId:{msg.message_id}\n"
+                                                       f"MessageBodyMD5:{msg.message_body_md5}\n"
+                                                       f"MessageTag:{msg.message_tag}\n"
+                                                       f"ConsumedTimes:{msg.consumed_times}\n"
+                                                       f"PublishTime:{msg.publish_time}\n"
+                                                       f"Body:{msg.message_body}\n"
+                                                       f"NextConsumeTime:{msg.next_consume_time}\n"
+                                                       f"ReceiptHandle:{msg.receipt_handle}\n"
+                                                       f"Properties:{msg.properties}")
                 # ack_mq_message
                 ack_message(log_type=log_type, crawler=crawler, recv_msgs=recv_msgs, consumer=consumer)
 
@@ -48,9 +62,12 @@ def main(log_type, crawler, topic_name, group_id, env):
                 select_user_sql = f"""select * from crawler_user_v3 where task_id={task_id}"""
                 user_list = MysqlHelper.get_values(log_type, crawler, select_user_sql, env, action="")
                 Common.logger(log_type, crawler).info(f"调度任务:{task_dict}")
+                Common.logging(log_type, crawler, env, f"调度任务:{task_dict}")
                 Common.logger(log_type, crawler).info(f"抓取规则:{rule_dict}")
+                Common.logging(log_type, crawler, env, f"抓取规则:{rule_dict}")
                 # Common.logger(log_type, crawler).info(f"用户列表:{user_list}\n")
                 Common.logger(log_type, crawler).info(f'开始抓取 {task_dict["taskName"]}\n')
+                Common.logging(log_type, crawler, env, f'开始抓取 {task_dict["taskName"]}\n')
                 XiguasearchScheduling.get_search_videos(log_type=log_type,
                                                         crawler=crawler,
                                                         rule_dict=rule_dict,
@@ -60,14 +77,17 @@ def main(log_type, crawler, topic_name, group_id, env):
                 os.system("ps aux | grep chromedriver | grep -v grep | awk '{print $2}' | xargs kill -9")
                 Common.del_logs(log_type, crawler)
                 Common.logger(log_type, crawler).info('抓取一轮结束\n')
+                Common.logging(log_type, crawler, env, '抓取一轮结束\n')
 
         except MQExceptionBase as err:
             # Topic中没有消息可消费。
             if err.type == "MessageNotExist":
                 Common.logger(log_type, crawler).info(f"No new message! RequestId:{err.req_id}\n")
+                Common.logging(log_type, crawler, env, f"No new message! RequestId:{err.req_id}\n")
                 continue
 
             Common.logger(log_type, crawler).info(f"Consume Message Fail! Exception:{err}\n")
+            Common.logging(log_type, crawler, env, f"Consume Message Fail! Exception:{err}\n")
             time.sleep(2)
             continue
 

+ 21 - 0
xigua/xigua_recommend/xigua_recommend_scheduling.py

@@ -605,10 +605,12 @@ class XiguarecommendScheduling:
         queryCount = 1
         while True:
             Common.logger(log_type, crawler).info(f"正在抓取第{queryCount}页视频")
+            Common.logging(log_type, crawler, env, f"正在抓取第{queryCount}页视频")
             try:
                 signature = cls.get_signature(env)
                 if signature is None:
                     Common.logger(log_type, crawler).warning(f"signature:{signature}")
+                    Common.logging(log_type, crawler, env, f"signature:{signature}")
                     time.sleep(1)
                     continue
                 url = "https://www.ixigua.com/api/feedv2/feedById?"
@@ -654,18 +656,23 @@ class XiguarecommendScheduling:
                 queryCount += 1
                 if response.status_code != 200:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                     return
                 elif 'data' not in response.text:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.text}\n")
                     return
                 elif 'channelFeed' not in response.json()['data']:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.json()}\n")
                     return
                 elif 'Data' not in response.json()['data']['channelFeed']:
                     Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"get_videolist_response:{response.json()}\n")
                     return
                 elif len(response.json()['data']['channelFeed']['Data']) == 0:
                     Common.logger(log_type, crawler).warning(f"没有更多数据啦 ~ :{response.json()}\n")
+                    Common.logging(log_type, crawler, env, f"没有更多数据啦 ~ :{response.json()}\n")
                     return
                 else:
                     feeds = response.json()['data']['channelFeed']['Data']
@@ -674,15 +681,19 @@ class XiguarecommendScheduling:
                             item_id = feeds[i].get("data", {}).get("item_id", "")
                             if item_id == "":
                                 Common.logger(log_type, crawler).info("无效视频\n")
+                                Common.logging(log_type, crawler, env, "无效视频\n")
                                 continue
                             video_dict = cls.get_video_info(log_type, crawler, item_id)
                             if video_dict is None:
                                 Common.logger(log_type, crawler).info("无效视频\n")
+                                Common.logging(log_type, crawler, env, "无效视频\n")
                                 continue
                             for k, v in video_dict.items():
                                 Common.logger(log_type, crawler).info(f"{k}:{v}")
+                            Common.logging(log_type, crawler, env, f"{video_dict}")
                             if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                                 Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                                Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                             elif any(str(word) if str(word) in video_dict["video_title"] else False
                                      for word in get_config_from_mysql(log_type=log_type,
                                                                        source=crawler,
@@ -690,8 +701,10 @@ class XiguarecommendScheduling:
                                                                        text="filter",
                                                                        action="")) is True:
                                 Common.logger(log_type, crawler).info('已中过滤词\n')
+                                Common.logging(log_type, crawler, env, "已中过滤词\n")
                             elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                                 Common.logger(log_type, crawler).info('视频已下载\n')
+                                Common.logging(log_type, crawler, env, "视频已下载\n")
                             else:
                                 cls.download_publish(log_type=log_type,
                                                      crawler=crawler,
@@ -701,8 +714,10 @@ class XiguarecommendScheduling:
                                                      env=env)
                         except Exception as e:
                             Common.logger(log_type, crawler).error(f"抓取单条视频时异常:{e}\n")
+                            Common.logging(log_type, crawler, env, f"抓取单条视频时异常:{e}\n")
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取第{queryCount}页时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取第{queryCount}页时异常:{e}\n")
 
     @classmethod
     def download_publish(cls, log_type, crawler, our_uid, video_dict, rule_dict, env):
@@ -718,11 +733,13 @@ class XiguarecommendScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover', title=video_dict['video_title'], url=video_dict['cover_url'])
@@ -731,6 +748,7 @@ class XiguarecommendScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -791,8 +809,10 @@ class XiguarecommendScheduling:
                                         {int(video_dict['video_width'])},
                                         {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env, action='')
         Common.logger(log_type, crawler).info('视频信息写入数据库成功')
+        Common.logging(log_type, crawler, env, f"视频信息写入数据库成功")
 
         # 视频写入飞书
         Feishu.insert_columns(log_type, crawler, "1iKGF1", "ROWS", 1, 2)
@@ -819,6 +839,7 @@ class XiguarecommendScheduling:
         time.sleep(1)
         Feishu.update_values(log_type, 'xigua', "1iKGF1", "F2:Z2", values)
         Common.logger(log_type, crawler).info(f"视频已保存至云文档\n")
+        Common.logging(log_type, crawler, env, f"视频已保存至云文档\n")
 
 
 if __name__ == "__main__":

+ 20 - 0
xigua/xigua_search/xigua_search_scheduling.py

@@ -565,6 +565,7 @@ class XiguasearchScheduling:
         driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options, service=Service(chromedriver))
         driver.implicitly_wait(10)
         Common.logger(log_type, crawler).info(f"打开搜索页:{user_dict['link']}")
+        Common.logging(log_type, crawler, env, f"打开搜索页:{user_dict['link']}")
         driver.get(f"https://www.ixigua.com/search/{user_dict['link']}/")
         time.sleep(3)
         # driver.get_screenshot_as_file(f"./{crawler}/logs/打开搜索页.jpg")
@@ -594,20 +595,24 @@ class XiguasearchScheduling:
             video_element_temp = video_elements[index:]
             if len(video_element_temp) == 0:
                 Common.logger(log_type, crawler).info('到底啦~~~~~~~~~~~~~\n')
+                Common.logging(log_type, crawler, env, '到底啦~~~~~~~~~~~~~\n')
                 driver.quit()
                 return
             for i, video_element in enumerate(video_element_temp):
                 try:
                     if cls.download_cnt >= int(rule_dict.get("videos_cnt", {}).get("min", 30)):
                         Common.logger(log_type, crawler).info(f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
+                        Common.logging(log_type, crawler, env, f"搜索词: {user_dict['link']},已下载视频数: {cls.download_cnt}\n")
                         driver.quit()
                         return
                     if video_element is None:
                         Common.logger(log_type, crawler).info('到底啦~\n')
+                        Common.logging(log_type, crawler, env, '到底啦~\n')
                         driver.quit()
                         return
                     num += 1
                     Common.logger(log_type, crawler).info(f'拖动"视频"列表第{num}个至屏幕中间')
+                    Common.logging(log_type, crawler, env, f'拖动"视频"列表第{num}个至屏幕中间')
                     driver.execute_script("arguments[0].scrollIntoView({block:'center',inline:'center'})", video_element)
                     time.sleep(3)
                     # driver.get_screenshot_as_file(f"./{crawler}/logs/{num}.jpg")
@@ -616,15 +621,18 @@ class XiguasearchScheduling:
                     video_dict = cls.get_video_info(log_type, crawler, item_id)
                     if video_dict is None:
                         Common.logger(log_type, crawler).info("无效视频\n")
+                        Common.logging(log_type, crawler, env, "无效视频\n")
                         continue
                     for k, v in video_dict.items():
                         Common.logger(log_type, crawler).info(f"{k}:{v}")
+                    Common.logging(log_type, crawler, env, f"{video_dict}")
                     # if int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24)) > int(rule_dict.get("period", {}).get("max", 1000)):
                     #     Common.logger(log_type, crawler).info(f'发布时间超过{int(rule_dict.get("period", {}).get("max", 1000))}天\n')
                     #     driver.quit()
                     #     return
                     if download_rule(log_type=log_type, crawler=crawler, video_dict=video_dict, rule_dict=rule_dict) is False:
                         Common.logger(log_type, crawler).info("不满足抓取规则\n")
+                        Common.logging(log_type, crawler, env, "不满足抓取规则\n")
                     elif any(str(word) if str(word) in video_dict["video_title"] else False
                              for word in get_config_from_mysql(log_type=log_type,
                                                                source=crawler,
@@ -632,8 +640,10 @@ class XiguasearchScheduling:
                                                                text="filter",
                                                                action="")) is True:
                         Common.logger(log_type, crawler).info('已中过滤词\n')
+                        Common.logging(log_type, crawler, env, '已中过滤词\n')
                     elif cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                         Common.logger(log_type, crawler).info('视频已下载\n')
+                        Common.logging(log_type, crawler, env, '视频已下载\n')
                     else:
                         cls.download_publish(log_type=log_type,
                                              crawler=crawler,
@@ -643,8 +653,10 @@ class XiguasearchScheduling:
                                              env=env)
                 except Exception as e:
                     Common.logger(log_type, crawler).warning(f"抓取单条视频异常:{e}\n")
+                    Common.logging(log_type, crawler, env, f"抓取单条视频异常:{e}\n")
 
             Common.logger(log_type, crawler).info('已抓取完一组视频,休眠10秒\n')
+            Common.logging(log_type, crawler, env, '已抓取完一组视频,休眠10秒\n')
             time.sleep(10)
             index = index + len(video_element_temp)
 
@@ -672,11 +684,13 @@ class XiguasearchScheduling:
                 # 删除视频文件夹
                 shutil.rmtree(f"./{crawler}/videos/{md_title}")
                 Common.logger(log_type, crawler).info("视频size=0,删除成功\n")
+                Common.logging(log_type, crawler, env, "视频size=0,删除成功\n")
                 return
         except FileNotFoundError:
             # 删除视频文件夹
             shutil.rmtree(f"./{crawler}/videos/{md_title}")
             Common.logger(log_type, crawler).info("视频文件不存在,删除文件夹成功\n")
+            Common.logging(log_type, crawler, env, "视频文件不存在,删除文件夹成功\n")
             return
         # 下载封面
         Common.download_method(log_type=log_type, crawler=crawler, text='cover',
@@ -686,6 +700,7 @@ class XiguasearchScheduling:
 
         # 上传视频
         Common.logger(log_type, crawler).info("开始上传视频...")
+        Common.logging(log_type, crawler, env, "开始上传视频...")
         if env == "dev":
             oss_endpoint = "out"
             our_video_id = Publish.upload_and_publish(log_type=log_type,
@@ -746,9 +761,11 @@ class XiguasearchScheduling:
                                 {int(video_dict['video_width'])},
                                 {int(video_dict['video_height'])}) """
         Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
+        Common.logging(log_type, crawler, env, f"insert_sql:{insert_sql}")
         MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
         cls.download_cnt += 1
         Common.logger(log_type, crawler).info("视频信息写入数据库完成")
+        Common.logging(log_type, crawler, env, "视频信息写入数据库完成")
 
         # 视频信息写入飞书
         Feishu.insert_columns(log_type, crawler, "BUNvGC", "ROWS", 1, 2)
@@ -775,6 +792,7 @@ class XiguasearchScheduling:
         time.sleep(0.5)
         Feishu.update_values(log_type, crawler, "BUNvGC", "E2:Z2", values)
         Common.logger(log_type, crawler).info('视频信息写入飞书完成\n')
+        Common.logging(log_type, crawler, env, '视频信息写入飞书完成\n')
 
     @classmethod
     def get_search_videos(cls, log_type, crawler, user_list, rule_dict, env):
@@ -782,6 +800,7 @@ class XiguasearchScheduling:
             try:
                 cls.download_cnt = 0
                 Common.logger(log_type, crawler).info(f"开始抓取 {user_dict['link']} 视频\n")
+                Common.logging(log_type, crawler, env, f"开始抓取 {user_dict['link']} 视频\n")
                 cls.get_videoList(log_type=log_type,
                                   crawler=crawler,
                                   user_dict=user_dict,
@@ -789,6 +808,7 @@ class XiguasearchScheduling:
                                   env=env)
             except Exception as e:
                 Common.logger(log_type, crawler).error(f"抓取{user_dict['link']}视频时异常:{e}\n")
+                Common.logging(log_type, crawler, env, f"抓取{user_dict['link']}视频时异常:{e}\n")
 
 
 if __name__ == '__main__':