@@ -1,155 +1,66 @@
+import json
+import os
+import sys
 import argparse
-import time
-import random
-from mq_http_sdk.mq_client import *
-from mq_http_sdk.mq_consumer import *
-from mq_http_sdk.mq_exception import MQExceptionBase

 sys.path.append(os.getcwd())
-# from common.public import task_fun_mq, get_consumer, ack_message
-# from common.scheduling_db import MysqlHelper
-# from common import AliyunLogger
-# from zhuwanwufusu.zhuwanwufusu_recommend import ZhuWanWuFuSuRecommend

+from application.common.mysql import MysqlHelper
+from application.config import spider_map
+from application.spider.crawler_online import *

-def main(platform, mode, env):
-    # consumer = get_consumer(topic_name, group_id)
-    # # With long polling, if the Topic has no messages the request is held on the server for 3 seconds; if a message becomes consumable within that window, a response is returned immediately.
-    # # Long-polling wait of 3 seconds (can be set up to 30 seconds).
-    # wait_seconds = 30
-    # # Consume at most 3 messages per call (can be set up to 16).
-    # batch = 1
-    # AliyunLogger.logging(
-    #     code="1000",
-    #     platform=crawler,
-    #     mode=log_type,
-    #     env=env,
-    #     message=f'{10 * "="}Consume And Ack Message From Topic{10 * "="}\n'
-    #     f"WaitSeconds:{wait_seconds}\n"
-    #     f"TopicName:{topic_name}\n"
-    #     f"MQConsumer:{group_id}",
-    # )
-    while True:
-        try:
-            # Consume messages via long polling.
-            recv_msgs = consumer.consume_message(batch, wait_seconds)
-            for msg in recv_msgs:
-                AliyunLogger.logging(
-                    code="1000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message=f"Receive\n"
-                    f"MessageId:{msg.message_id}\n"
-                    f"MessageBodyMD5:{msg.message_body_md5}\n"
-                    f"MessageTag:{msg.message_tag}\n"
-                    f"ConsumedTimes:{msg.consumed_times}\n"
-                    f"PublishTime:{msg.publish_time}\n"
-                    f"Body:{msg.message_body}\n"
-                    f"NextConsumeTime:{msg.next_consume_time}\n"
-                    f"ReceiptHandle:{msg.receipt_handle}\n"
-                    f"Properties:{msg.properties}",
-                )
-                # ack_mq_message
-                ack_message(
-                    log_type=log_type,
-                    crawler=crawler,
-                    recv_msgs=recv_msgs,
-                    consumer=consumer,
-                )
-                # Parse task_dict
-                task_dict = task_fun_mq(msg.message_body)["task_dict"]
-                AliyunLogger.logging(
-                    code="1000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message=f"Scheduling task: {task_dict}",
-                )
-                # Parse rule_dict
-                rule_dict = task_fun_mq(msg.message_body)["rule_dict"]
-                AliyunLogger.logging(
-                    code="1000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message=f"Crawl rule: {rule_dict}\n",
-                )
-                # Parse user_list
-                task_id = task_dict["id"]
-                select_user_sql = (
-                    f"""select * from crawler_user_v3 where task_id={task_id}"""
-                )
-                user_list = MysqlHelper.get_values(
-                    log_type, crawler, select_user_sql, env, action=""
-                )
-                AliyunLogger.logging(
-                    code="1003",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message="Start crawling",
-                )
-                AliyunLogger.logging(
-                    code="1000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message="Start crawling ZhuWanWuFuSu recommend",
-                )
-                main_process = ZhuWanWuFuSuRecommend(
-                    platform=crawler,
-                    mode=log_type,
-                    rule_dict=rule_dict,
-                    user_list=user_list,
-                    env=env
-                )
-                main_process.schedule()
-                AliyunLogger.logging(
-                    code="1000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message="Finished crawling ZhuWanWuFuSu",
-                )
-                AliyunLogger.logging(
-                    code="1004", platform=crawler, mode=log_type, env=env, message="Finished one crawl round"
-                )

-        except MQExceptionBase as err:
-            # No messages in the Topic to consume.
-            if err.type == "MessageNotExist":
-                AliyunLogger.logging(
-                    code="2000",
-                    platform=crawler,
-                    mode=log_type,
-                    env=env,
-                    message=f"No new message! RequestId:{err.req_id}\n",
-                )
-                continue
-            AliyunLogger.logging(
-                code="2000",
-                platform=crawler,
-                mode=log_type,
-                env=env,
-                message=f"Consume Message Fail! Exception:{err}\n",
+class OnlineManager(object):
+    def __init__(self, task_id, mode, platform):
+        self.env = "prod"
+        self.task_id = task_id
+        self.mode = mode
+        self.platform = platform
+        self.MySQL = MysqlHelper(mode=self.mode, platform=self.platform, env=self.env)
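+        # Assumption: MysqlHelper.select returns rows as a sequence of tuples,
+        # e.g. ((rule_json_str,),) for the rule query in get_task_rule below.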
+
+    def get_task_rule(self):
+        """
+        :return: the task's rule dict, task_rule
+        """
+        rule_dict = {}
+        task_rule_sql = f"SELECT rule FROM crawler_task_v3 WHERE id = {self.task_id};"
+        data = self.MySQL.select(task_rule_sql)
+        if data:
+            rule_list = json.loads(data[0][0])
+            for item in rule_list:
+                for key in item:
+                    rule_dict[key] = item[key]
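+        # Hedged sketch of the assumed stored shape: the `rule` column holds a
+        # JSON array of single-key objects, which the loop above flattens into
+        # one dict, e.g. (field names illustrative)
+        #   [{"period": {"min": 15}}, {"duration": {"min": 50}}]
+        #   -> {"period": {"min": 15}, "duration": {"min": 50}}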
+        return rule_dict
+
+    def get_task_user_list(self):
+        """
+        :return: the task's user list
+        """
+        task_user_list_sql = f"SELECT uid, link FROM crawler_user_v3 WHERE task_id = {self.task_id};"
+        uid_list = self.MySQL.select(task_user_list_sql)
+        user_list = [{"uid": i[0], "link": i[1]} for i in uid_list] if uid_list else []
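+        # Illustrative result shape (uid and link values are made up):
+        #   [{"uid": 10001, "link": "https://example.com/author/10001"}]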
+        return user_list
+
+    def start_crawl(self):
+        rule_dict = self.get_task_rule()
+        user_list = self.get_task_user_list()
+        if rule_dict and user_list:
+            spider_class = spider_map[self.platform][self.mode]
+            main_process = spider_class(
+                platform=self.platform,
+                mode=self.mode,
+                rule_dict=rule_dict,
+                user_list=user_list,
+                env=self.env,
             )
-            time.sleep(2)
-            continue
+            main_process.run()
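+
+    # Assumption: `spider_map` (from application.config) is a nested mapping of
+    # platform -> mode -> spider class, e.g.
+    #   spider_map = {"zhuwanwufusu": {"recommend": ZhuWanWuFuSuRecommend}}
+    # so start_crawl can resolve the spider without per-platform branching.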


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()  # create the argument parser
-    parser.add_argument("--log_type", type=str)  # add argument, specifying its type
-    parser.add_argument("--crawler")  # add argument
-    parser.add_argument("--topic_name")  # add argument
-    parser.add_argument("--group_id")  # add argument
-    parser.add_argument("--env")  # add argument
-    args = parser.parse_args()  # parse arguments; values can also be supplied from the command line
-    main(
-        log_type=args.log_type,
-        crawler=args.crawler,
-        topic_name=args.topic_name,
-        group_id=args.group_id,
-        env=args.env,
-    )
+    parser.add_argument("--task_id")
+    parser.add_argument("--mode")
+    parser.add_argument("--platform")
+    args = parser.parse_args()
+    M = OnlineManager(task_id=args.task_id, mode=args.mode, platform=args.platform)
+    M.start_crawl()
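+
+    # Example invocation (values are illustrative; the script path and the
+    # platform/mode names depend on the repo layout and spider_map entries):
+    #   python <script>.py --task_id 21 --mode recommend --platform zhuwanwufusu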