# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/9/25
import argparse
import json
import os
import sys
import time

sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper


class KanyikanSession:
    # Directory that Charles auto-saves its JSON session exports into
    chls_file_path = "./kanyikan/chlsfiles/"

    @classmethod
    def get_session(cls, log_type, crawler, env):
        while True:
            # Collect every Charles export in the watch directory
            all_files = os.listdir(cls.chls_file_path)
            chls_files = []
            for chls_file in all_files:
                if "charles" in os.path.splitext(chls_file)[0]:
                    chls_files.append(chls_file)
            if len(chls_files) < 1:
                Common.logger(log_type, crawler).info("未找到chlsfile文件,等待60s重试")
                Common.logging(log_type, crawler, env, "未找到chlsfile文件,等待60s重试")
                time.sleep(60)
                continue

            # Use the newest export (sorted() puts the latest file name last)
            chls_file = sorted(chls_files)[-1]
            Common.logger(log_type, crawler).info(f"chls_file:{chls_file}")
            chls_file_name = os.path.splitext(chls_file)[0]
            # Rename the file extension to .txt so it can be read as text
            os.rename(os.path.join(cls.chls_file_path, chls_file),
                      os.path.join(cls.chls_file_path, f"{chls_file_name}.txt"))
            with open(os.path.join(cls.chls_file_path, f"{chls_file_name}.txt"),
                      encoding='utf-8-sig', errors='ignore') as f:
                contents = json.load(f, strict=False)

            # Keep only the requests that went to search.weixin.qq.com
            kanyikan_request_list = []
            for content in contents:
                if "search.weixin.qq.com" in content["host"]:
                    kanyikan_request_list.append(content)
            if len(kanyikan_request_list) == 0:
                Common.logger(log_type, crawler).info("chlsfile文件中未找到:search.weixin.qq.com,等待60s重试")
                Common.logging(log_type, crawler, env, "chlsfile文件中未找到:search.weixin.qq.com,等待60s重试")
                time.sleep(60)
                continue

            for kanyikan_request in kanyikan_request_list:
                if kanyikan_request["path"] == "/cgi-bin/recwxa/recwxagetunreadmessagecnt":
                    Common.logger(log_type, crawler).info(f'query:{kanyikan_request["query"]}\n')
                    sessions = kanyikan_request["query"].split("session=")[-1]
                    Common.logger(log_type, crawler).info(f"sessions:{sessions}\n")
                    # Other query parameters may trail the session value;
                    # if any of them shows up, keep only the part before "&"
                    if any(keyword in sessions for keyword in
                           ["vid", "offset", "wxaVersion", "limit", "scene",
                            "count", "channelid", "subscene", "clientVersion",
                            "sharesearchid", "nettype", "switchprofile",
                            "switchnewuser"]):
                        return sessions.split("&")[0]
                    return sessions

    @classmethod
    def del_chls_file(cls, log_type, crawler):
        # Clear the watch directory so the next run only sees fresh exports
        all_file = sorted(os.listdir(cls.chls_file_path))
        for file in all_file:
            os.remove(os.path.join(cls.chls_file_path, file))
        Common.logger(log_type, crawler).info("删除 charles 缓存文件成功\n")

    @classmethod
    def save_session(cls, log_type, crawler, env, kanyikan_type):
        session = cls.get_session(log_type, crawler, env)
        Common.logger(log_type, crawler).info(session)
        # Map each kanyikan_type to the crawler_config row it updates
        config_titles = {
            "kyk": "看一看推荐",
            "kykjk": "看一看健康",
            "kykln": "看一看老年",
        }
        title = config_titles.get(kanyikan_type)
        if title is not None:
            update_sql = f"""
                UPDATE crawler_config
                SET config = JSON_SET(config, "$.token", "{session}"),
                    update_time = {int(time.time() * 1000)}
                WHERE title = "{title}";
            """
            MysqlHelper.update_values(log_type, crawler, update_sql, env, action="")
        Common.logger(log_type, crawler).info("session 更新数据库成功")
        cls.del_chls_file(log_type, crawler)
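    # A minimal sketch of the Charles export entry that get_session parses.
    # The keys ("host", "path", "query") match the accesses above; the values
    # below are illustrative assumptions, not captured data:
    #
    #   {"host": "search.weixin.qq.com",
    #    "path": "/cgi-bin/recwxa/recwxagetunreadmessagecnt",
    #    "query": "session=wxa_abc123&vid=wxv_1&offset=0&limit=10"}
    #
    # For that query, get_session would return "wxa_abc123".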
    @classmethod
    def main(cls, log_type, crawler, env, kanyikan_type):
        Common.logger(log_type, crawler).info('开始抓取:看一看 session\n')
        Common.logging(log_type, crawler, env, '开始抓取:看一看 session\n')
        cls.save_session(log_type, crawler, env, kanyikan_type)
        Common.del_logs(log_type, crawler)
        Common.logger(log_type, crawler).info('抓取一轮结束\n')
        Common.logging(log_type, crawler, env, '抓取一轮结束\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # build the command-line parser
    parser.add_argument('--log_type', type=str)  # log type
    parser.add_argument('--crawler')  # crawler name
    parser.add_argument('--kanyikan_type')  # which Kanyikan config to update: kyk / kykjk / kykln
    parser.add_argument('--env')  # runtime environment
    args = parser.parse_args()  # values can also be supplied from the terminal
    KanyikanSession.main(log_type=args.log_type,
                         crawler=args.crawler,
                         kanyikan_type=args.kanyikan_type,
                         env=args.env)
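# Example invocation (the module path is an assumption based on the repo
# layout; the flag values are illustrative):
#   python kanyikan/kanyikan_session.py --log_type session --crawler kanyikan --kanyikan_type kyk --env dev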