"""
@author: luojunhui

Load crawled article JSON files from the ``account`` directory tree and
insert one row per article into the ``official_articles`` MySQL table.
"""
import os
import json
from concurrent.futures.thread import ThreadPoolExecutor

import pymysql
from tqdm import tqdm

# Connection settings. The defaults are the values this script has always
# used; each one can be overridden through an environment variable.
# NOTE(review): the password should not live in source control -- move it to
# the environment / a secrets store and drop the default.
DB_CONFIG = {
    "host": os.environ.get(
        "CRAWLER_DB_HOST", "rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com"
    ),
    "port": int(os.environ.get("CRAWLER_DB_PORT", "3306")),
    "user": os.environ.get("CRAWLER_DB_USER", "crawler"),
    "password": os.environ.get("CRAWLER_DB_PASSWORD", "crawler123456@"),
    "db": os.environ.get("CRAWLER_DB_NAME", "piaoquan-crawler"),
    "charset": "utf8mb4",
}

# NOTE(review): the column is spelled ``CoverImgUrl_255_1`` while the JSON key
# read below is ``CoverImgUrl_235_1``. The table schema is not visible from
# this file, so the column name is left untouched -- confirm which is right.
INSERT_SQL = """
    INSERT INTO official_articles
    (ghId, accountName, appMsgId, title, Type, createTime, updateTime,
     Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1,
     CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content,
     show_view_count, show_like_count, show_zs_count, show_pay_count,
     wx_sn, baseInfo)
    values
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
     %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""


def _split_account_dir(dir_name):
    """Split ``<ghId>_<accountName>`` on its LAST underscore.

    Returns ``(ghId, accountName)``. When the name has no underscore both
    values are the whole name, which is what the previous
    ``split``/``replace`` logic produced.
    """
    gh_id, sep, account_name = dir_name.rpartition("_")
    if not sep:
        return dir_name, dir_name
    return gh_id, account_name


def _extract_wx_sn(content_url):
    """Return the value of the ``&sn=`` query parameter, or ``None``.

    ``None`` is returned for an empty/missing URL and for a URL that has no
    ``&sn=`` parameter (the latter used to raise ``IndexError``).
    """
    if not content_url:
        return None
    _, marker, tail = content_url.partition("&sn=")
    if not marker:
        return None
    return tail.split("&")[0]


def _build_row(gh_id, account_name, msg_base, article, base_info_json):
    """Build the 24-value parameter tuple for ``INSERT_SQL`` for one article.

    The order of the values must match the column order in ``INSERT_SQL``.
    """
    content_url = article.get("ContentUrl", None)
    return (
        gh_id,
        account_name,
        msg_base.get("AppMsgId", None),
        article.get("Title", None),
        msg_base.get("Type", None),
        msg_base.get("CreateTime", None),
        msg_base.get("UpdateTime", None),
        article.get("Digest", None),
        article.get("ItemIndex", None),
        content_url,
        article.get("SourceUrl", None),
        article.get("CoverImgUrl", None),
        article.get("CoverImgUrl_1_1", None),
        article.get("CoverImgUrl_235_1", None),
        article.get("ItemShowType", None),
        article.get("IsOriginal", None),
        article.get("ShowDesc", None),
        article.get("ori_content", None),
        article.get("show_view_count", 0),
        article.get("show_like_count", 0),
        article.get("show_zs_count", 0),
        article.get("show_pay_count", 0),
        _extract_wx_sn(content_url),
        base_info_json,
    )


def insert_into_mysql(path):
    """Insert every article found in one message JSON file into MySQL.

    :param path: path of the JSON file; its third-from-last path component
        is expected to be a directory named ``<ghId>_<accountName>``.
    :return: None
    """
    with open(path, encoding="utf-8") as f:
        info = json.load(f)

    # Paths produced by os.path.join use os.sep, so normalise before
    # splitting on "/" (a no-op on POSIX).
    gzh_info = path.replace(os.sep, "/").split("/")[-3]
    gh_id, account_name = _split_account_dir(gzh_info)

    app_msg = info.get("AppMsg", {})
    msg_base = app_msg.get("BaseInfo", {})
    detail_article_list = app_msg.get("DetailInfo", [])
    if not detail_article_list:
        return

    # The top-level BaseInfo is identical for every article of the file, so
    # serialise it once.
    base_info_json = json.dumps(info.get("BaseInfo", {}), ensure_ascii=False)

    # One connection per file, always closed. Previously a new connection
    # was opened for every article and never released.
    connection = pymysql.connect(**DB_CONFIG)
    try:
        with connection.cursor() as cursor:
            for article in detail_article_list:
                row = _build_row(
                    gh_id, account_name, msg_base, article, base_info_json
                )
                cursor.execute(INSERT_SQL, row)
                # Commit per article, as before, so rows inserted ahead of a
                # failing article are kept.
                connection.commit()
    finally:
        connection.close()


def get_file_list():
    """Collect the message files of every account directory.

    Lists ``account/<entry>/msg`` for each entry of ``account``. Entries
    whose ``msg`` directory cannot be listed are skipped.

    :return: list of lists; each inner list holds the file paths found in
        one ``msg`` directory.
    """
    path = 'account'
    grouped_paths = []
    for entry in os.listdir(path):
        sub_dir = os.path.join(path, entry, "msg")
        try:
            file_names = os.listdir(sub_dir)
        except OSError:
            # Missing "msg" directory, entry is not a directory, or no
            # permission: deliberately skip this account.
            continue
        grouped_paths.append([os.path.join(sub_dir, name) for name in file_names])
    return grouped_paths


if __name__ == '__main__':
    file_list = get_file_list()
    c = 0
    for files in tqdm(file_list):
        c += len(files)
        print(c)
        # The actual import is currently disabled; only the running file
        # count is printed. NOTE: pool.map only re-raises worker exceptions
        # when its results are consumed, e.g. list(pool.map(...)).
        # with ThreadPoolExecutor(max_workers=10) as pool:
        #     pool.map(insert_into_mysql, files)