123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- """
- @author: luojunhui
- """
- import os
- import json
- import pymysql
- from tqdm import tqdm
- from concurrent.futures.thread import ThreadPoolExecutor
def insert_into_mysql(path):
    """
    Parse one crawled article-message JSON file and insert every article
    it contains into the `official_articles` MySQL table.

    :param path: path to a JSON file laid out as
                 account/<ghId>_<accountName>/msg/<file>.json
                 (the grandparent directory name encodes the account)
    :return: None
    """
    with open(path, encoding="utf-8") as f:
        info = json.loads(f.read())

    # Grandparent dir is "<ghId>_<accountName>"; the account name is the
    # segment after the last underscore, the ghId is everything before it.
    gzh_info = path.split("/")[-3]
    accountName = gzh_info.split("_")[-1]
    ghId = gzh_info.replace("_" + accountName, "")

    baseInfo = info.get("BaseInfo", {})
    app_msg_base = info.get("AppMsg", {}).get("BaseInfo", {})
    appMsgId = app_msg_base.get("AppMsgId", None)
    createTime = app_msg_base.get("CreateTime", None)
    updateTime = app_msg_base.get("UpdateTime", None)
    Type = app_msg_base.get("Type", None)

    detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
    if not detail_article_list:
        return

    # SECURITY NOTE(review): credentials are hard-coded in source; move
    # them to environment variables or a config file.
    # Open ONE connection per file (the original opened a new, never-closed
    # connection for every single article — a connection leak).
    connection = pymysql.connect(
        host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
        port=3306,
        user='crawler',
        password='crawler123456@',
        db='piaoquan-crawler',
        charset='utf8mb4'
    )
    # NOTE(review): the table column is spelled CoverImgUrl_255_1 while the
    # JSON key is CoverImgUrl_235_1 — one side looks like a typo; confirm
    # against the actual DB schema before renaming either.
    insert_sql = """
        INSERT INTO official_articles
        (ghId, accountName, appMsgId, title, Type, createTime, updateTime, Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1, CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content, show_view_count, show_like_count, show_zs_count, show_pay_count, wx_sn, baseInfo)
        values
        (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
    """
    try:
        with connection.cursor() as cursor:
            for article in detail_article_list:
                ContentUrl = article.get("ContentUrl", None)
                # wx_sn is the article's unique token in the URL query string.
                wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
                info_tuple = (
                    ghId,
                    accountName,
                    appMsgId,
                    article.get("Title", None),
                    Type,
                    createTime,
                    updateTime,
                    article.get("Digest", None),
                    article.get("ItemIndex", None),
                    ContentUrl,
                    article.get("SourceUrl", None),
                    article.get("CoverImgUrl", None),
                    article.get("CoverImgUrl_1_1", None),
                    article.get("CoverImgUrl_235_1", None),
                    article.get("ItemShowType", None),
                    article.get("IsOriginal", None),
                    article.get("ShowDesc", None),
                    article.get("ori_content", None),
                    article.get("show_view_count", 0),
                    article.get("show_like_count", 0),
                    article.get("show_zs_count", 0),
                    article.get("show_pay_count", 0),
                    wx_sn,
                    json.dumps(baseInfo, ensure_ascii=False)
                )
                cursor.execute(insert_sql, info_tuple)
        # One commit per file instead of one per row.
        connection.commit()
    finally:
        connection.close()
def get_file_list(path='account'):
    """
    Collect the message-JSON file paths for every crawled account.

    :param path: root directory holding one sub-directory per account
                 (defaults to 'account' for backward compatibility)
    :return: list of lists — one inner list of file paths per account's
             msg/ directory; accounts without a readable msg/ dir are skipped
    """
    file_groups = []
    for account_dir in os.listdir(path):
        msg_dir = os.path.join(path, account_dir, "msg")
        try:
            file_names = os.listdir(msg_dir)
        except OSError:
            # msg/ missing or unreadable for this account — skip it
            # (the original used a bare `except: pass`, which also hid
            # real bugs; OSError is the only thing listdir can raise here).
            continue
        file_groups.append([os.path.join(msg_dir, name) for name in file_names])
    return file_groups
if __name__ == '__main__':
    # Count message files across all accounts (dry run — the actual DB
    # import below is still disabled).
    file_list = get_file_list()
    # sum() replaces the manual counter; the unused `L_files` local is gone.
    total_files = sum(len(files) for files in tqdm(file_list))
    print(total_files)
    # TODO(review): when enabling the import, iterate every group — the old
    # commented code only mapped over the last `files` left by the loop:
    # with ThreadPoolExecutor(max_workers=10) as pool:
    #     for files in file_list:
    #         pool.map(insert_into_mysql, files)
|