@@ -0,0 +1,120 @@
+"""
+Insert crawled WeChat official-account article metadata into MySQL.
+
+@author: luojunhui
+"""
+import os
+import json
+import pymysql
+from tqdm import tqdm
+
+from concurrent.futures import ThreadPoolExecutor
+
+def insert_into_mysql(path):
+    """
+    Parse one crawled-article JSON file and insert its articles into MySQL.
+    :param path: file path
+    :return:
+    """
+    with open(path, encoding="utf-8") as f:
+        info = json.loads(f.read())
+    # The directory two levels above the file is named "<ghId>_<accountName>"
+    gzh_info = path.split("/")[-3]
+    accountName = gzh_info.split("_")[-1]
+    ghId = gzh_info.replace("_" + accountName, "")
+    baseInfo = info.get("BaseInfo", {})
+    appMsgId = info.get("AppMsg", {}).get("BaseInfo", {}).get("AppMsgId", None)
+    createTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
+    updateTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("UpdateTime", None)
+    Type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
+    detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
+    if detail_article_list:
+        for article in detail_article_list:
+            title = article.get("Title", None)
+            Digest = article.get("Digest", None)
+            ItemIndex = article.get("ItemIndex", None)
+            ContentUrl = article.get("ContentUrl", None)
+            SourceUrl = article.get("SourceUrl", None)
+            CoverImgUrl = article.get("CoverImgUrl", None)
+            CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
+            CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
+            ItemShowType = article.get("ItemShowType", None)
+            IsOriginal = article.get("IsOriginal", None)
+            ShowDesc = article.get("ShowDesc", None)
+            ori_content = article.get("ori_content", None)
+            show_view_count = article.get("show_view_count", 0)
+            show_like_count = article.get("show_like_count", 0)
+            show_zs_count = article.get("show_zs_count", 0)
+            show_pay_count = article.get("show_pay_count", 0)
+            # Pull the article's unique sn token out of the URL; guard against
+            # URLs that carry no sn parameter (the bare split would raise).
+            wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl and "&sn=" in ContentUrl else None
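+            # A more robust alternative (a sketch; assumes standard
+            # query-string URLs, not something the original script does):
+            #   from urllib.parse import urlparse, parse_qs
+            #   wx_sn = parse_qs(urlparse(ContentUrl).query).get("sn", [None])[0]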
+            info_tuple = (
+                ghId,
+                accountName,
+                appMsgId,
+                title,
+                Type,
+                createTime,
+                updateTime,
+                Digest,
+                ItemIndex,
+                ContentUrl,
+                SourceUrl,
+                CoverImgUrl,
+                CoverImgUrl_1_1,
+                CoverImgUrl_235_1,
+                ItemShowType,
+                IsOriginal,
+                ShowDesc,
+                ori_content,
+                show_view_count,
+                show_like_count,
+                show_zs_count,
+                show_pay_count,
+                wx_sn,
+                json.dumps(baseInfo, ensure_ascii=False)
+            )
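+            # NOTE: the tuple order must match the column list in insert_sql below.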
+            connection = pymysql.connect(
+                host='rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
+                port=3306,
+                user='crawler',
+                password='crawler123456@',
+                db='piaoquan-crawler',
+                charset='utf8mb4'
+            )
+            insert_sql = """
+                INSERT INTO official_articles
+                (ghId, accountName, appMsgId, title, Type, createTime, updateTime,
+                 Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl,
+                 CoverImgUrl_1_1, CoverImgUrl_235_1, ItemShowType, IsOriginal,
+                 ShowDesc, ori_content, show_view_count, show_like_count,
+                 show_zs_count, show_pay_count, wx_sn, baseInfo)
+                VALUES
+                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                """
+            cursor = connection.cursor()
+            cursor.execute(
+                insert_sql,
+                info_tuple
+            )
+            connection.commit()
+            # Close the per-article connection so repeated calls do not leak.
+            connection.close()
+
+
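+# A minimal sketch (an assumption, not part of the original flow): the loop
+# above opens one connection per article, which could instead be a single
+# shared connection plus cursor.executemany over the collected tuples:
+#
+#   connection = pymysql.connect(...)  # same parameters as above
+#   try:
+#       with connection.cursor() as cursor:
+#           cursor.executemany(insert_sql, info_tuples)  # same 24-column SQL
+#       connection.commit()
+#   finally:
+#       connection.close()
+
+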
+def get_file_list():
+    """
+    Collect the JSON file paths under each account's msg directory.
+    :return: a list of per-account file-path lists
+    """
+    path = 'account'
+    dirs = os.listdir(path)
+    sub_dirs = [os.path.join(path, i, "msg") for i in dirs]
+    file_groups = []
+    for sub_dir in sub_dirs:
+        try:
+            file_list = os.listdir(sub_dir)
+            file_path_list = [os.path.join(sub_dir, i) for i in file_list]
+            file_groups.append(file_path_list)
+        except OSError:
+            # Skip entries without a readable msg directory.
+            continue
+    return file_groups
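+
+# Example of the expected layout (hypothetical account names):
+#   account/<ghId>_<accountName>/msg/*.json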
+
+
+if __name__ == '__main__':
+    file_list = get_file_list()
+    total_files = 0
+    for files in tqdm(file_list):
+        total_files += len(files)
+        # with ThreadPoolExecutor(max_workers=10) as pool:
+        #     pool.map(insert_into_mysql, files)
+    print(total_files)
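+
+# Usage note: running the module as-is only counts the JSON files;
+# uncommenting the ThreadPoolExecutor block above performs the inserts
+# (max_workers=10 is the original setting).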