12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
- import json
- import pymysql
- import pandas as pd
# Connection settings for the crawler MySQL instance.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from configuration or the environment instead.
_DB_SETTINGS = {
    'host': 'rm-bp1159bu17li9hi94.mysql.rds.aliyuncs.com',
    'port': 3306,
    'user': 'crawler',
    'password': 'crawler123456@',
    'db': 'piaoquan-crawler',
    'charset': 'utf8mb4',
}
connection = pymysql.connect(**_DB_SETTINGS)

# Column names are taken from the header of a previously exported CSV.
_header_frame = pd.read_csv("piaoquan_crawler_long_articles_video.csv")
columns = _header_frame.columns.tolist()
def extract_desired_key(json_str):
    """
    Extract the root source id from a JSON payload's ``productionPath``.

    The payload is parsed as JSON and its ``productionPath`` string is split
    on the URL-encoded marker ``rootSourceId%3D``; the segment immediately
    after the first marker is returned.

    :param json_str: JSON text to parse; ``None`` or an empty value is allowed.
    :return: the extracted segment, or ``None`` when the input is empty, is
        not valid JSON, is not a JSON object, has no string
        ``productionPath``, or the path does not contain the marker.
    """
    if not json_str:
        return None
    try:
        data = json.loads(json_str)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers inputs that
        # are not str/bytes at all.
        return None
    if not isinstance(data, dict):
        return None
    path = data.get('productionPath')
    if not isinstance(path, str):
        return None
    parts = path.split("rootSourceId%3D")
    # Without the marker, split() yields a single element and there is no id.
    return parts[1] if len(parts) > 1 else None
# Rows of long_articles_video for the two listed accounts updated after
# 2024-07-04. A plain string is used because the query has no placeholders.
sql = """
select * from long_articles_video where account_name in ("小阳看天下", "小惠爱厨房") and update_time > "2024-07-04";
"""

# Always release the connection, even if the query fails part-way through.
try:
    with connection.cursor() as cursor:
        cursor.execute(sql)
        result = cursor.fetchall()
finally:
    connection.close()

# Positions (counted from the end of each row) whose JSON payload is replaced
# by the root source id extracted from it.
# NOTE(review): presumably these are the card/production-path columns;
# confirm against the table schema.
_JSON_COLUMN_OFFSETS = (-5, -6, -7)

rows = []
for record in result:
    fields = list(record)
    for offset in _JSON_COLUMN_OFFSETS:
        raw_value = fields[offset]
        # Empty / NULL cells stay None; everything else goes through the
        # shared extraction helper.
        fields[offset] = extract_desired_key(raw_value) if raw_value else None
    print(fields)
    rows.append(fields)

df = pd.DataFrame(rows, columns=columns)
df.to_excel("root_source_id.xlsx", index=False)
|