|
@@ -32,20 +32,13 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
|
|
|
|
|
|
def get_partition_df(table, dt):
|
|
|
logger.info(f"开始下载: {table} -- {dt} 的数据")
|
|
|
- df = pd.DataFrame()
|
|
|
- try:
|
|
|
- table_info = odps_client.get_table(table)
|
|
|
- col_names = [col.name for col in table_info.table_schema.columns]
|
|
|
- download_session = odps_client.get_download_session(table, dt)
|
|
|
- logger.info(f"表: {table} 中的分区 {dt}, 共有 {download_session.count} 条数据")
|
|
|
- with download_session.open_record_reader(0, download_session.count) as reader:
|
|
|
- records = []
|
|
|
- for record in reader:
|
|
|
- records.append(record.values) # 获取每一行的值
|
|
|
- # 使用元数据中的列名
|
|
|
- df = pd.DataFrame(records, columns=col_names)
|
|
|
- except Exception as e:
|
|
|
- logger.error(f"下载 {table} -- {dt} 的数据异常: ", e)
|
|
|
+
|
|
|
+ download_session = odps_client.get_download_session(table, dt)
|
|
|
+ logger.info(f"表: {table} 中的分区 {dt}, 共有 {download_session.count} 条数据")
|
|
|
+
|
|
|
+ with download_session.open_arrow_reader(0, download_session.count) as reader:
|
|
|
+ # 将所有数据加载到 DataFrame 中
|
|
|
+ df = pd.concat([batch.to_pandas() for batch in reader])
|
|
|
|
|
|
logger.info(f"下载结束: {table} -- {dt} 的数据, 共计 {df.shape[0]} 条数据")
|
|
|
return df
|