|
@@ -89,12 +89,12 @@ def get_file_list(data_path, config, file_extensions=['.gz']):
|
|
continue
|
|
continue
|
|
all_files.append(file)
|
|
all_files.append(file)
|
|
|
|
|
|
- print(sub_dirs,all_files)
|
|
|
|
|
|
+ #print(sub_dirs,all_files)
|
|
# 如果配置中指定了分割文件列表
|
|
# 如果配置中指定了分割文件列表
|
|
if config.get("runner.split_file_list"):
|
|
if config.get("runner.split_file_list"):
|
|
logger.info("Split file list for worker {}".format(dist.get_rank()))
|
|
logger.info("Split file list for worker {}".format(dist.get_rank()))
|
|
all_files = fleet.util.get_file_shard(all_files)
|
|
all_files = fleet.util.get_file_shard(all_files)
|
|
- logger.info("File list: {}".format(all_files))
|
|
|
|
|
|
+ logger.info("File list: {}".format(sub_dirs))
|
|
|
|
|
|
base_url = f'{configs["fs.default.name"]}'
|
|
base_url = f'{configs["fs.default.name"]}'
|
|
full_paths = [base_url + file for file in all_files]
|
|
full_paths = [base_url + file for file in all_files]
|