@@ -79,22 +79,17 @@ def get_infer_reader(input_var, config):
 
 
 def get_file_list(data_path, config):
-    logger.info("data_path: {}".format(data_path))
-
-    cmd = "test -e {}".format(data_path)
-    ret, out = hdfs_client._run_cmd(cmd, redirect_stderr=True, retry_times=1)
-    logger.info("ret: {} out: {}".format(ret, out))
-    cmd = "ls " + data_path
-    ret, lines = hdfs_client._run_cmd(cmd)
-    logger.info("ret: {} lines: {}".format(ret, lines))
-
     dirs,file_list = hdfs_client.ls_dir(data_path)
     # If splitting the file list is specified in the config
     if config.get("runner.split_file_list"):
         logger.info("Split file list for worker {}".format(dist.get_rank()))
         file_list = get_file_shard(file_list)
     logger.info("File list: {}".format(file_list))
-    return file_list
+
+    base_url = f'hdfs://{configs["fs.default.name"]}'
+    full_paths = [base_url + file for file in file_list]
+
+    return full_paths
 
 
 def get_reader_generator(path, reader_name="Reader"):
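
Note that the added `base_url` line reads from `configs`, not from the `config` argument: this is presumably the module-level dict that was used to construct `hdfs_client`, which follows the usual PaddlePaddle `HDFSClient` setup pattern. A sketch of that assumed setup (the Hadoop path, endpoint, and credentials below are placeholders):

```python
from paddle.distributed.fleet.utils import HDFSClient

# Assumed module-level setup with placeholder values: the same `configs`
# dict that parameterizes the client is later reused in get_file_list to
# build the hdfs:// URL prefix.
configs = {
    "fs.default.name": "namenode.example.com:9000",
    "hadoop.job.ugi": "user,password",
}
hdfs_client = HDFSClient("/path/to/hadoop", configs)
```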
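`get_file_shard` is not shown in this hunk. A common way to implement such a helper is round-robin sharding by worker rank, so that each worker reads a disjoint subset of the files; the sketch below assumes exactly that and is not necessarily the repository's actual implementation:

```python
import paddle.distributed as dist

def get_file_shard(file_list):
    # Hypothetical round-robin sharding: worker i takes every
    # world_size-th file starting at offset i, so the shards are
    # disjoint and together cover the whole list.
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    return file_list[rank::world_size]
```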
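The new return value prepends the cluster endpoint to every entry by plain string concatenation, which only yields well-formed URLs if the entries returned by `ls_dir` begin with `/`. A small illustration of the resulting behavior, with hypothetical endpoint and file names:

```python
# Hypothetical config and file list, for illustration only.
configs = {"fs.default.name": "namenode.example.com:9000"}
file_list = ["/user/data/part-00000", "/user/data/part-00001"]

base_url = f'hdfs://{configs["fs.default.name"]}'
full_paths = [base_url + file for file in file_list]

print(full_paths)
# ['hdfs://namenode.example.com:9000/user/data/part-00000',
#  'hdfs://namenode.example.com:9000/user/data/part-00001']
```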