import os

# Tsinghua dataset (THCHS-30): data processing.
# Builds "<audio path>|<transcription>|<speaker index>" filelists.


def _is_audio_entry(filename):
    """Return True if *filename* is neither a ``.trn`` file nor a hidden file."""
    return ".trn" not in filename and not filename.startswith('.')


def _build_speaker_index(root_data_dir):
    """Map every speaker prefix found under *root_data_dir* to an integer.

    The speaker prefix is the part of the filename before the first ``_``.
    Prefixes are sorted before numbering so the mapping is identical on
    every run; numbering a bare ``set`` depends on string hash
    randomization and can change between interpreter invocations.
    """
    speaker_ids = set()
    for _dirpath, _dirnames, filenames in os.walk(root_data_dir):
        for filename in filenames:
            if _is_audio_entry(filename):
                speaker_ids.add(filename.split('_')[0])
    return {speaker: index for index, speaker in enumerate(sorted(speaker_ids))}


def generatefilelist(datadir, root_data_dir, spearkerid_2_num):
    """Write ``./thchs30_<name>_filelist.txt`` for the files under *datadir*.

    ``<name>`` is the last path component of *datadir*. For every file that
    is neither hidden nor a ``.trn`` file, the sibling ``<file>.trn`` is
    read; its first line is a path in which ``..`` is replaced by
    *root_data_dir* to locate the real transcript. The first line of that
    transcript, with spaces removed, becomes the transcription.

    Each output line has the form ``<path>|<transcription>|<speaker index>``.

    Args:
        datadir: Directory to scan (recursively) for audio files.
        root_data_dir: Replacement for ``..`` in the pointer stored in
            each ``.trn`` file.
        spearkerid_2_num: Mapping from speaker prefix (filename part before
            the first ``_``) to the value written in the third column.

    Raises:
        KeyError: If a speaker prefix is missing from *spearkerid_2_num*.
        OSError: If a ``.trn`` file or the transcript it points to cannot
            be opened.
    """
    # normpath drops a trailing separator so "…/train/" still yields "train".
    dat = os.path.basename(os.path.normpath(datadir))
    # Transcripts contain Chinese text: pin the encoding instead of relying
    # on the platform's locale default.
    with open('./thchs30_%s_filelist.txt' % dat, 'w', encoding='utf-8') as f:
        for dirpath, _dirnames, filenames in os.walk(datadir):
            for filename in filenames:
                if not _is_audio_entry(filename):
                    continue
                spearkerid = spearkerid_2_num[filename.split('_')[0]]
                # Join with dirpath (not datadir) so files found in
                # subdirectories by os.walk get their real path.
                strpath = os.path.join(dirpath, filename)
                transpath = strpath + '.trn'
                with open(transpath, 'r', encoding='utf-8') as t:
                    transcript = t.readline().strip().replace('..', root_data_dir)
                with open(transcript, 'r', encoding='utf-8') as t1:
                    transcript_nor = t1.readline()
                transcription = transcript_nor.strip().replace(' ', '')
                res = strpath + '|' + transcription + '|' + str(spearkerid) + '\n'
                f.write(res)


def main():
    """Generate the train and test filelists for the hard-coded corpus paths."""
    train_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/train'
    test_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/test'
    root_data_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/'
    spearkerid_2_num = _build_speaker_index(root_data_dir)
    generatefilelist(train_dir, root_data_dir, spearkerid_2_num)
    generatefilelist(test_dir, root_data_dir, spearkerid_2_num)


if __name__ == '__main__':
    main()