123456789101112131415161718192021222324252627282930313233343536 |
- import os
# Data preparation for the Tsinghua THCHS-30 speech dataset
def generatefilelist(datadir,root_data_dir,spearkerid_2_num):
    """Write a ``path|text|speaker_id`` file list for one data split.

    Walks ``datadir`` and, for every file that is neither hidden nor a
    ``.trn`` file, looks for a sibling ``<file>.trn``.  The first line of
    that sibling is treated as a path to the real transcript file, with
    every ``..`` in it replaced by ``root_data_dir``.  The first line of the
    real transcript, with all spaces removed, becomes the text column.

    The result is written to ``./thchs30_<split>_filelist.txt`` in the
    current working directory, where ``<split>`` is the last path component
    of ``datadir``.  One line is written per audio file.

    Args:
        datadir: Directory of one split (e.g. ``.../train``).
        root_data_dir: Replacement for ``..`` in the transcript pointer.
        spearkerid_2_num: Mapping from the filename prefix before the first
            ``_`` to the value written in the speaker-id column.

    Raises:
        KeyError: If a file's prefix is missing from ``spearkerid_2_num``.
        OSError: If a ``.trn`` pointer or the transcript it names cannot be
            opened.
    """
    # normpath drops a trailing separator so '/x/train/' still yields 'train'.
    dat = os.path.basename(os.path.normpath(datadir))
    # Transcripts contain Chinese text; pin UTF-8 instead of relying on the
    # platform's locale encoding.
    with open('./thchs30_%s_filelist.txt' % dat, 'w', encoding='utf-8') as f:
        for dirpath, dirnames, filenames in os.walk(datadir):
            # Sorting in place fixes the traversal order so the generated
            # list is reproducible across filesystems.
            dirnames.sort()
            for filename in sorted(filenames):
                if '.trn' in filename or filename.startswith('.'):
                    continue
                spearkerid = spearkerid_2_num[filename.split('_')[0]]
                # Join with dirpath (not datadir) so files found in nested
                # folders get their real location.
                strpath = os.path.join(dirpath, filename)
                with open(strpath + '.trn', 'r', encoding='utf-8') as t:
                    transcript_path = t.readline().strip().replace('..', root_data_dir)
                with open(transcript_path, 'r', encoding='utf-8') as t1:
                    transcription = t1.readline().strip().replace(' ', '')
                f.write(strpath + '|' + transcription + '|' + str(spearkerid) + '\n')
if __name__ == '__main__':
    # Hard-coded locations of a local THCHS-30 checkout.
    train_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/train'
    test_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/test'
    root_data_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/'

    # Collect the filename prefix (text before the first '_') of every
    # non-hidden, non-.trn file under the dataset root; generatefilelist
    # uses the same prefix as its speaker key.
    speaker_names = set()
    for dirpath, dirnames, filenames in os.walk(root_data_dir):
        for filename in filenames:
            if '.trn' not in filename and not filename.startswith('.'):
                speaker_names.add(filename.split('_')[0])

    # Sort before numbering: iteration order of a set of strings varies
    # between interpreter runs (hash randomization), which would give each
    # speaker a different id every time the script is executed.
    spearkerids = sorted(speaker_names)
    spearkerid_2_num = {name: idx for idx, name in enumerate(spearkerids)}

    generatefilelist(train_dir, root_data_dir, spearkerid_2_num)
    generatefilelist(test_dir, root_data_dir, spearkerid_2_num)
|