QinghuaAudioProcess.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import os
  2. ## 清华数据集 数据处理
  3. def generatefilelist(datadir,root_data_dir,spearkerid_2_num):
  4. dat = datadir.split('/')[-1]
  5. with open ('./thchs30_%s_filelist.txt'%dat,'w') as f:
  6. for dirpath, dirnames, filenames in os.walk(datadir):
  7. for filename in filenames:
  8. if(".trn" not in filename and not filename.startswith('.')):
  9. spearkerid = spearkerid_2_num[filename.split('_')[0]]
  10. strpath = os.path.join(datadir,filename)
  11. transpath = os.path.join(datadir,filename) + '.trn'
  12. with open(transpath,'r') as t:
  13. transcript = t.readline().strip().replace('..',root_data_dir)
  14. with open(transcript,'r') as t1:
  15. transcript_nor = t1.readline()
  16. transcription = transcript_nor.strip().replace(' ','')
  17. res = strpath + '|' + transcription + '|' + str(spearkerid)+'\n'
  18. f.write(res)
  19. if __name__ == '__main__':
  20. train_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/train'
  21. test_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/test'
  22. root_data_dir = '/Users/tzld/Downloads/THCHS-30/data_thchs30/'
  23. spearkerids = []
  24. for dirpath, dirnames, filenames in os.walk(root_data_dir):
  25. for filename in filenames:
  26. if (".trn" not in filename and not filename.startswith('.')):
  27. spearkerids.append(filename.split('_')[0])
  28. spearkerid_2_num = {j: i for i, j in enumerate(set(spearkerids))}
  29. generatefilelist(train_dir,root_data_dir,spearkerid_2_num)
  30. generatefilelist(test_dir,root_data_dir,spearkerid_2_num)