__init__.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. """ from https://github.com/keithito/tacotron """
  2. import re
  3. import random
  4. from text import cleaners
  5. from text.symbols import symbols
  6. from pypinyin import pinyin, lazy_pinyin, Style
  7. import phkit
  8. # Mappings from symbol to numeric ID and vice versa:
  9. _symbol_to_id = {s: i for i, s in enumerate(symbols)}
  10. _id_to_symbol = {i: s for i, s in enumerate(symbols)}
  11. # Regular expression matching text enclosed in curly braces:
  12. _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
  13. _words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)")
  14. _chinese_words_re = re.compile(r'[\u4e00-\u9fa5]+')
  15. def get_arpabet(word, dictionary):
  16. word_arpabet = dictionary.lookup(word)
  17. if word_arpabet is not None:
  18. return "{" + word_arpabet[0] + "}"
  19. else:
  20. return word
  21. def text_to_sequence(text, cleaner_names, dictionary=None, p_arpabet=1.0):
  22. '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
  23. The text can optionally have ARPAbet sequences enclosed in curly braces embedded
  24. in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
  25. Args:
  26. text: string to convert to a sequence
  27. cleaner_names: names of the cleaner functions to run the text through
  28. dictionary: arpab俄et class with arpabet dictionary
  29. Returns:
  30. List of integers corresponding to the symbols in the text
  31. '''
  32. sequence = []
  33. # Check for curly braces and treat their contents as ARPAbet:
  34. while len(text):
  35. m = _curly_re.match(text)
  36. if not m:
  37. clean_text = _clean_text(text, cleaner_names)
  38. if dictionary is not None:
  39. words = _words_re.findall(text)
  40. clean_text = [
  41. get_arpabet(word[0], dictionary)
  42. if ((word[0] != '') and random.random() < p_arpabet) else word[1]
  43. for word in words]
  44. for i in range(len(clean_text)):
  45. t = clean_text[i]
  46. if t.startswith("{"):
  47. sequence += _arpabet_to_sequence(t[1:-1])
  48. else:
  49. sequence += _symbols_to_sequence(t)
  50. #sequence += space
  51. else:
  52. sequence += _symbols_to_sequence(clean_text)
  53. break
  54. sequence += text_to_sequence(m.group(1), cleaner_names, dictionary, p_arpabet)
  55. sequence += _arpabet_to_sequence(m.group(2))
  56. text = m.group(3)
  57. return sequence
  58. def chinese_text_to_phoneme_sequence(text):
  59. '''
  60. convert chinese words to phoneme , phkit toolkit implement
  61. chinese word normalize and change pitch for continuous chinese word
  62. '''
  63. sequence = []
  64. while len(text):
  65. sequence = phkit.text2sequence(text)[:-3]
  66. break
  67. return sequence
  68. def sequence_to_text(sequence):
  69. '''Converts a sequence of IDs back to a string'''
  70. result = ''
  71. for symbol_id in sequence:
  72. if symbol_id in _id_to_symbol:
  73. s = _id_to_symbol[symbol_id]
  74. # Enclose ARPAbet back in curly braces:
  75. if len(s) > 1 and s[0] == '@':
  76. s = '{%s}' % s[1:]
  77. result += s
  78. return result.replace('}{', ' ')
  79. def _clean_text(text, cleaner_names):
  80. for name in cleaner_names:
  81. cleaner = getattr(cleaners, name)
  82. if not cleaner:
  83. raise Exception('Unknown cleaner: %s' % name)
  84. text = cleaner(text)
  85. return text
  86. def _symbols_to_sequence(symbols):
  87. return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
  88. def _arpabet_to_sequence(text):
  89. return _symbols_to_sequence(['@' + s for s in text.split()])
  90. def _should_keep_symbol(s):
  91. return s in _symbol_to_id and s is not '_' and s is not '~'