config.json 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. {
  2. "istftnet": {
  3. "upsample_kernel_sizes": [20, 12],
  4. "upsample_rates": [10, 6],
  5. "gen_istft_hop_size": 5,
  6. "gen_istft_n_fft": 20,
  7. "resblock_dilation_sizes": [
  8. [1, 3, 5],
  9. [1, 3, 5],
  10. [1, 3, 5]
  11. ],
  12. "resblock_kernel_sizes": [3, 7, 11],
  13. "upsample_initial_channel": 512
  14. },
  15. "dim_in": 64,
  16. "dropout": 0.2,
  17. "hidden_dim": 512,
  18. "max_conv_dim": 512,
  19. "max_dur": 50,
  20. "multispeaker": true,
  21. "n_layer": 3,
  22. "n_mels": 80,
  23. "n_token": 178,
  24. "style_dim": 128,
  25. "text_encoder_kernel_size": 5,
  26. "plbert": {
  27. "hidden_size": 768,
  28. "num_attention_heads": 12,
  29. "intermediate_size": 2048,
  30. "max_position_embeddings": 512,
  31. "num_hidden_layers": 12,
  32. "dropout": 0.1
  33. },
  34. "vocab": {
  35. ";": 1,
  36. ":": 2,
  37. ",": 3,
  38. ".": 4,
  39. "!": 5,
  40. "?": 6,
  41. "—": 9,
  42. "…": 10,
  43. "\"": 11,
  44. "(": 12,
  45. ")": 13,
  46. "“": 14,
  47. "”": 15,
  48. " ": 16,
  49. "\u0303": 17,
  50. "ʣ": 18,
  51. "ʥ": 19,
  52. "ʦ": 20,
  53. "ʨ": 21,
  54. "ᵝ": 22,
  55. "\uAB67": 23,
  56. "A": 24,
  57. "I": 25,
  58. "O": 31,
  59. "Q": 33,
  60. "S": 35,
  61. "T": 36,
  62. "W": 39,
  63. "Y": 41,
  64. "ᵊ": 42,
  65. "a": 43,
  66. "b": 44,
  67. "c": 45,
  68. "d": 46,
  69. "e": 47,
  70. "f": 48,
  71. "h": 50,
  72. "i": 51,
  73. "j": 52,
  74. "k": 53,
  75. "l": 54,
  76. "m": 55,
  77. "n": 56,
  78. "o": 57,
  79. "p": 58,
  80. "q": 59,
  81. "r": 60,
  82. "s": 61,
  83. "t": 62,
  84. "u": 63,
  85. "v": 64,
  86. "w": 65,
  87. "x": 66,
  88. "y": 67,
  89. "z": 68,
  90. "ɑ": 69,
  91. "ɐ": 70,
  92. "ɒ": 71,
  93. "æ": 72,
  94. "β": 75,
  95. "ɔ": 76,
  96. "ɕ": 77,
  97. "ç": 78,
  98. "ɖ": 80,
  99. "ð": 81,
  100. "ʤ": 82,
  101. "ə": 83,
  102. "ɚ": 85,
  103. "ɛ": 86,
  104. "ɜ": 87,
  105. "ɟ": 90,
  106. "ɡ": 92,
  107. "ɥ": 99,
  108. "ɨ": 101,
  109. "ɪ": 102,
  110. "ʝ": 103,
  111. "ɯ": 110,
  112. "ɰ": 111,
  113. "ŋ": 112,
  114. "ɳ": 113,
  115. "ɲ": 114,
  116. "ɴ": 115,
  117. "ø": 116,
  118. "ɸ": 118,
  119. "θ": 119,
  120. "œ": 120,
  121. "ɹ": 123,
  122. "ɾ": 125,
  123. "ɻ": 126,
  124. "ʁ": 128,
  125. "ɽ": 129,
  126. "ʂ": 130,
  127. "ʃ": 131,
  128. "ʈ": 132,
  129. "ʧ": 133,
  130. "ʊ": 135,
  131. "ʋ": 136,
  132. "ʌ": 138,
  133. "ɣ": 139,
  134. "ɤ": 140,
  135. "χ": 142,
  136. "ʎ": 143,
  137. "ʒ": 147,
  138. "ʔ": 148,
  139. "ˈ": 156,
  140. "ˌ": 157,
  141. "ː": 158,
  142. "ʰ": 162,
  143. "ʲ": 164,
  144. "↓": 169,
  145. "→": 171,
  146. "↗": 172,
  147. "↘": 173,
  148. "ᵻ": 177
  149. }
  150. }