tokenizer.json 4.5 KB

  1. {
  2. "version": "1.0",
  3. "truncation": null,
  4. "padding": null,
  5. "added_tokens": [],
  6. "normalizer": {
  7. "type": "Replace",
  8. "pattern": {
  9. "Regex": "[^ !\"$',.:;?A-Za-z\u00a1\u00ab\u00bb\u00bf\u00e6\u00e7\u00f0\u00f8\u0127\u014b\u0153\u01c0-\u01c3\u0250-\u0268\u026a-\u0276\u0278-\u027b\u027d\u027e\u0280-\u0284\u0288-\u0292\u0294\u0295\u0298\u0299\u029b-\u029d\u029f\u02a1\u02a2\u02a4\u02a7\u02b0-\u02b2\u02b4\u02b7\u02bc\u02c8\u02cc\u02d0\u02d1\u02de\u02e0\u02e4\u0329\u03b2\u03b8\u03c7\u1d7b\u2014\u201c\u201d\u2026\u2191-\u2193\u2197\u2198\u2c71]"
  10. },
  11. "content": ""
  12. },
  13. "pre_tokenizer": {
  14. "type": "Split",
  15. "pattern": {
  16. "Regex": ""
  17. },
  18. "behavior": "Isolated",
  19. "invert": false
  20. },
  21. "post_processor": {
  22. "type": "TemplateProcessing",
  23. "single": [
  24. {
  25. "SpecialToken": {
  26. "id": "$",
  27. "type_id": 0
  28. }
  29. },
  30. {
  31. "Sequence": {
  32. "id": "A",
  33. "type_id": 0
  34. }
  35. },
  36. {
  37. "SpecialToken": {
  38. "id": "$",
  39. "type_id": 0
  40. }
  41. }
  42. ],
  43. "special_tokens": {
  44. "$": {
  45. "id": "$",
  46. "ids": [
  47. 0
  48. ],
  49. "tokens": [
  50. "$"
  51. ]
  52. }
  53. }
  54. },
  55. "decoder": null,
  56. "model": {
  57. "vocab": {
  58. "$": 0,
  59. ";": 1,
  60. ":": 2,
  61. ",": 3,
  62. ".": 4,
  63. "!": 5,
  64. "?": 6,
  65. "\u00a1": 7,
  66. "\u00bf": 8,
  67. "\u2014": 9,
  68. "\u2026": 10,
  69. "\"": 11,
  70. "\u00ab": 12,
  71. "\u00bb": 13,
  72. "\u201c": 14,
  73. "\u201d": 15,
  74. " ": 16,
  75. "A": 17,
  76. "B": 18,
  77. "C": 19,
  78. "D": 20,
  79. "E": 21,
  80. "F": 22,
  81. "G": 23,
  82. "H": 24,
  83. "I": 25,
  84. "J": 26,
  85. "K": 27,
  86. "L": 28,
  87. "M": 29,
  88. "N": 30,
  89. "O": 31,
  90. "P": 32,
  91. "Q": 33,
  92. "R": 34,
  93. "S": 35,
  94. "T": 36,
  95. "U": 37,
  96. "V": 38,
  97. "W": 39,
  98. "X": 40,
  99. "Y": 41,
  100. "Z": 42,
  101. "a": 43,
  102. "b": 44,
  103. "c": 45,
  104. "d": 46,
  105. "e": 47,
  106. "f": 48,
  107. "g": 49,
  108. "h": 50,
  109. "i": 51,
  110. "j": 52,
  111. "k": 53,
  112. "l": 54,
  113. "m": 55,
  114. "n": 56,
  115. "o": 57,
  116. "p": 58,
  117. "q": 59,
  118. "r": 60,
  119. "s": 61,
  120. "t": 62,
  121. "u": 63,
  122. "v": 64,
  123. "w": 65,
  124. "x": 66,
  125. "y": 67,
  126. "z": 68,
  127. "\u0251": 69,
  128. "\u0250": 70,
  129. "\u0252": 71,
  130. "\u00e6": 72,
  131. "\u0253": 73,
  132. "\u0299": 74,
  133. "\u03b2": 75,
  134. "\u0254": 76,
  135. "\u0255": 77,
  136. "\u00e7": 78,
  137. "\u0257": 79,
  138. "\u0256": 80,
  139. "\u00f0": 81,
  140. "\u02a4": 82,
  141. "\u0259": 83,
  142. "\u0258": 84,
  143. "\u025a": 85,
  144. "\u025b": 86,
  145. "\u025c": 87,
  146. "\u025d": 88,
  147. "\u025e": 89,
  148. "\u025f": 90,
  149. "\u0284": 91,
  150. "\u0261": 92,
  151. "\u0260": 93,
  152. "\u0262": 94,
  153. "\u029b": 95,
  154. "\u0266": 96,
  155. "\u0267": 97,
  156. "\u0127": 98,
  157. "\u0265": 99,
  158. "\u029c": 100,
  159. "\u0268": 101,
  160. "\u026a": 102,
  161. "\u029d": 103,
  162. "\u026d": 104,
  163. "\u026c": 105,
  164. "\u026b": 106,
  165. "\u026e": 107,
  166. "\u029f": 108,
  167. "\u0271": 109,
  168. "\u026f": 110,
  169. "\u0270": 111,
  170. "\u014b": 112,
  171. "\u0273": 113,
  172. "\u0272": 114,
  173. "\u0274": 115,
  174. "\u00f8": 116,
  175. "\u0275": 117,
  176. "\u0278": 118,
  177. "\u03b8": 119,
  178. "\u0153": 120,
  179. "\u0276": 121,
  180. "\u0298": 122,
  181. "\u0279": 123,
  182. "\u027a": 124,
  183. "\u027e": 125,
  184. "\u027b": 126,
  185. "\u0280": 127,
  186. "\u0281": 128,
  187. "\u027d": 129,
  188. "\u0282": 130,
  189. "\u0283": 131,
  190. "\u0288": 132,
  191. "\u02a7": 133,
  192. "\u0289": 134,
  193. "\u028a": 135,
  194. "\u028b": 136,
  195. "\u2c71": 137,
  196. "\u028c": 138,
  197. "\u0263": 139,
  198. "\u0264": 140,
  199. "\u028d": 141,
  200. "\u03c7": 142,
  201. "\u028e": 143,
  202. "\u028f": 144,
  203. "\u0291": 145,
  204. "\u0290": 146,
  205. "\u0292": 147,
  206. "\u0294": 148,
  207. "\u02a1": 149,
  208. "\u0295": 150,
  209. "\u02a2": 151,
  210. "\u01c0": 152,
  211. "\u01c1": 153,
  212. "\u01c2": 154,
  213. "\u01c3": 155,
  214. "\u02c8": 156,
  215. "\u02cc": 157,
  216. "\u02d0": 158,
  217. "\u02d1": 159,
  218. "\u02bc": 160,
  219. "\u02b4": 161,
  220. "\u02b0": 162,
  221. "\u02b1": 163,
  222. "\u02b2": 164,
  223. "\u02b7": 165,
  224. "\u02e0": 166,
  225. "\u02e4": 167,
  226. "\u02de": 168,
  227. "\u2193": 169,
  228. "\u2191": 170,
  229. "\u2192": 171,
  230. "\u2197": 172,
  231. "\u2198": 173,
  232. "\u0329": 175,
  233. "'": 176,
  234. "\u1d7b": 177
  235. }
  236. }
  237. }