charset.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. MBLENGTH = {8: 1, 33: 3, 88: 2, 91: 2}
  2. class Charset:
  3. def __init__(self, id, name, collation, is_default):
  4. self.id, self.name, self.collation = id, name, collation
  5. self.is_default = is_default == "Yes"
  6. def __repr__(self):
  7. return "Charset(id=%s, name=%r, collation=%r)" % (
  8. self.id,
  9. self.name,
  10. self.collation,
  11. )
  12. @property
  13. def encoding(self):
  14. name = self.name
  15. if name in ("utf8mb4", "utf8mb3"):
  16. return "utf8"
  17. if name == "latin1":
  18. return "cp1252"
  19. if name == "koi8r":
  20. return "koi8_r"
  21. if name == "koi8u":
  22. return "koi8_u"
  23. return name
  24. @property
  25. def is_binary(self):
  26. return self.id == 63
  27. class Charsets:
  28. def __init__(self):
  29. self._by_id = {}
  30. self._by_name = {}
  31. def add(self, c):
  32. self._by_id[c.id] = c
  33. if c.is_default:
  34. self._by_name[c.name] = c
  35. def by_id(self, id):
  36. return self._by_id[id]
  37. def by_name(self, name):
  38. return self._by_name.get(name.lower())
  39. _charsets = Charsets()
  40. """
  41. Generated with:
  42. mysql -N -s -e "select id, character_set_name, collation_name, is_default
  43. from information_schema.collations order by id;" | python -c "import sys
  44. for l in sys.stdin.readlines():
  45. id, name, collation, is_default = l.split(chr(9))
  46. print '_charsets.add(Charset(%s, \'%s\', \'%s\', \'%s\'))' \
  47. % (id, name, collation, is_default.strip())
  48. "
  49. """
  50. _charsets.add(Charset(1, "big5", "big5_chinese_ci", "Yes"))
  51. _charsets.add(Charset(2, "latin2", "latin2_czech_cs", ""))
  52. _charsets.add(Charset(3, "dec8", "dec8_swedish_ci", "Yes"))
  53. _charsets.add(Charset(4, "cp850", "cp850_general_ci", "Yes"))
  54. _charsets.add(Charset(5, "latin1", "latin1_german1_ci", ""))
  55. _charsets.add(Charset(6, "hp8", "hp8_english_ci", "Yes"))
  56. _charsets.add(Charset(7, "koi8r", "koi8r_general_ci", "Yes"))
  57. _charsets.add(Charset(8, "latin1", "latin1_swedish_ci", "Yes"))
  58. _charsets.add(Charset(9, "latin2", "latin2_general_ci", "Yes"))
  59. _charsets.add(Charset(10, "swe7", "swe7_swedish_ci", "Yes"))
  60. _charsets.add(Charset(11, "ascii", "ascii_general_ci", "Yes"))
  61. _charsets.add(Charset(12, "ujis", "ujis_japanese_ci", "Yes"))
  62. _charsets.add(Charset(13, "sjis", "sjis_japanese_ci", "Yes"))
  63. _charsets.add(Charset(14, "cp1251", "cp1251_bulgarian_ci", ""))
  64. _charsets.add(Charset(15, "latin1", "latin1_danish_ci", ""))
  65. _charsets.add(Charset(16, "hebrew", "hebrew_general_ci", "Yes"))
  66. _charsets.add(Charset(18, "tis620", "tis620_thai_ci", "Yes"))
  67. _charsets.add(Charset(19, "euckr", "euckr_korean_ci", "Yes"))
  68. _charsets.add(Charset(20, "latin7", "latin7_estonian_cs", ""))
  69. _charsets.add(Charset(21, "latin2", "latin2_hungarian_ci", ""))
  70. _charsets.add(Charset(22, "koi8u", "koi8u_general_ci", "Yes"))
  71. _charsets.add(Charset(23, "cp1251", "cp1251_ukrainian_ci", ""))
  72. _charsets.add(Charset(24, "gb2312", "gb2312_chinese_ci", "Yes"))
  73. _charsets.add(Charset(25, "greek", "greek_general_ci", "Yes"))
  74. _charsets.add(Charset(26, "cp1250", "cp1250_general_ci", "Yes"))
  75. _charsets.add(Charset(27, "latin2", "latin2_croatian_ci", ""))
  76. _charsets.add(Charset(28, "gbk", "gbk_chinese_ci", "Yes"))
  77. _charsets.add(Charset(29, "cp1257", "cp1257_lithuanian_ci", ""))
  78. _charsets.add(Charset(30, "latin5", "latin5_turkish_ci", "Yes"))
  79. _charsets.add(Charset(31, "latin1", "latin1_german2_ci", ""))
  80. _charsets.add(Charset(32, "armscii8", "armscii8_general_ci", "Yes"))
  81. _charsets.add(Charset(33, "utf8", "utf8_general_ci", "Yes"))
  82. _charsets.add(Charset(34, "cp1250", "cp1250_czech_cs", ""))
  83. _charsets.add(Charset(36, "cp866", "cp866_general_ci", "Yes"))
  84. _charsets.add(Charset(37, "keybcs2", "keybcs2_general_ci", "Yes"))
  85. _charsets.add(Charset(38, "macce", "macce_general_ci", "Yes"))
  86. _charsets.add(Charset(39, "macroman", "macroman_general_ci", "Yes"))
  87. _charsets.add(Charset(40, "cp852", "cp852_general_ci", "Yes"))
  88. _charsets.add(Charset(41, "latin7", "latin7_general_ci", "Yes"))
  89. _charsets.add(Charset(42, "latin7", "latin7_general_cs", ""))
  90. _charsets.add(Charset(43, "macce", "macce_bin", ""))
  91. _charsets.add(Charset(44, "cp1250", "cp1250_croatian_ci", ""))
  92. _charsets.add(Charset(45, "utf8mb4", "utf8mb4_general_ci", "Yes"))
  93. _charsets.add(Charset(46, "utf8mb4", "utf8mb4_bin", ""))
  94. _charsets.add(Charset(47, "latin1", "latin1_bin", ""))
  95. _charsets.add(Charset(48, "latin1", "latin1_general_ci", ""))
  96. _charsets.add(Charset(49, "latin1", "latin1_general_cs", ""))
  97. _charsets.add(Charset(50, "cp1251", "cp1251_bin", ""))
  98. _charsets.add(Charset(51, "cp1251", "cp1251_general_ci", "Yes"))
  99. _charsets.add(Charset(52, "cp1251", "cp1251_general_cs", ""))
  100. _charsets.add(Charset(53, "macroman", "macroman_bin", ""))
  101. _charsets.add(Charset(57, "cp1256", "cp1256_general_ci", "Yes"))
  102. _charsets.add(Charset(58, "cp1257", "cp1257_bin", ""))
  103. _charsets.add(Charset(59, "cp1257", "cp1257_general_ci", "Yes"))
  104. _charsets.add(Charset(63, "binary", "binary", "Yes"))
  105. _charsets.add(Charset(64, "armscii8", "armscii8_bin", ""))
  106. _charsets.add(Charset(65, "ascii", "ascii_bin", ""))
  107. _charsets.add(Charset(66, "cp1250", "cp1250_bin", ""))
  108. _charsets.add(Charset(67, "cp1256", "cp1256_bin", ""))
  109. _charsets.add(Charset(68, "cp866", "cp866_bin", ""))
  110. _charsets.add(Charset(69, "dec8", "dec8_bin", ""))
  111. _charsets.add(Charset(70, "greek", "greek_bin", ""))
  112. _charsets.add(Charset(71, "hebrew", "hebrew_bin", ""))
  113. _charsets.add(Charset(72, "hp8", "hp8_bin", ""))
  114. _charsets.add(Charset(73, "keybcs2", "keybcs2_bin", ""))
  115. _charsets.add(Charset(74, "koi8r", "koi8r_bin", ""))
  116. _charsets.add(Charset(75, "koi8u", "koi8u_bin", ""))
  117. _charsets.add(Charset(76, "utf8", "utf8_tolower_ci", ""))
  118. _charsets.add(Charset(77, "latin2", "latin2_bin", ""))
  119. _charsets.add(Charset(78, "latin5", "latin5_bin", ""))
  120. _charsets.add(Charset(79, "latin7", "latin7_bin", ""))
  121. _charsets.add(Charset(80, "cp850", "cp850_bin", ""))
  122. _charsets.add(Charset(81, "cp852", "cp852_bin", ""))
  123. _charsets.add(Charset(82, "swe7", "swe7_bin", ""))
  124. _charsets.add(Charset(83, "utf8", "utf8_bin", ""))
  125. _charsets.add(Charset(84, "big5", "big5_bin", ""))
  126. _charsets.add(Charset(85, "euckr", "euckr_bin", ""))
  127. _charsets.add(Charset(86, "gb2312", "gb2312_bin", ""))
  128. _charsets.add(Charset(87, "gbk", "gbk_bin", ""))
  129. _charsets.add(Charset(88, "sjis", "sjis_bin", ""))
  130. _charsets.add(Charset(89, "tis620", "tis620_bin", ""))
  131. _charsets.add(Charset(91, "ujis", "ujis_bin", ""))
  132. _charsets.add(Charset(92, "geostd8", "geostd8_general_ci", "Yes"))
  133. _charsets.add(Charset(93, "geostd8", "geostd8_bin", ""))
  134. _charsets.add(Charset(94, "latin1", "latin1_spanish_ci", ""))
  135. _charsets.add(Charset(95, "cp932", "cp932_japanese_ci", "Yes"))
  136. _charsets.add(Charset(96, "cp932", "cp932_bin", ""))
  137. _charsets.add(Charset(97, "eucjpms", "eucjpms_japanese_ci", "Yes"))
  138. _charsets.add(Charset(98, "eucjpms", "eucjpms_bin", ""))
  139. _charsets.add(Charset(99, "cp1250", "cp1250_polish_ci", ""))
  140. _charsets.add(Charset(192, "utf8", "utf8_unicode_ci", ""))
  141. _charsets.add(Charset(193, "utf8", "utf8_icelandic_ci", ""))
  142. _charsets.add(Charset(194, "utf8", "utf8_latvian_ci", ""))
  143. _charsets.add(Charset(195, "utf8", "utf8_romanian_ci", ""))
  144. _charsets.add(Charset(196, "utf8", "utf8_slovenian_ci", ""))
  145. _charsets.add(Charset(197, "utf8", "utf8_polish_ci", ""))
  146. _charsets.add(Charset(198, "utf8", "utf8_estonian_ci", ""))
  147. _charsets.add(Charset(199, "utf8", "utf8_spanish_ci", ""))
  148. _charsets.add(Charset(200, "utf8", "utf8_swedish_ci", ""))
  149. _charsets.add(Charset(201, "utf8", "utf8_turkish_ci", ""))
  150. _charsets.add(Charset(202, "utf8", "utf8_czech_ci", ""))
  151. _charsets.add(Charset(203, "utf8", "utf8_danish_ci", ""))
  152. _charsets.add(Charset(204, "utf8", "utf8_lithuanian_ci", ""))
  153. _charsets.add(Charset(205, "utf8", "utf8_slovak_ci", ""))
  154. _charsets.add(Charset(206, "utf8", "utf8_spanish2_ci", ""))
  155. _charsets.add(Charset(207, "utf8", "utf8_roman_ci", ""))
  156. _charsets.add(Charset(208, "utf8", "utf8_persian_ci", ""))
  157. _charsets.add(Charset(209, "utf8", "utf8_esperanto_ci", ""))
  158. _charsets.add(Charset(210, "utf8", "utf8_hungarian_ci", ""))
  159. _charsets.add(Charset(211, "utf8", "utf8_sinhala_ci", ""))
  160. _charsets.add(Charset(212, "utf8", "utf8_german2_ci", ""))
  161. _charsets.add(Charset(213, "utf8", "utf8_croatian_ci", ""))
  162. _charsets.add(Charset(214, "utf8", "utf8_unicode_520_ci", ""))
  163. _charsets.add(Charset(215, "utf8", "utf8_vietnamese_ci", ""))
  164. _charsets.add(Charset(223, "utf8", "utf8_general_mysql500_ci", ""))
  165. _charsets.add(Charset(224, "utf8mb4", "utf8mb4_unicode_ci", ""))
  166. _charsets.add(Charset(225, "utf8mb4", "utf8mb4_icelandic_ci", ""))
  167. _charsets.add(Charset(226, "utf8mb4", "utf8mb4_latvian_ci", ""))
  168. _charsets.add(Charset(227, "utf8mb4", "utf8mb4_romanian_ci", ""))
  169. _charsets.add(Charset(228, "utf8mb4", "utf8mb4_slovenian_ci", ""))
  170. _charsets.add(Charset(229, "utf8mb4", "utf8mb4_polish_ci", ""))
  171. _charsets.add(Charset(230, "utf8mb4", "utf8mb4_estonian_ci", ""))
  172. _charsets.add(Charset(231, "utf8mb4", "utf8mb4_spanish_ci", ""))
  173. _charsets.add(Charset(232, "utf8mb4", "utf8mb4_swedish_ci", ""))
  174. _charsets.add(Charset(233, "utf8mb4", "utf8mb4_turkish_ci", ""))
  175. _charsets.add(Charset(234, "utf8mb4", "utf8mb4_czech_ci", ""))
  176. _charsets.add(Charset(235, "utf8mb4", "utf8mb4_danish_ci", ""))
  177. _charsets.add(Charset(236, "utf8mb4", "utf8mb4_lithuanian_ci", ""))
  178. _charsets.add(Charset(237, "utf8mb4", "utf8mb4_slovak_ci", ""))
  179. _charsets.add(Charset(238, "utf8mb4", "utf8mb4_spanish2_ci", ""))
  180. _charsets.add(Charset(239, "utf8mb4", "utf8mb4_roman_ci", ""))
  181. _charsets.add(Charset(240, "utf8mb4", "utf8mb4_persian_ci", ""))
  182. _charsets.add(Charset(241, "utf8mb4", "utf8mb4_esperanto_ci", ""))
  183. _charsets.add(Charset(242, "utf8mb4", "utf8mb4_hungarian_ci", ""))
  184. _charsets.add(Charset(243, "utf8mb4", "utf8mb4_sinhala_ci", ""))
  185. _charsets.add(Charset(244, "utf8mb4", "utf8mb4_german2_ci", ""))
  186. _charsets.add(Charset(245, "utf8mb4", "utf8mb4_croatian_ci", ""))
  187. _charsets.add(Charset(246, "utf8mb4", "utf8mb4_unicode_520_ci", ""))
  188. _charsets.add(Charset(247, "utf8mb4", "utf8mb4_vietnamese_ci", ""))
  189. _charsets.add(Charset(248, "gb18030", "gb18030_chinese_ci", "Yes"))
  190. _charsets.add(Charset(249, "gb18030", "gb18030_bin", ""))
  191. _charsets.add(Charset(250, "gb18030", "gb18030_unicode_520_ci", ""))
  192. _charsets.add(Charset(255, "utf8mb4", "utf8mb4_0900_ai_ci", ""))
  193. charset_by_name = _charsets.by_name
  194. charset_by_id = _charsets.by_id