// utf.c
  1. // utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
  2. // 10 november 2016
  3. #include "utf.h"
  4. // this code imitates Go's unicode/utf8 and unicode/utf16
  5. // the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
  6. // it is also an imitation so we can license it under looser terms than the Go source
  7. #define badrune 0xFFFD
  8. // encoded must be at most 4 bytes
  9. // TODO clean this code up somehow
  10. size_t utf8EncodeRune(uint32_t rune, char *encoded)
  11. {
  12. uint8_t b;
  13. uint8_t c = 0;
  14. uint8_t d = 0;
  15. uint8_t e = 0;
  16. size_t n;
  17. // not in the valid range for Unicode
  18. if (rune > 0x10FFFF)
  19. rune = badrune;
  20. // surrogate runes cannot be encoded
  21. if (rune >= 0xD800 && rune < 0xE000)
  22. rune = badrune;
  23. if (rune < 0x80) { // ASCII bytes represent themselves
  24. b = (uint8_t) (rune & 0xFF);
  25. n = 1;
  26. goto done;
  27. }
  28. if (rune < 0x800) { // two-byte encoding
  29. c = (uint8_t) (rune & 0x3F);
  30. c |= 0x80;
  31. rune >>= 6;
  32. b = (uint8_t) (rune & 0x1F);
  33. b |= 0xC0;
  34. n = 2;
  35. goto done;
  36. }
  37. if (rune < 0x10000) { // three-byte encoding
  38. d = (uint8_t) (rune & 0x3F);
  39. d |= 0x80;
  40. rune >>= 6;
  41. c = (uint8_t) (rune & 0x3F);
  42. c |= 0x80;
  43. rune >>= 6;
  44. b = (uint8_t) (rune & 0x0F);
  45. b |= 0xE0;
  46. n = 3;
  47. goto done;
  48. }
  49. // otherwise use a four-byte encoding
  50. e = (uint8_t) (rune & 0x3F);
  51. e |= 0x80;
  52. rune >>= 6;
  53. d = (uint8_t) (rune & 0x3F);
  54. d |= 0x80;
  55. rune >>= 6;
  56. c = (uint8_t) (rune & 0x3F);
  57. c |= 0x80;
  58. rune >>= 6;
  59. b = (uint8_t) (rune & 0x07);
  60. b |= 0xF0;
  61. n = 4;
  62. done:
  63. encoded[0] = b;
  64. if (n > 1)
  65. encoded[1] = c;
  66. if (n > 2)
  67. encoded[2] = d;
  68. if (n > 3)
  69. encoded[3] = e;
  70. return n;
  71. }
  72. const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
  73. {
  74. uint8_t b, c;
  75. uint8_t lowestAllowed, highestAllowed;
  76. size_t i, expected;
  77. int bad;
  78. b = (uint8_t) (*s);
  79. if (b < 0x80) { // ASCII bytes represent themselves
  80. *rune = b;
  81. s++;
  82. return s;
  83. }
  84. // 0xC0 and 0xC1 cover 2-byte overlong equivalents
  85. // 0xF5 to 0xFD cover values > 0x10FFFF
  86. // 0xFE and 0xFF were never defined (always illegal)
  87. if (b < 0xC2 || b > 0xF4) { // invalid
  88. *rune = badrune;
  89. s++;
  90. return s;
  91. }
  92. // this determines the range of allowed first continuation bytes
  93. lowestAllowed = 0x80;
  94. highestAllowed = 0xBF;
  95. switch (b) {
  96. case 0xE0:
  97. // disallow 3-byte overlong equivalents
  98. lowestAllowed = 0xA0;
  99. break;
  100. case 0xED:
  101. // disallow surrogate characters
  102. highestAllowed = 0x9F;
  103. break;
  104. case 0xF0:
  105. // disallow 4-byte overlong equivalents
  106. lowestAllowed = 0x90;
  107. break;
  108. case 0xF4:
  109. // disallow values > 0x10FFFF
  110. highestAllowed = 0x8F;
  111. break;
  112. }
  113. // and this determines how many continuation bytes are expected
  114. expected = 1;
  115. if (b >= 0xE0)
  116. expected++;
  117. if (b >= 0xF0)
  118. expected++;
  119. if (nElem != 0) { // are there enough bytes?
  120. nElem--;
  121. if (nElem < expected) { // nope
  122. *rune = badrune;
  123. s++;
  124. return s;
  125. }
  126. }
  127. // ensure that everything is correct
  128. // if not, **only** consume the initial byte
  129. bad = 0;
  130. for (i = 0; i < expected; i++) {
  131. c = (uint8_t) (s[1 + i]);
  132. if (c < lowestAllowed || c > highestAllowed) {
  133. bad = 1;
  134. break;
  135. }
  136. // the old lowestAllowed and highestAllowed is only for the first continuation byte
  137. lowestAllowed = 0x80;
  138. highestAllowed = 0xBF;
  139. }
  140. if (bad) {
  141. *rune = badrune;
  142. s++;
  143. return s;
  144. }
  145. // now do the topmost bits
  146. if (b < 0xE0)
  147. *rune = b & 0x1F;
  148. else if (b < 0xF0)
  149. *rune = b & 0x0F;
  150. else
  151. *rune = b & 0x07;
  152. s++; // we can finally move on
  153. // now do the continuation bytes
  154. for (; expected; expected--) {
  155. c = (uint8_t) (*s);
  156. s++;
  157. c &= 0x3F; // strip continuation bits
  158. *rune <<= 6;
  159. *rune |= c;
  160. }
  161. return s;
  162. }
  163. // encoded must have at most 2 elements
  164. size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
  165. {
  166. uint16_t low, high;
  167. // not in the valid range for Unicode
  168. if (rune > 0x10FFFF)
  169. rune = badrune;
  170. // surrogate runes cannot be encoded
  171. if (rune >= 0xD800 && rune < 0xE000)
  172. rune = badrune;
  173. if (rune < 0x10000) {
  174. encoded[0] = (uint16_t) rune;
  175. return 1;
  176. }
  177. rune -= 0x10000;
  178. low = (uint16_t) (rune & 0x3FF);
  179. rune >>= 10;
  180. high = (uint16_t) (rune & 0x3FF);
  181. encoded[0] = high | 0xD800;
  182. encoded[1] = low | 0xDC00;
  183. return 2;
  184. }
  185. // TODO see if this can be cleaned up somehow
  186. const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
  187. {
  188. uint16_t high, low;
  189. if (*s < 0xD800 || *s >= 0xE000) {
  190. // self-representing character
  191. *rune = *s;
  192. s++;
  193. return s;
  194. }
  195. if (*s >= 0xDC00) {
  196. // out-of-order surrogates
  197. *rune = badrune;
  198. s++;
  199. return s;
  200. }
  201. if (nElem == 1) { // not enough elements
  202. *rune = badrune;
  203. s++;
  204. return s;
  205. }
  206. high = *s;
  207. high &= 0x3FF;
  208. if (s[1] < 0xDC00 || s[1] >= 0xE000) {
  209. // bad surrogate pair
  210. *rune = badrune;
  211. s++;
  212. return s;
  213. }
  214. s++;
  215. low = *s;
  216. s++;
  217. low &= 0x3FF;
  218. *rune = high;
  219. *rune <<= 10;
  220. *rune |= low;
  221. *rune += 0x10000;
  222. return s;
  223. }
  224. // TODO find a way to reduce the code in all of these somehow
  225. // TODO find a way to remove u as well
  226. size_t utf8RuneCount(const char *s, size_t nElem)
  227. {
  228. size_t len;
  229. uint32_t rune;
  230. if (nElem != 0) {
  231. const char *t, *u;
  232. len = 0;
  233. t = s;
  234. while (nElem != 0) {
  235. u = utf8DecodeRune(t, nElem, &rune);
  236. len++;
  237. nElem -= u - t;
  238. t = u;
  239. }
  240. return len;
  241. }
  242. len = 0;
  243. while (*s) {
  244. s = utf8DecodeRune(s, nElem, &rune);
  245. len++;
  246. }
  247. return len;
  248. }
  249. size_t utf8UTF16Count(const char *s, size_t nElem)
  250. {
  251. size_t len;
  252. uint32_t rune;
  253. uint16_t encoded[2];
  254. if (nElem != 0) {
  255. const char *t, *u;
  256. len = 0;
  257. t = s;
  258. while (nElem != 0) {
  259. u = utf8DecodeRune(t, nElem, &rune);
  260. len += utf16EncodeRune(rune, encoded);
  261. nElem -= u - t;
  262. t = u;
  263. }
  264. return len;
  265. }
  266. len = 0;
  267. while (*s) {
  268. s = utf8DecodeRune(s, nElem, &rune);
  269. len += utf16EncodeRune(rune, encoded);
  270. }
  271. return len;
  272. }
  273. size_t utf16RuneCount(const uint16_t *s, size_t nElem)
  274. {
  275. size_t len;
  276. uint32_t rune;
  277. if (nElem != 0) {
  278. const uint16_t *t, *u;
  279. len = 0;
  280. t = s;
  281. while (nElem != 0) {
  282. u = utf16DecodeRune(t, nElem, &rune);
  283. len++;
  284. nElem -= u - t;
  285. t = u;
  286. }
  287. return len;
  288. }
  289. len = 0;
  290. while (*s) {
  291. s = utf16DecodeRune(s, nElem, &rune);
  292. len++;
  293. }
  294. return len;
  295. }
  296. size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
  297. {
  298. size_t len;
  299. uint32_t rune;
  300. char encoded[4];
  301. if (nElem != 0) {
  302. const uint16_t *t, *u;
  303. len = 0;
  304. t = s;
  305. while (nElem != 0) {
  306. u = utf16DecodeRune(t, nElem, &rune);
  307. len += utf8EncodeRune(rune, encoded);
  308. nElem -= u - t;
  309. t = u;
  310. }
  311. return len;
  312. }
  313. len = 0;
  314. while (*s) {
  315. s = utf16DecodeRune(s, nElem, &rune);
  316. len += utf8EncodeRune(rune, encoded);
  317. }
  318. return len;
  319. }