123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350 |
- // utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
- // 10 november 2016
- #include "utf.h"
- // this code imitates Go's unicode/utf8 and unicode/utf16
- // the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
- // it is also an imitation so we can license it under looser terms than the Go source
#define badrune 0xFFFD

// utf8EncodeRune() writes the UTF-8 encoding of rune to encoded and returns
// the number of bytes written (1 to 4). No terminating NUL is written.
// encoded must have room for at least 4 bytes.
// Values above 0x10FFFF and surrogates (0xD800 to 0xDFFF) cannot be encoded;
// badrune is encoded in their place.
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	uint8_t lead;
	size_t n;
	size_t i;

	// out of Unicode range, or a surrogate: substitute the replacement rune
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune < 0xE000))
		rune = badrune;

	if (rune < 0x80) {
		// ASCII bytes represent themselves
		encoded[0] = (char) (uint8_t) rune;
		return 1;
	}

	// choose the sequence length and the marker bits of the lead byte
	if (rune < 0x800) {
		n = 2;
		lead = 0xC0;
	} else if (rune < 0x10000) {
		n = 3;
		lead = 0xE0;
	} else {
		n = 4;
		lead = 0xF0;
	}

	// continuation bytes carry 6 bits each, least significant group last,
	// so fill them from the end of the sequence toward the front
	for (i = n - 1; i > 0; i--) {
		encoded[i] = (char) (uint8_t) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}
	// what is left of rune now fits in the payload bits of the lead byte
	encoded[0] = (char) (uint8_t) (lead | rune);
	return n;
}
- const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
- {
- uint8_t b, c;
- uint8_t lowestAllowed, highestAllowed;
- size_t i, expected;
- int bad;
- b = (uint8_t) (*s);
- if (b < 0x80) { // ASCII bytes represent themselves
- *rune = b;
- s++;
- return s;
- }
- // 0xC0 and 0xC1 cover 2-byte overlong equivalents
- // 0xF5 to 0xFD cover values > 0x10FFFF
- // 0xFE and 0xFF were never defined (always illegal)
- if (b < 0xC2 || b > 0xF4) { // invalid
- *rune = badrune;
- s++;
- return s;
- }
- // this determines the range of allowed first continuation bytes
- lowestAllowed = 0x80;
- highestAllowed = 0xBF;
- switch (b) {
- case 0xE0:
- // disallow 3-byte overlong equivalents
- lowestAllowed = 0xA0;
- break;
- case 0xED:
- // disallow surrogate characters
- highestAllowed = 0x9F;
- break;
- case 0xF0:
- // disallow 4-byte overlong equivalents
- lowestAllowed = 0x90;
- break;
- case 0xF4:
- // disallow values > 0x10FFFF
- highestAllowed = 0x8F;
- break;
- }
- // and this determines how many continuation bytes are expected
- expected = 1;
- if (b >= 0xE0)
- expected++;
- if (b >= 0xF0)
- expected++;
- if (nElem != 0) { // are there enough bytes?
- nElem--;
- if (nElem < expected) { // nope
- *rune = badrune;
- s++;
- return s;
- }
- }
- // ensure that everything is correct
- // if not, **only** consume the initial byte
- bad = 0;
- for (i = 0; i < expected; i++) {
- c = (uint8_t) (s[1 + i]);
- if (c < lowestAllowed || c > highestAllowed) {
- bad = 1;
- break;
- }
- // the old lowestAllowed and highestAllowed is only for the first continuation byte
- lowestAllowed = 0x80;
- highestAllowed = 0xBF;
- }
- if (bad) {
- *rune = badrune;
- s++;
- return s;
- }
- // now do the topmost bits
- if (b < 0xE0)
- *rune = b & 0x1F;
- else if (b < 0xF0)
- *rune = b & 0x0F;
- else
- *rune = b & 0x07;
- s++; // we can finally move on
- // now do the continuation bytes
- for (; expected; expected--) {
- c = (uint8_t) (*s);
- s++;
- c &= 0x3F; // strip continuation bits
- *rune <<= 6;
- *rune |= c;
- }
- return s;
- }
- // encoded must have at most 2 elements
- size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
- {
- uint16_t low, high;
- // not in the valid range for Unicode
- if (rune > 0x10FFFF)
- rune = badrune;
- // surrogate runes cannot be encoded
- if (rune >= 0xD800 && rune < 0xE000)
- rune = badrune;
- if (rune < 0x10000) {
- encoded[0] = (uint16_t) rune;
- return 1;
- }
- rune -= 0x10000;
- low = (uint16_t) (rune & 0x3FF);
- rune >>= 10;
- high = (uint16_t) (rune & 0x3FF);
- encoded[0] = high | 0xD800;
- encoded[1] = low | 0xDC00;
- return 2;
- }
- // TODO see if this can be cleaned up somehow
- const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
- {
- uint16_t high, low;
- if (*s < 0xD800 || *s >= 0xE000) {
- // self-representing character
- *rune = *s;
- s++;
- return s;
- }
- if (*s >= 0xDC00) {
- // out-of-order surrogates
- *rune = badrune;
- s++;
- return s;
- }
- if (nElem == 1) { // not enough elements
- *rune = badrune;
- s++;
- return s;
- }
- high = *s;
- high &= 0x3FF;
- if (s[1] < 0xDC00 || s[1] >= 0xE000) {
- // bad surrogate pair
- *rune = badrune;
- s++;
- return s;
- }
- s++;
- low = *s;
- s++;
- low &= 0x3FF;
- *rune = high;
- *rune <<= 10;
- *rune |= low;
- *rune += 0x10000;
- return s;
- }
- // TODO find a way to reduce the code in all of these somehow
- // TODO find a way to remove u as well
// utf8RuneCount() returns the number of runes that utf8DecodeRune() yields
// from s; each invalid lead byte counts as one rune.
// If nElem is nonzero, exactly nElem bytes are examined.
// If nElem is 0, s is read up to (not including) the first zero byte.
size_t utf8RuneCount(const char *s, size_t nElem)
{
	const char *end;
	size_t count = 0;
	uint32_t rune;

	if (nElem == 0) {
		// NUL-terminated input
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &rune);
			count++;
		}
		return count;
	}

	// counted input: the decoder is told how many bytes remain, so it
	// never advances past end
	end = s + nElem;
	while (s != end) {
		s = utf8DecodeRune(s, (size_t) (end - s), &rune);
		count++;
	}
	return count;
}
// utf8UTF16Count() returns the number of 16-bit elements that
// utf16EncodeRune() produces for the runes decoded from the UTF-8 input s.
// No terminating element is counted.
// If nElem is nonzero, exactly nElem bytes are examined.
// If nElem is 0, s is read up to (not including) the first zero byte.
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	const char *end;
	size_t count = 0;
	uint32_t rune;
	uint16_t scratch[2];		// the encoded output is discarded; only its length matters

	if (nElem == 0) {
		// NUL-terminated input
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &rune);
			count += utf16EncodeRune(rune, scratch);
		}
		return count;
	}

	// counted input: the decoder is told how many bytes remain, so it
	// never advances past end
	end = s + nElem;
	while (s != end) {
		s = utf8DecodeRune(s, (size_t) (end - s), &rune);
		count += utf16EncodeRune(rune, scratch);
	}
	return count;
}
// utf16RuneCount() returns the number of runes that utf16DecodeRune()
// yields from s; each unpaired surrogate counts as one rune.
// If nElem is nonzero, exactly nElem elements are examined.
// If nElem is 0, s is read up to (not including) the first zero element.
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	const uint16_t *end;
	size_t count = 0;
	uint32_t rune;

	if (nElem == 0) {
		// zero-terminated input
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &rune);
			count++;
		}
		return count;
	}

	// counted input: the decoder is told how many elements remain, so it
	// never advances past end
	end = s + nElem;
	while (s != end) {
		s = utf16DecodeRune(s, (size_t) (end - s), &rune);
		count++;
	}
	return count;
}
// utf16UTF8Count() returns the number of bytes that utf8EncodeRune()
// produces for the runes decoded from the UTF-16 input s.
// No terminating byte is counted.
// If nElem is nonzero, exactly nElem elements are examined.
// If nElem is 0, s is read up to (not including) the first zero element.
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	const uint16_t *end;
	size_t count = 0;
	uint32_t rune;
	char scratch[4];		// the encoded output is discarded; only its length matters

	if (nElem == 0) {
		// zero-terminated input
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &rune);
			count += utf8EncodeRune(rune, scratch);
		}
		return count;
	}

	// counted input: the decoder is told how many elements remain, so it
	// never advances past end
	end = s + nElem;
	while (s != end) {
		s = utf16DecodeRune(s, (size_t) (end - s), &rune);
		count += utf8EncodeRune(rune, scratch);
	}
	return count;
}
|