diff options
author | default <nobody@localhost> | 2023-08-03 08:42:38 +0200 |
---|---|---|
committer | default <nobody@localhost> | 2023-08-03 08:42:38 +0200 |
commit | 2137d2f13310aca3cef6a0fc7735fdf4aac53e8c (patch) | |
tree | 69d426ca8e14ff509a9cdd94d443b81bf975e7a0 /xs_unicode.h | |
parent | d455280705700cf75b4b7a59218cf48d4684ae68 (diff) |
Backport from xs.
Diffstat (limited to 'xs_unicode.h')
-rw-r--r-- | xs_unicode.h | 131 |
1 file changed, 119 insertions, 12 deletions
```diff
diff --git a/xs_unicode.h b/xs_unicode.h
index d45b52e..48cd660 100644
--- a/xs_unicode.h
+++ b/xs_unicode.h
@@ -5,8 +5,15 @@
 #define _XS_UNICODE_H
 
  xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
- char *xs_utf8_dec(const char *str, unsigned int *cpoint);
-
+ unsigned int xs_utf8_dec(char **str);
+ unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
+ unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
+ #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
+ #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
+ unsigned int xs_unicode_to_upper(unsigned int cpoint);
+ unsigned int xs_unicode_to_lower(unsigned int cpoint);
+ int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
+ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
 
 #ifdef XS_IMPLEMENTATION
 
@@ -50,46 +57,146 @@ xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
 }
 
 
-char *xs_utf8_dec(const char *str, unsigned int *cpoint)
-/* decodes an utf-8 char inside str into cpoint and returns the next position */
+unsigned int xs_utf8_dec(char **str)
+/* decodes an utf-8 char inside str and updates the pointer */
 {
-    unsigned char *p = (unsigned char *)str;
+    unsigned char *p = (unsigned char *)*str;
+    unsigned int cpoint = 0;
     int c = *p++;
     int cb = 0;
 
     if ((c & 0x80) == 0) { /* 1 byte char */
-        *cpoint = c;
+        cpoint = c;
     }
     else
     if ((c & 0xe0) == 0xc0) { /* 2 byte char */
-        *cpoint = (c & 0x1f) << 6;
+        cpoint = (c & 0x1f) << 6;
         cb = 1;
     }
     else
     if ((c & 0xf0) == 0xe0) { /* 3 byte char */
-        *cpoint = (c & 0x0f) << 12;
+        cpoint = (c & 0x0f) << 12;
         cb = 2;
     }
     else
     if ((c & 0xf8) == 0xf0) { /* 4 byte char */
-        *cpoint = (c & 0x07) << 18;
+        cpoint = (c & 0x07) << 18;
         cb = 3;
     }
 
     /* process the continuation bytes */
     while (cb--) {
         if ((*p & 0xc0) == 0x80)
-            *cpoint |= (*p++ & 0x3f) << (cb * 6);
+            cpoint |= (*p++ & 0x3f) << (cb * 6);
         else {
-            *cpoint = 0xfffd;
+            cpoint = 0xfffd;
             break;
         }
     }
 
-    return (char *)p;
+    *str = (char *)p;
+    return cpoint;
+}
+
+
+#ifdef _XS_UNICODE_TBL_H
+
+/* include xs_unicode_tbl.h before to use these functions */
+
+static int int_cmp(const void *p1, const void *p2)
+{
+    const unsigned int *a = p1;
+    const unsigned int *b = p2;
+
+    return *a < *b ? -1 : *a > *b ? 1 : 0;
+}
+
+
+unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
+/* searches for an uppercase codepoint in the case fold table */
+{
+    return bsearch(&cpoint, xs_unicode_case_fold_table,
+                   sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
+                   sizeof(unsigned int) * 2,
+                   int_cmp);
+}
+
+
+unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
+/* searches for a lowercase codepoint in the case fold table */
+{
+    unsigned int *p = xs_unicode_case_fold_table + 1;
+    unsigned int *e = xs_unicode_case_fold_table +
+        sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
+
+    while (p < e) {
+        if (cpoint == *p)
+            return p;
+
+        p += 2;
+    }
+
+    return NULL;
+}
+
+
+unsigned int xs_unicode_to_upper(unsigned int cpoint)
+/* returns the cpoint to uppercase */
+{
+    unsigned int *p = _xs_unicode_lower_search(cpoint);
+
+    return p == NULL ? cpoint : p[-1];
+}
+
+
+unsigned int xs_unicode_to_lower(unsigned int cpoint)
+/* returns the cpoint to lowercase */
+{
+    unsigned int *p = _xs_unicode_upper_search(cpoint);
+
+    return p == NULL ? cpoint : p[1];
 }
 
 
+int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
+/* applies unicode Normalization Form D */
+{
+    unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
+                sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
+                sizeof(unsigned int) * 3,
+                int_cmp);
+
+    if (r != NULL) {
+        *base = r[1];
+        *diac = r[2];
+    }
+
+    return !!r;
+}
+
+
+int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
+/* applies unicode Normalization Form C */
+{
+    unsigned int *p = xs_unicode_nfd_table;
+    unsigned int *e = xs_unicode_nfd_table +
+        sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
+
+    while (p < e) {
+        if (p[1] == base && p[2] == diac) {
+            *cpoint = p[0];
+            return 1;
+        }
+
+        p += 3;
+    }
+
+    return 0;
+}
+
+
+#endif /* _XS_UNICODE_TBL_H */
+
 #endif /* XS_IMPLEMENTATION */
 
 #endif /* _XS_UNICODE_H */
```