From bf435af788d387b3d97fd744e3b1f6a73795beb8 Mon Sep 17 00:00:00 2001
From: default
Date: Wed, 27 Dec 2023 12:54:38 +0100
Subject: Backport from xs.

---
Review notes (free text after the separator, not part of the commit message):

 * _xs_unicode_upper_search(): the top index of the binary search is now
   countof(xs_unicode_case_fold_table) / 2 - 1, i.e. the index of the last
   { upper, lower } pair. It was "/ 2 + 1", two entries past the end, so a
   lookup of a codepoint greater than every key in the table read beyond
   the array. The other binary searches added by this patch
   (xs_unicode_width, xs_unicode_nfd, xs_unicode_is_alpha) already use "- 1".

 * Line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch. Hunk line counts are unchanged;
   the index line below still names the blobs of the original commit.

 xs_unicode.h | 192 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 113 insertions(+), 79 deletions(-)

(limited to 'xs_unicode.h')

diff --git a/xs_unicode.h b/xs_unicode.h
index f5880f0..c666479 100644
--- a/xs_unicode.h
+++ b/xs_unicode.h
@@ -5,7 +5,6 @@
 #define _XS_UNICODE_H
 
  int _xs_utf8_enc(char buf[4], unsigned int cpoint);
- xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
  unsigned int xs_utf8_dec(char **str);
  int xs_unicode_width(unsigned int cpoint);
  int xs_is_surrogate(unsigned int cpoint);
@@ -21,13 +20,20 @@
  int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  int xs_unicode_is_alpha(unsigned int cpoint);
 
+#ifdef _XS_H
+ xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
+#endif
+
 #ifdef XS_IMPLEMENTATION
 
+#ifndef countof
+#define countof(a) (sizeof((a)) / sizeof((*a)))
+#endif
 
 int _xs_utf8_enc(char buf[4], unsigned int cpoint)
 /* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */
 {
-    unsigned char *p = (unsigned char *)buf;
+    char *p = buf;
 
     if (cpoint < 0x80) /* 1 byte char */
         *p++ = cpoint & 0xff;
@@ -48,27 +54,16 @@ int _xs_utf8_enc(char buf[4], unsigned int cpoint)
         *p++ = 0x80 | (cpoint & 0x3f);
     }
 
-    return p - (unsigned char *)buf;
-}
-
-
-xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
-/* encodes an Unicode codepoint to utf-8 into str */
-{
-    char tmp[4];
-
-    int c = _xs_utf8_enc(tmp, cpoint);
-
-    return xs_append_m(str, tmp, c);
+    return p - buf;
 }
 
 
 unsigned int xs_utf8_dec(char **str)
 /* decodes an utf-8 char inside str and updates the pointer */
 {
-    unsigned char *p = (unsigned char *)*str;
+    char *p = *str;
     unsigned int cpoint = 0;
-    int c = *p++;
+    unsigned char c = *p++;
     int cb = 0;
 
     if ((c & 0x80) == 0) { /* 1 byte char */
@@ -91,30 +86,19 @@ unsigned int xs_utf8_dec(char **str)
     }
 
     /* process the continuation bytes */
-    while (cb--) {
-        if ((*p & 0xc0) == 0x80)
-            cpoint |= (*p++ & 0x3f) << (cb * 6);
-        else {
-            cpoint = 0xfffd;
-            break;
-        }
-    }
+    while (cb > 0 && *p && (*p & 0xc0) == 0x80)
+        cpoint |= (*p++ & 0x3f) << (--cb * 6);
 
-    *str = (char *)p;
-    return cpoint;
-}
-
-
-static int int_range_cmp(const void *p1, const void *p2)
-{
-    const unsigned int *a = p1;
-    const unsigned int *b = p2;
+    /* incomplete or broken? */
+    if (cb)
+        cpoint = 0xfffd;
 
-    return *a < b[0] ? -1 : *a > b[1] ? 1 : 0;
+    *str = p;
+    return cpoint;
 }
 
 
-/* intentionally dead simple */
+/** Unicode character width: intentionally dead simple **/
 
 static unsigned int xs_unicode_width_table[] = {
     0x300, 0x36f, 0, /* diacritics */
@@ -132,12 +116,23 @@ static unsigned int xs_unicode_width_table[] = {
 int xs_unicode_width(unsigned int cpoint)
 /* returns the width in columns of a Unicode codepoint (somewhat simplified) */
 {
-    unsigned int *r = bsearch(&cpoint, xs_unicode_width_table,
-                    sizeof(xs_unicode_width_table) / (sizeof(unsigned int) * 3),
-                    sizeof(unsigned int) * 3,
-                    int_range_cmp);
+    int b = 0;
+    int t = countof(xs_unicode_width_table) / 3 - 1;
+
+    while (t >= b) {
+        int n = (b + t) / 2;
+        unsigned int *p = &xs_unicode_width_table[n * 3];
+
+        if (cpoint < p[0])
+            t = n - 1;
+        else
+        if (cpoint > p[1])
+            b = n + 1;
+        else
+            return p[2];
+    }
 
-    return r ? r[2] : 1;
+    return 1;
 }
 
 
@@ -167,38 +162,56 @@ unsigned int xs_surrogate_enc(unsigned int cpoint)
 }
 
 
-#ifdef _XS_UNICODE_TBL_H
-
-/* include xs_unicode_tbl.h before this one to use these functions */
+#ifdef _XS_H
 
-static int int_cmp(const void *p1, const void *p2)
+xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
+/* encodes an Unicode codepoint to utf-8 into str */
 {
-    const unsigned int *a = p1;
-    const unsigned int *b = p2;
+    char tmp[4];
+
+    int c = _xs_utf8_enc(tmp, cpoint);
 
-    return *a < *b ? -1 : *a > *b ? 1 : 0;
+    return xs_append_m(str, tmp, c);
 }
 
+#endif /* _XS_H */
+
+
+#ifdef _XS_UNICODE_TBL_H
+
+/* include xs_unicode_tbl.h before this one to use these functions */
 
 unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
 /* searches for an uppercase codepoint in the case fold table */
 {
-    return bsearch(&cpoint, xs_unicode_case_fold_table,
-                   sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
-                   sizeof(unsigned int) * 2,
-                   int_cmp);
+    int b = 0;
+    int t = countof(xs_unicode_case_fold_table) / 2 - 1;
+
+    while (t >= b) {
+        int n = (b + t) / 2;
+        unsigned int *p = &xs_unicode_case_fold_table[n * 2];
+
+        if (cpoint < p[0])
+            t = n - 1;
+        else
+        if (cpoint > p[0])
+            b = n + 1;
+        else
+            return p;
+    }
+
+    return NULL;
 }
 
 
 unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
 /* searches for a lowercase codepoint in the case fold table */
 {
-    unsigned int *p = xs_unicode_case_fold_table + 1;
-    unsigned int *e = xs_unicode_case_fold_table +
-            sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
+    unsigned int *p = xs_unicode_case_fold_table;
+    unsigned int *e = p + countof(xs_unicode_case_fold_table);
 
     while (p < e) {
-        if (cpoint == *p)
+        if (cpoint == p[1])
             return p;
 
         p += 2;
@@ -208,38 +221,49 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
 }
 
 
-unsigned int xs_unicode_to_upper(unsigned int cpoint)
-/* returns the cpoint to uppercase */
+unsigned int xs_unicode_to_lower(unsigned int cpoint)
+/* returns the cpoint to lowercase */
 {
-    unsigned int *p = _xs_unicode_lower_search(cpoint);
+    unsigned int *p = _xs_unicode_upper_search(cpoint);
 
-    return p == NULL ? cpoint : p[-1];
+    return p == NULL ? cpoint : p[1];
 }
 
 
-unsigned int xs_unicode_to_lower(unsigned int cpoint)
-/* returns the cpoint to lowercase */
+unsigned int xs_unicode_to_upper(unsigned int cpoint)
+/* returns the cpoint to uppercase */
 {
-    unsigned int *p = _xs_unicode_upper_search(cpoint);
+    unsigned int *p = _xs_unicode_lower_search(cpoint);
 
-    return p == NULL ? cpoint : p[1];
+    return p == NULL ? cpoint : p[0];
 }
 
 
 int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
 /* applies unicode Normalization Form D */
 {
-    unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
-                    sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
-                    sizeof(unsigned int) * 3,
-                    int_cmp);
-
-    if (r != NULL) {
-        *base = r[1];
-        *diac = r[2];
+    int b = 0;
+    int t = countof(xs_unicode_nfd_table) / 3 - 1;
+
+    while (t >= b) {
+        int n = (b + t) / 2;
+        unsigned int *p = &xs_unicode_nfd_table[n * 3];
+
+        int c = cpoint - p[0];
+
+        if (c < 0)
+            t = n - 1;
+        else
+        if (c > 0)
+            b = n + 1;
+        else {
+            *base = p[1];
+            *diac = p[2];
+            return 1;
+        }
     }
 
-    return !!r;
+    return 0;
 }
 
 
@@ -247,8 +271,7 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
 /* applies unicode Normalization Form C */
 {
     unsigned int *p = xs_unicode_nfd_table;
-    unsigned int *e = xs_unicode_nfd_table +
-            sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
+    unsigned int *e = p + countof(xs_unicode_nfd_table);
 
     while (p < e) {
         if (p[1] == base && p[2] == diac) {
@@ -266,12 +289,23 @@ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
 int xs_unicode_is_alpha(unsigned int cpoint)
 /* checks if a codepoint is an alpha (i.e. a letter) */
 {
-    unsigned int *r = bsearch(&cpoint, xs_unicode_alpha_table,
-                    sizeof(xs_unicode_alpha_table) / (sizeof(unsigned int) * 2),
-                    sizeof(unsigned int) * 2,
-                    int_range_cmp);
+    int b = 0;
+    int t = countof(xs_unicode_alpha_table) / 2 - 1;
+
+    while (t >= b) {
+        int n = (b + t) / 2;
+        unsigned int *p = &xs_unicode_alpha_table[n * 2];
+
+        if (cpoint < p[0])
+            t = n - 1;
+        else
+        if (cpoint > p[1])
+            b = n + 1;
+        else
+            return 1;
+    }
 
-    return !!r;
+    return 0;
 }
 
 
-- 
cgit v1.2.3