diff options
author | default <nobody@localhost> | 2024-08-23 17:22:10 +0200 |
---|---|---|
committer | default <nobody@localhost> | 2024-08-23 17:22:10 +0200 |
commit | 8586e44de92c827d6a19a7700121c8b21d3687b1 (patch) | |
tree | a0cc5a181a851f58a1cdea505bf1096970eb3e8d /xs_unicode.h | |
parent | d2daba7b9cf6282f0929f88a9d8a8467dab4e886 (diff) |
Some optimizations.
Diffstat (limited to 'xs_unicode.h')
-rw-r--r-- | xs_unicode.h | 116 |
1 files changed, 114 insertions, 2 deletions
diff --git a/xs_unicode.h b/xs_unicode.h index 2e9a754..a5a1dcb 100644 --- a/xs_unicode.h +++ b/xs_unicode.h @@ -9,6 +9,7 @@ unsigned int xs_utf8_dec(const char **str); int xs_unicode_width(unsigned int cpoint); int xs_is_surrogate(unsigned int cpoint); + int xs_is_diacritic(unsigned int cpoint); unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2); unsigned int xs_surrogate_enc(unsigned int cpoint); unsigned int *_xs_unicode_upper_search(unsigned int cpoint); @@ -22,7 +23,12 @@ int xs_unicode_is_alpha(unsigned int cpoint); #ifdef _XS_H + xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset); xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint); + xs_str *xs_utf8_to_upper(const char *str); + xs_str *xs_utf8_to_lower(const char *str); + xs_str *xs_utf8_to_nfd(const char *str); + xs_str *xs_utf8_to_nfc(const char *str); #endif #ifdef XS_IMPLEMENTATION @@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint) } +int xs_is_diacritic(unsigned int cpoint) +{ + return cpoint >= 0x300 && cpoint <= 0x36f; +} + + /** surrogate pairs **/ int xs_is_surrogate(unsigned int cpoint) @@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint) #ifdef _XS_H -xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) +xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset) /* encodes an Unicode codepoint to utf-8 into str */ { char tmp[4]; int c = xs_utf8_enc(tmp, cpoint); - return xs_append_m(str, tmp, c); + str = xs_insert_m(str, *offset, tmp, c); + + *offset += c; + + return str; +} + + +xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint) +/* encodes an Unicode codepoint to utf-8 into str */ +{ + int offset = strlen(str); + + return xs_utf8_insert(str, cpoint, &offset); } #endif /* _XS_H */ @@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint) unsigned int xs_unicode_to_lower(unsigned int cpoint) /* returns the cpoint to lowercase */ { + if (cpoint < 0x80) + return tolower(cpoint); + unsigned int *p = _xs_unicode_upper_search(cpoint); return p == NULL ? cpoint : p[1]; @@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint) unsigned int xs_unicode_to_upper(unsigned int cpoint) /* returns the cpoint to uppercase */ { + if (cpoint < 0x80) + return toupper(cpoint); + unsigned int *p = _xs_unicode_lower_search(cpoint); return p == NULL ? cpoint : p[0]; @@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint) } +#ifdef _XS_H + +xs_str *xs_utf8_to_upper(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + cpoint = xs_unicode_to_upper(cpoint); + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_lower(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + cpoint = xs_unicode_to_lower(cpoint); + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_nfd(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + unsigned int base; + unsigned int diac; + + if (xs_unicode_nfd(cpoint, &base, &diac)) { + s = xs_utf8_insert(s, base, &offset); + s = xs_utf8_insert(s, diac, &offset); + } + else + s = xs_utf8_insert(s, cpoint, &offset); + } + + return s; +} + + +xs_str *xs_utf8_to_nfc(const char *str) +{ + xs_str *s = xs_str_new(NULL); + unsigned int cpoint; + unsigned int base = 0; + int offset = 0; + + while ((cpoint = xs_utf8_dec(&str))) { + if (xs_is_diacritic(cpoint)) { + if (xs_unicode_nfc(base, cpoint, &base)) + continue; + } + + if (base) + s = xs_utf8_insert(s, base, &offset); + + base = cpoint; + } + + if (base) + s = xs_utf8_insert(s, base, &offset); + + return s; +} + +#endif /* _XS_H */ + #endif /* _XS_UNICODE_TBL_H */ #endif /* XS_IMPLEMENTATION */ |