summaryrefslogtreecommitdiff
path: root/xs_unicode.h
diff options
context:
space:
mode:
authordefault <nobody@localhost>2023-08-03 08:42:38 +0200
committerdefault <nobody@localhost>2023-08-03 08:42:38 +0200
commit2137d2f13310aca3cef6a0fc7735fdf4aac53e8c (patch)
tree69d426ca8e14ff509a9cdd94d443b81bf975e7a0 /xs_unicode.h
parentd455280705700cf75b4b7a59218cf48d4684ae68 (diff)
Backport from xs.
Diffstat (limited to 'xs_unicode.h')
-rw-r--r--xs_unicode.h131
1 files changed, 119 insertions, 12 deletions
diff --git a/xs_unicode.h b/xs_unicode.h
index d45b52e..48cd660 100644
--- a/xs_unicode.h
+++ b/xs_unicode.h
@@ -5,8 +5,15 @@
#define _XS_UNICODE_H
xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
- char *xs_utf8_dec(const char *str, unsigned int *cpoint);
-
+ unsigned int xs_utf8_dec(char **str); /* returns the decoded codepoint and advances *str past it */
+ unsigned int *_xs_unicode_upper_search(unsigned int cpoint); /* case fold table entry keyed by cpoint, or NULL */
+ unsigned int *_xs_unicode_lower_search(unsigned int cpoint); /* pointer to cpoint in the second column of the table, or NULL */
+ #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint)) /* 1 if the search hits, else 0 */
+ #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint)) /* 1 if the search hits, else 0 */
+ unsigned int xs_unicode_to_upper(unsigned int cpoint); /* mapped value, or cpoint itself if it has no entry */
+ unsigned int xs_unicode_to_lower(unsigned int cpoint); /* mapped value, or cpoint itself if it has no entry */
+ int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac); /* 1 and fills base/diac on a hit, else 0 */
+ int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint); /* 1 and fills cpoint on a hit, else 0 */
#ifdef XS_IMPLEMENTATION
@@ -50,46 +57,146 @@ xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
}
-char *xs_utf8_dec(const char *str, unsigned int *cpoint)
-/* decodes an utf-8 char inside str into cpoint and returns the next position */
+unsigned int xs_utf8_dec(char **str)
+/* decodes a utf-8 char inside str and updates the pointer:
+   returns the codepoint that starts at *str and leaves *str pointing
+   just past the bytes that were consumed. If an expected continuation
+   byte is missing, the result is 0xfffd and *str stops at the
+   offending byte (it is not consumed).
+   NOTE(review): a lead byte that matches none of the four patterns
+   below (0x80-0xbf or 0xf8-0xff) is consumed and yields 0, not 0xfffd */
{
- unsigned char *p = (unsigned char *)str;
+ unsigned char *p = (unsigned char *)*str;
+ unsigned int cpoint = 0;
int c = *p++;
int cb = 0;
if ((c & 0x80) == 0) { /* 1 byte char */
- *cpoint = c;
+ cpoint = c;
}
else
if ((c & 0xe0) == 0xc0) { /* 2 byte char */
- *cpoint = (c & 0x1f) << 6;
+ cpoint = (c & 0x1f) << 6;
cb = 1;
}
else
if ((c & 0xf0) == 0xe0) { /* 3 byte char */
- *cpoint = (c & 0x0f) << 12;
+ cpoint = (c & 0x0f) << 12;
cb = 2;
}
else
if ((c & 0xf8) == 0xf0) { /* 4 byte char */
- *cpoint = (c & 0x07) << 18;
+ cpoint = (c & 0x07) << 18;
cb = 3;
}
/* process the continuation bytes */
while (cb--) {
if ((*p & 0xc0) == 0x80)
- *cpoint |= (*p++ & 0x3f) << (cb * 6);
+ cpoint |= (*p++ & 0x3f) << (cb * 6); /* cb is already decremented here, so the last byte is shifted by 0 */
else {
- *cpoint = 0xfffd;
+ cpoint = 0xfffd; /* replacement character for a broken sequence */
break;
}
}
- return (char *)p;
+ *str = (char *)p; /* hand the advanced position back to the caller */
+ return cpoint;
+}
+
+
+#ifdef _XS_UNICODE_TBL_H
+
+/* include xs_unicode_tbl.h before this header to enable these functions */
+
+static int int_cmp(const void *p1, const void *p2) /* bsearch() comparator on the first unsigned int of each record */
+{
+ const unsigned int *a = p1;
+ const unsigned int *b = p2;
+
+ return *a < *b ? -1 : *a > *b ? 1 : 0; /* -1, 1 or 0; explicit comparisons instead of a subtraction */
+}
+
+
+unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
+/* searches for an uppercase codepoint in the case fold table:
+   the table is handed to bsearch() as records of two unsigned ints,
+   compared on the first one only (see int_cmp). Returns a pointer to
+   the matching record, or NULL if there is none.
+   NOTE(review): presumably the table is sorted by that first value,
+   as bsearch() requires -- confirm in xs_unicode_tbl.h */
+{
+ return bsearch(&cpoint, xs_unicode_case_fold_table,
+ sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
+ sizeof(unsigned int) * 2,
+ int_cmp);
+}
+
+
+unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
+/* searches for a lowercase codepoint in the case fold table:
+   scans the second value of every pair linearly. Returns a pointer
+   to that second value on a match (so p[-1] is its partner in the
+   pair), or NULL if cpoint is not found */
+{
+    /* walk pair by pair from the table base: the cursor then stops
+       exactly at the end of the array instead of stepping beyond it,
+       which would be undefined pointer arithmetic */
+    unsigned int *p = xs_unicode_case_fold_table;
+    unsigned int *e = xs_unicode_case_fold_table +
+        sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
+
+    /* only look at complete pairs */
+    while (e - p >= 2) {
+        if (cpoint == p[1])
+            return p + 1;
+
+        p += 2;
+    }
+
+    return NULL;
+}
+
+
+unsigned int xs_unicode_to_upper(unsigned int cpoint)
+/* returns the cpoint to uppercase:
+   looks cpoint up with _xs_unicode_lower_search() and returns the
+   value stored just before the hit (the first value of that pair);
+   if there is no hit, cpoint is returned unchanged */
+{
+ unsigned int *p = _xs_unicode_lower_search(cpoint);
+
+ return p == NULL ? cpoint : p[-1];
+}
+
+
+unsigned int xs_unicode_to_lower(unsigned int cpoint)
+/* returns the cpoint to lowercase:
+   looks cpoint up with _xs_unicode_upper_search() and returns the
+   second value of the matching pair; if there is no match, cpoint
+   is returned unchanged */
+{
+ unsigned int *p = _xs_unicode_upper_search(cpoint);
+
+ return p == NULL ? cpoint : p[1];
}
+int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
+/* applies unicode Normalization Form D:
+   looks cpoint up with bsearch() in xs_unicode_nfd_table, treated as
+   records of three unsigned ints compared on the first one. On a hit
+   the second and third values are stored in *base and *diac and 1 is
+   returned; otherwise 0 is returned and *base / *diac are not written */
+{
+ unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
+ sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
+ sizeof(unsigned int) * 3,
+ int_cmp);
+
+ if (r != NULL) {
+ *base = r[1];
+ *diac = r[2];
+ }
+
+ return !!r; /* 1 if found, 0 if not */
+}
+
+
+int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
+/* applies unicode Normalization Form C:
+   scans xs_unicode_nfd_table linearly, three unsigned ints at a time,
+   for a record whose second and third values equal base and diac.
+   On a hit the record's first value is stored in *cpoint and 1 is
+   returned; otherwise 0 is returned and *cpoint is not written */
+{
+ unsigned int *p = xs_unicode_nfd_table;
+ unsigned int *e = xs_unicode_nfd_table +
+ sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
+
+ while (p < e) {
+ if (p[1] == base && p[2] == diac) {
+ *cpoint = p[0];
+ return 1;
+ }
+
+ p += 3;
+ }
+
+ return 0;
+}
+
+
+#endif /* _XS_UNICODE_TBL_H */
+
#endif /* XS_IMPLEMENTATION */
#endif /* _XS_UNICODE_H */