summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- xs_json.h 7
-rw-r--r-- xs_unicode.h 116
-rw-r--r-- xs_version.h 2
3 files changed, 120 insertions, 5 deletions
diff --git a/xs_json.h b/xs_json.h
index a4112b0..3a91de9 100644
--- a/xs_json.h
+++ b/xs_json.h
@@ -208,6 +208,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
{
int c;
xs_val *v = NULL;
+ int offset;
*t = JS_ERROR;
@@ -236,6 +237,7 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
*t = JS_STRING;
v = xs_str_new(NULL);
+ offset = 0;
while ((c = fgetc(f)) != '"' && c != EOF && *t != JS_ERROR) {
if (c == '\\') {
@@ -274,11 +276,12 @@ static xs_val *_xs_json_load_lexer(FILE *f, js_type *t)
break;
}
- v = xs_utf8_cat(v, cp);
+ v = xs_utf8_insert(v, cp, &offset);
}
else {
char cc = c;
- v = xs_append_m(v, &cc, 1);
+ v = xs_insert_m(v, offset, &cc, 1);
+ offset++;
}
}
diff --git a/xs_unicode.h b/xs_unicode.h
index 2e9a754..a5a1dcb 100644
--- a/xs_unicode.h
+++ b/xs_unicode.h
@@ -9,6 +9,7 @@
unsigned int xs_utf8_dec(const char **str);
int xs_unicode_width(unsigned int cpoint);
int xs_is_surrogate(unsigned int cpoint);
+ int xs_is_diacritic(unsigned int cpoint);
unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
unsigned int xs_surrogate_enc(unsigned int cpoint);
unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
@@ -22,7 +23,12 @@
int xs_unicode_is_alpha(unsigned int cpoint);
#ifdef _XS_H
+ xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
+ xs_str *xs_utf8_to_upper(const char *str);
+ xs_str *xs_utf8_to_lower(const char *str);
+ xs_str *xs_utf8_to_nfd(const char *str);
+ xs_str *xs_utf8_to_nfc(const char *str);
#endif
#ifdef XS_IMPLEMENTATION
@@ -144,6 +150,12 @@ int xs_unicode_width(unsigned int cpoint)
}
+int xs_is_diacritic(unsigned int cpoint) /* returns nonzero if cpoint is a combining diacritic */
+{
+ return cpoint >= 0x300 && cpoint <= 0x36f; /* inclusive range U+0300..U+036F, the Unicode "Combining Diacritical Marks" block; marks outside this range are not detected */
+}
+
+
/** surrogate pairs **/
int xs_is_surrogate(unsigned int cpoint)
@@ -172,14 +184,27 @@ unsigned int xs_surrogate_enc(unsigned int cpoint)
#ifdef _XS_H
-xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
+xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
/* encodes a Unicode codepoint to utf-8 and hands the bytes to xs_insert_m at position *offset; *offset is then advanced by the number of bytes passed. Returns the (possibly reallocated -- TODO confirm) string */
{
char tmp[4];
int c = xs_utf8_enc(tmp, cpoint);
- return xs_append_m(str, tmp, c);
+ str = xs_insert_m(str, *offset, tmp, c); /* c is the value returned by xs_utf8_enc, used here as the byte count */
+
+ *offset += c; /* caller's offset now points just past the bytes written */
+
+ return str;
+}
+
+
+xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
+/* encodes a Unicode codepoint to utf-8 and adds it at the end of str (thin wrapper over xs_utf8_insert) */
+{
+ int offset = strlen(str); /* end position; NOTE(review): assumes str is non-NULL and has no embedded NUL bytes */
+
+ return xs_utf8_insert(str, cpoint, &offset); /* the updated offset is discarded */
+}
#endif /* _XS_H */
@@ -232,6 +257,9 @@ unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
unsigned int xs_unicode_to_lower(unsigned int cpoint)
/* returns the cpoint to lowercase */
{
+ if (cpoint < 0x80)
+ return tolower(cpoint);
+
unsigned int *p = _xs_unicode_upper_search(cpoint);
return p == NULL ? cpoint : p[1];
@@ -241,6 +269,9 @@ unsigned int xs_unicode_to_lower(unsigned int cpoint)
unsigned int xs_unicode_to_upper(unsigned int cpoint)
/* returns the cpoint to uppercase */
{
+ if (cpoint < 0x80)
+ return toupper(cpoint);
+
unsigned int *p = _xs_unicode_lower_search(cpoint);
return p == NULL ? cpoint : p[0];
@@ -317,6 +348,87 @@ int xs_unicode_is_alpha(unsigned int cpoint)
}
+#ifdef _XS_H
+
+xs_str *xs_utf8_to_upper(const char *str) /* returns a newly created xs_str holding str with every codepoint passed through xs_unicode_to_upper; str itself is not modified */
+{
+ xs_str *s = xs_str_new(NULL); /* output string, built up below */
+ unsigned int cpoint;
+ int offset = 0; /* write position in s, advanced by xs_utf8_insert */
+
+ while ((cpoint = xs_utf8_dec(&str))) { /* xs_utf8_dec moves str forward; the loop stops when it yields 0 */
+ cpoint = xs_unicode_to_upper(cpoint);
+ s = xs_utf8_insert(s, cpoint, &offset);
+ }
+
+ return s; /* presumably owned by the caller -- TODO confirm */
+}
+
+
+xs_str *xs_utf8_to_lower(const char *str) /* returns a newly created xs_str holding str with every codepoint passed through xs_unicode_to_lower; str itself is not modified */
+{
+ xs_str *s = xs_str_new(NULL); /* output string, built up below */
+ unsigned int cpoint;
+ int offset = 0; /* write position in s, advanced by xs_utf8_insert */
+
+ while ((cpoint = xs_utf8_dec(&str))) { /* xs_utf8_dec moves str forward; the loop stops when it yields 0 */
+ cpoint = xs_unicode_to_lower(cpoint);
+ s = xs_utf8_insert(s, cpoint, &offset);
+ }
+
+ return s; /* presumably owned by the caller -- TODO confirm */
+}
+
+
+xs_str *xs_utf8_to_nfd(const char *str) /* returns a newly created xs_str in which each codepoint that xs_unicode_nfd can split is replaced by its base + diacritic pair */
+{
+ xs_str *s = xs_str_new(NULL); /* output string, built up below */
+ unsigned int cpoint;
+ int offset = 0; /* write position in s, advanced by xs_utf8_insert */
+
+ while ((cpoint = xs_utf8_dec(&str))) { /* the loop stops when xs_utf8_dec yields 0 */
+ unsigned int base;
+ unsigned int diac;
+
+ if (xs_unicode_nfd(cpoint, &base, &diac)) { /* nonzero: presumably base/diac were filled in -- confirm against xs_unicode_nfd */
+ s = xs_utf8_insert(s, base, &offset); /* base first ... */
+ s = xs_utf8_insert(s, diac, &offset); /* ... then the diacritic; only one pair is emitted per input codepoint */
+ }
+ else
+ s = xs_utf8_insert(s, cpoint, &offset); /* no decomposition: copy the codepoint unchanged */
+ }
+
+ return s;
+}
+
+
+xs_str *xs_utf8_to_nfc(const char *str) /* returns a newly created xs_str in which a codepoint followed by a diacritic is replaced by the result of xs_unicode_nfc when that call succeeds */
+{
+ xs_str *s = xs_str_new(NULL); /* output string, built up below */
+ unsigned int cpoint;
+ unsigned int base = 0; /* pending codepoint not yet written to s; 0 means none */
+ int offset = 0; /* write position in s, advanced by xs_utf8_insert */
+
+ while ((cpoint = xs_utf8_dec(&str))) { /* the loop stops when xs_utf8_dec yields 0 */
+ if (xs_is_diacritic(cpoint)) {
+ if (xs_unicode_nfc(base, cpoint, &base)) /* nonzero: presumably base now holds the composed codepoint -- confirm; NOTE(review): base is 0 here if the input starts with a diacritic */
+ continue; /* keep the composed value pending so further diacritics can be tried against it */
+ }
+
+ if (base)
+ s = xs_utf8_insert(s, base, &offset); /* flush the previous pending codepoint */
+
+ base = cpoint; /* current codepoint (including a diacritic that failed to compose) becomes the pending one */
+ }
+
+ if (base)
+ s = xs_utf8_insert(s, base, &offset); /* flush whatever is still pending at end of input */
+
+ return s;
+}
+
+#endif /* _XS_H */
+
#endif /* _XS_UNICODE_TBL_H */
#endif /* XS_IMPLEMENTATION */
diff --git a/xs_version.h b/xs_version.h
index 4318c7e..ce88558 100644
--- a/xs_version.h
+++ b/xs_version.h
@@ -1 +1 @@
-/* c6eca9593f9b3d6791cba600e5950f682fdb36cf 2024-08-12T16:08:37+02:00 */
+/* cc9ebd36ae640e4701277327fbba9996143076f6 2024-08-23T17:17:08+02:00 */