Parcourir la source

stdinc: Drastically improve SDL_StepUTF8() and make it a public API.

Fixes #10105.
Ryan C. Gordon il y a 9 mois
Parent
commit
a9cfcf6bde

+ 61 - 0
include/SDL3/SDL_stdinc.h

@@ -1250,6 +1250,67 @@ extern SDL_DECLSPEC int SDLCALL SDL_strcasecmp(const char *str1, const char *str
  */
 extern SDL_DECLSPEC int SDLCALL SDL_strncasecmp(const char *str1, const char *str2, size_t maxlen);
 
+/**
+ * The Unicode REPLACEMENT CHARACTER codepoint.
+ *
+ * SDL_StepUTF8() reports this codepoint when it encounters a UTF-8 string
+ * with encoding errors.
+ *
+ * This tends to render as something like a question mark in most places.
+ *
+ * \since This macro is available since SDL 3.0.0.
+ *
+ * \sa SDL_StepUTF8
+ */
+#define SDL_INVALID_UNICODE_CODEPOINT 0xFFFD
+
+/**
+ * Decode a UTF-8 string, one Unicode codepoint at a time.
+ *
+ * This will return the first Unicode codepoint in the UTF-8 encoded
+ * string in `*pstr`, and then advance `*pstr` past any consumed bytes
+ * before returning.
+ *
+ * It will not access more than `*pslen` bytes from the string.
+ * `*pslen` will be adjusted, as well, subtracting the number of
+ * bytes consumed.
+ *
+ * `pslen` is allowed to be NULL, in which case the string _must_ be
+ * null-terminated, as the function will blindly read until it sees
+ * the null terminator.
+ *
+ * If `*pslen` is zero, it assumes the end of string is reached and
+ * returns a zero codepoint regardless of the contents of the string
+ * buffer.
+ *
+ * If the resulting codepoint is zero (a NULL terminator), or `*pslen`
+ * is zero, it will not advance `*pstr` or `*pslen` at all.
+ *
+ * Generally this function is called in a loop until it returns zero,
+ * adjusting its parameters each iteration.
+ *
+ * If an invalid UTF-8 sequence is encountered, this function returns
+ * SDL_INVALID_UNICODE_CODEPOINT and advances the string/length by one
+ * byte (which is to say, a multibyte sequence might produce several
+ * SDL_INVALID_UNICODE_CODEPOINT returns before it syncs to the next
+ * valid UTF-8 sequence).
+ *
+ * Several things can generate invalid UTF-8 sequences, including
+ * overlong encodings, the use of UTF-16 surrogate values, and
+ * truncated data. Please refer to
+ * [RFC3629](https://www.ietf.org/rfc/rfc3629.txt) for details.
+ *
+ * \param pstr a pointer to a UTF-8 string pointer to be read and adjusted.
+ * \param pslen a pointer to the number of bytes in the string, to be read
+ *              and adjusted. NULL is allowed.
+ * \returns the first Unicode codepoint in the string, zero at the end of
+ *          the string, or SDL_INVALID_UNICODE_CODEPOINT on invalid UTF-8.
+ *
+ * \threadsafety It is safe to call this function from any thread.
+ *
+ * \since This function is available since SDL 3.0.0.
+ */
+extern SDL_DECLSPEC Uint32 SDLCALL SDL_StepUTF8(const char **pstr, size_t *pslen);
+
 extern SDL_DECLSPEC int SDLCALL SDL_sscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, ...) SDL_SCANF_VARARG_FUNC(2);
 extern SDL_DECLSPEC int SDLCALL SDL_vsscanf(const char *text, SDL_SCANF_FORMAT_STRING const char *fmt, va_list ap) SDL_SCANF_VARARG_FUNCV(2);
 extern SDL_DECLSPEC int SDLCALL SDL_snprintf(SDL_OUT_Z_CAP(maxlen) char *text, size_t maxlen, SDL_PRINTF_FORMAT_STRING const char *fmt, ... ) SDL_PRINTF_VARARG_FUNC(3);

+ 1 - 0
src/dynapi/SDL_dynapi.sym

@@ -788,6 +788,7 @@ SDL3_0.0.0 {
     SDL_SignalCondition;
     SDL_SoftStretch;
     SDL_StartTextInput;
+    SDL_StepUTF8;
     SDL_StopHapticEffect;
     SDL_StopHapticEffects;
     SDL_StopHapticRumble;

+ 1 - 0
src/dynapi/SDL_dynapi_overrides.h

@@ -813,6 +813,7 @@
 #define SDL_SignalCondition SDL_SignalCondition_REAL
 #define SDL_SoftStretch SDL_SoftStretch_REAL
 #define SDL_StartTextInput SDL_StartTextInput_REAL
+#define SDL_StepUTF8 SDL_StepUTF8_REAL
 #define SDL_StopHapticEffect SDL_StopHapticEffect_REAL
 #define SDL_StopHapticEffects SDL_StopHapticEffects_REAL
 #define SDL_StopHapticRumble SDL_StopHapticRumble_REAL

+ 1 - 0
src/dynapi/SDL_dynapi_procs.h

@@ -823,6 +823,7 @@ SDL_DYNAPI_PROC(int,SDL_ShowWindowSystemMenu,(SDL_Window *a, int b, int c),(a,b,
 SDL_DYNAPI_PROC(int,SDL_SignalCondition,(SDL_Condition *a),(a),return)
 SDL_DYNAPI_PROC(int,SDL_SoftStretch,(SDL_Surface *a, const SDL_Rect *b, SDL_Surface *c, const SDL_Rect *d, SDL_ScaleMode e),(a,b,c,d,e),return)
 SDL_DYNAPI_PROC(int,SDL_StartTextInput,(SDL_Window *a),(a),return)
+SDL_DYNAPI_PROC(Uint32,SDL_StepUTF8,(const char **a, size_t *b),(a,b),return)
 SDL_DYNAPI_PROC(int,SDL_StopHapticEffect,(SDL_Haptic *a, int b),(a,b),return)
 SDL_DYNAPI_PROC(int,SDL_StopHapticEffects,(SDL_Haptic *a),(a),return)
 SDL_DYNAPI_PROC(int,SDL_StopHapticRumble,(SDL_Haptic *a),(a),return)

+ 1 - 1
src/filesystem/SDL_filesystem.c

@@ -185,7 +185,7 @@ static char *CaseFoldUtf8String(const char *fname)
     Uint32 codepoint;
     char *ptr = retval;
     size_t remaining = allocation;
-    while ((codepoint = SDL_StepUTF8(&fname, 4)) != 0) {
+    while ((codepoint = SDL_StepUTF8(&fname, NULL)) != 0) {
         Uint32 folded[3];
         const int num_folded = SDL_CaseFoldUnicode(codepoint, folded);
         SDL_assert(num_folded > 0);

+ 72 - 37
src/stdlib/SDL_string.c

@@ -32,9 +32,6 @@
 
 #include "SDL_casefolding.h"
 
-// this is the Unicode REPLACEMENT CHARACTER, used for invalid codepoint values.
-#define INVALID_UNICODE_CODEPOINT 0xFFFD
-
 #if defined(__SIZEOF_WCHAR_T__)
 #define SDL_SIZEOF_WCHAR_T __SIZEOF_WCHAR_T__
 #elif defined(SDL_PLATFORM_WINDOWS)
@@ -129,7 +126,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
             cp1 = folded1[tail1++]; \
         } else { \
             const Uint##bits *str1start = (const Uint##bits *) str1; \
-            head1 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str1, slen1), folded1); \
+            head1 = SDL_CaseFoldUnicode(StepUTF##bits(&str1, slen1), folded1); \
             update_slen1; \
             cp1 = folded1[0]; \
             tail1 = 1; \
@@ -138,7 +135,7 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
             cp2 = folded2[tail2++]; \
         } else { \
             const Uint##bits *str2start = (const Uint##bits *) str2; \
-            head2 = SDL_CaseFoldUnicode(SDL_StepUTF##bits(&str2, slen2), folded2); \
+            head2 = SDL_CaseFoldUnicode(StepUTF##bits(&str2, slen2), folded2); \
             update_slen2; \
             cp2 = folded2[0]; \
             tail2 = 1; \
@@ -154,12 +151,23 @@ int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to)
     return 0
 
 
-Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
+static Uint32 StepUTF8(const char **_str, const size_t slen)
 {
-    const char *str = *_str;
-    const Uint32 octet = (Uint32) (slen ? ((Uint8) *str) : 0);
-
-    // !!! FIXME: this could have _way_ more error checking! Illegal surrogate codepoints, unexpected bit patterns, etc.
+    /*
+     * From rfc3629, the UTF-8 spec:
+     *  https://www.ietf.org/rfc/rfc3629.txt
+     *
+     *   Char. number range  |        UTF-8 octet sequence
+     *      (hexadecimal)    |              (binary)
+     *   --------------------+---------------------------------------------
+     *   0000 0000-0000 007F | 0xxxxxxx
+     *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+     *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+     *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+     */
+
+    const Uint8 *str = (const Uint8 *) *_str;
+    const Uint32 octet = (Uint32) (slen ? *str : 0);
 
     if (octet == 0) {  // null terminator, end of string.
         return 0;  // don't advance `*_str`.
@@ -167,41 +175,73 @@ Uint32 SDL_StepUTF8(const char **_str, const size_t slen)
         (*_str)++;
         return octet;
     } else if (((octet & 0xE0) == 0xC0) && (slen >= 2)) {  // 110xxxxx 10xxxxxx: two byte codepoint.
-        if (slen >= 2) {
-            *_str += 2;
-            return ((octet & 0x1F) << 6) | (((Uint8) str[1]) & 0x3F);
+        const Uint8 str1 = str[1];  // safe: str[0] is non-zero, so str[1] is at worst the terminator.
+        if ((str1 & 0xC0) == 0x80) {  // If trailing bytes aren't 10xxxxxx, sequence is bogus.
+            const Uint32 retval = ((octet & 0x1F) << 6) | (str1 & 0x3F);
+            if (retval >= 0x0080) {  // rfc3629 says you can't use overlong sequences for smaller values.
+                *_str += 2;
+                return retval;
+            }
         }
     } else if (((octet & 0xF0) == 0xE0) && (slen >= 3)) {  // 1110xxxx 10xxxxxx 10xxxxxx: three byte codepoint.
-        *_str += 3;
-        const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 6;
-        const Uint32 octet3 = (Uint32) (((Uint8) str[2]) & 0x3F);
-        return ((octet & 0x0F) << 12) | octet2 | octet3;
+        const Uint8 str1 = str[1];
+        const Uint8 str2 = ((str1 & 0xC0) == 0x80) ? str[2] : 0;  // only read on if str1 is a continuation byte, so we never step past a null terminator.
+        if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80)) {  // If trailing bytes aren't 10xxxxxx, sequence is bogus.
+            const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 6;
+            const Uint32 octet3 = ((Uint32) (str2 & 0x3F));
+            const Uint32 retval = ((octet & 0x0F) << 12) | octet2 | octet3;
+            if (retval >= 0x800) {  // rfc3629 says you can't use overlong sequences for smaller values.
+                if ((retval < 0xD800) || (retval > 0xDFFF)) {  // UTF-16 surrogate values are illegal in UTF-8.
+                    *_str += 3;
+                    return retval;
+                }
+            }
+        }
     } else if (((octet & 0xF8) == 0xF0) && (slen >= 4)) {  // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx: four byte codepoint.
-        *_str += 4;
-        const Uint32 octet2 = ((Uint32) (((Uint8) str[1]) & 0x1F)) << 12;
-        const Uint32 octet3 = ((Uint32) (((Uint8) str[2]) & 0x3F)) << 6;
-        const Uint32 octet4 = (Uint32) (((Uint8) str[3]) & 0x3F);
-        return ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
+        const Uint8 str1 = str[1];
+        const Uint8 str2 = ((str1 & 0xC0) == 0x80) ? str[2] : 0;  // only read on if str1 is a continuation byte, so we never step past a null terminator.
+        const Uint8 str3 = ((str2 & 0xC0) == 0x80) ? str[3] : 0;  // likewise, str[3] is only touched once str[2] is known to be non-zero.
+        if (((str1 & 0xC0) == 0x80) && ((str2 & 0xC0) == 0x80) && ((str3 & 0xC0) == 0x80)) {  // If trailing bytes aren't 10xxxxxx, sequence is bogus.
+            const Uint32 octet2 = ((Uint32) (str1 & 0x3F)) << 12;  // continuation bytes carry six payload bits; masking with 0x1F would drop bit 17.
+            const Uint32 octet3 = ((Uint32) (str2 & 0x3F)) << 6;
+            const Uint32 octet4 = ((Uint32) (str3 & 0x3F));
+            const Uint32 retval = ((octet & 0x07) << 18) | octet2 | octet3 | octet4;
+            if ((retval >= 0x10000) && (retval <= 0x10FFFF)) {  // rfc3629 forbids overlong sequences and anything past U+10FFFF.
+                *_str += 4;
+                return retval;
+            }
+        }
     }
 
     // bogus byte, skip ahead, return a REPLACEMENT CHARACTER.
     (*_str)++;
-    return INVALID_UNICODE_CODEPOINT;
+    return SDL_INVALID_UNICODE_CODEPOINT;
+}
+
+Uint32 SDL_StepUTF8(const char **pstr, size_t *pslen)  // public entry point: decodes one codepoint via StepUTF8() and keeps the caller's byte count in sync.
+{
+    if (!pslen) {  // no length supplied: the string is expected to be null-terminated.
+        return StepUTF8(pstr, 4);  // 4 == max codepoint size, so the length never limits a single decode.
+    }
+    const char *origstr = *pstr;  // remember the start so the consumed byte count can be measured.
+    const Uint32 retval = StepUTF8(pstr, *pslen);
+    *pslen -= (size_t) (*pstr - origstr);  // subtract however far StepUTF8() advanced (nothing when it returned at end of string).
+    return retval;
 }
 
 #if (SDL_SIZEOF_WCHAR_T == 2)
-static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
+static Uint32 StepUTF16(const Uint16 **_str, const size_t slen)
 {
     const Uint16 *str = *_str;
     Uint32 cp = (Uint32) *(str++);
     if (cp == 0) {
         return 0;  // don't advance string pointer.
     } else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) {
-        cp = INVALID_UNICODE_CODEPOINT;  // Orphaned second half of surrogate pair
+        cp = SDL_INVALID_UNICODE_CODEPOINT;  // Orphaned second half of surrogate pair
     } else if ((cp >= 0xD800) && (cp <= 0xDBFF)) {  // start of surrogate pair!
         const Uint32 pair = (Uint32) *str;
         if ((pair == 0) || ((pair < 0xDC00) || (pair > 0xDFFF))) {
-            cp = INVALID_UNICODE_CODEPOINT;
+            cp = SDL_INVALID_UNICODE_CODEPOINT;
         } else {
             str++;  // eat the other surrogate.
             cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
@@ -209,10 +249,10 @@ static Uint32 SDL_StepUTF16(const Uint16 **_str, const size_t slen)
     }
 
     *_str = str;
-    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
+    return (cp > 0x10FFFF) ? SDL_INVALID_UNICODE_CODEPOINT : cp;
 }
 #elif (SDL_SIZEOF_WCHAR_T == 4)
-static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
+static Uint32 StepUTF32(const Uint32 **_str, const size_t slen)
 {
     if (!slen) {
         return 0;
@@ -225,7 +265,7 @@ static Uint32 SDL_StepUTF32(const Uint32 **_str, const size_t slen)
     }
 
     (*_str)++;
-    return (cp > 0x10FFFF) ? INVALID_UNICODE_CODEPOINT : cp;
+    return (cp > 0x10FFFF) ? SDL_INVALID_UNICODE_CODEPOINT : cp;
 }
 #endif
 
@@ -816,7 +856,7 @@ size_t SDL_utf8strlcpy(SDL_OUT_Z_CAP(dst_bytes) char *dst, const char *src, size
 size_t SDL_utf8strlen(const char *str)
 {
     size_t retval = 0;
-    while (SDL_StepUTF8(&str, 4)) {
+    while (SDL_StepUTF8(&str, NULL)) {
         retval++;
     }
     return retval;
@@ -825,14 +865,9 @@ size_t SDL_utf8strlen(const char *str)
 size_t SDL_utf8strnlen(const char *str, size_t bytes)
 {
     size_t retval = 0;
-    const char *strstart = str;
-
-    while (SDL_StepUTF8(&str, bytes)) {
-        bytes -= (size_t) (str - strstart);
-        strstart = str;
+    while (SDL_StepUTF8(&str, &bytes)) {  // SDL_StepUTF8() advances `str` and shrinks `bytes` itself; each non-zero result counts as one codepoint.
         retval++;
     }
-
     return retval;
 }
 
@@ -983,7 +1018,7 @@ char *SDL_strcasestr(const char *haystack, const char *needle)
         if (SDL_strncasecmp(haystack, needle, length) == 0) {
             return (char *)haystack;
         }
-    } while (SDL_StepUTF8(&haystack, 4));  // move ahead by a full codepoint at a time, regardless of bytes.
+    } while (SDL_StepUTF8(&haystack, NULL));  // move ahead by a full codepoint at a time, regardless of bytes.
 
     return NULL;
 }

+ 0 - 2
src/stdlib/SDL_sysstdlib.h

@@ -25,8 +25,6 @@
 // most things you might need internally in here are public APIs, this is
 // just a few special pieces right now.
 
-Uint32 SDL_StepUTF8(const char **_str, const size_t slen);
-
 // this expects `from` to be a Unicode codepoint, and `to` to point to AT LEAST THREE Uint32s.
 int SDL_CaseFoldUnicode(const Uint32 from, Uint32 *to);
 

+ 75 - 19
test/testiconv.c

@@ -10,13 +10,6 @@
   freely.
 */
 
-/* quiet windows compiler warnings */
-#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-
-#include <stdio.h>
-
 #include <SDL3/SDL.h>
 #include <SDL3/SDL_main.h>
 #include <SDL3/SDL_test.h>
@@ -33,6 +26,34 @@ widelen(char *data)
     return len;
 }
 
+static char *get_next_line(Uint8 **fdataptr, size_t *fdatalen)  // splits the buffer in place: returns the next line and advances the cursor, or NULL once no bytes remain.
+{
+    char *retval = (char *) *fdataptr;  // the returned line starts at the current read position.
+    Uint8 *ptr = *fdataptr;
+    size_t len = *fdatalen;
+
+    if (len == 0) {  // nothing left in the buffer: signal end of input.
+        return NULL;
+    }
+
+    while (len > 0) {
+        if (*ptr == '\r') {  // every CR is overwritten with a terminator, so the returned string stops at the first CR on the line.
+            *ptr = '\0';
+        } else if (*ptr == '\n') {  // LF ends the line: terminate it and step past the newline before stopping.
+            *ptr = '\0';
+            ptr++;
+            len--;
+            break;
+        }
+        ptr++;
+        len--;
+    }
+
+    *fdataptr = ptr;  // hand the advanced position and the remaining byte count back to the caller.
+    *fdatalen = len;
+    return retval;  // NOTE(review): a final line with no '\n' is only terminated if the buffer itself ends in a zero byte -- confirm the loader guarantees that.
+}
+
 int main(int argc, char *argv[])
 {
     const char *formats[] = {
@@ -51,13 +72,15 @@ int main(int argc, char *argv[])
     };
 
     char *fname = NULL;
-    char buffer[BUFSIZ];
     char *ucs4;
     char *test[2];
     int i;
-    FILE *file;
     int errors = 0;
     SDLTest_CommonState *state;
+    Uint8 *fdata = NULL;
+    Uint8 *fdataptr = NULL;
+    char *line = NULL;
+    size_t fdatalen = 0;
 
     /* Initialize test framework */
     state = SDLTest_CommonCreateState(argv, 0);
@@ -89,20 +112,19 @@ int main(int argc, char *argv[])
     }
 
     fname = GetResourceFilename(fname, "utf8.txt");
-    file = fopen(fname, "rb");
-    if (!file) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to open %s\n", fname);
+    fdata = (Uint8 *) (fname ? SDL_LoadFile(fname, &fdatalen) : NULL);
+    if (!fdata) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname);
         return 1;
     }
-    SDL_free(fname);
 
-    while (fgets(buffer, sizeof(buffer), file)) {
+    fdataptr = fdata;
+    while ((line = get_next_line(&fdataptr, &fdatalen)) != NULL) {
         /* Convert to UCS-4 */
         size_t len;
-        ucs4 =
-            SDL_iconv_string("UCS-4", "UTF-8", buffer,
-                             SDL_strlen(buffer) + 1);
+        ucs4 = SDL_iconv_string("UCS-4", "UTF-8", line, SDL_strlen(line) + 1);
         len = (widelen(ucs4) + 1) * 4;
+
         for (i = 0; i < SDL_arraysize(formats); ++i) {
             test[0] = SDL_iconv_string(formats[i], "UCS-4", ucs4, len);
             test[1] = SDL_iconv_string("UCS-4", formats[i], test[0], len);
@@ -115,10 +137,44 @@ int main(int argc, char *argv[])
         }
         test[0] = SDL_iconv_string("UTF-8", "UCS-4", ucs4, len);
         SDL_free(ucs4);
-        (void)fputs(test[0], stdout);
+        SDL_Log("%s", test[0]);
         SDL_free(test[0]);
     }
-    (void)fclose(file);
+    SDL_free(fdata);
+
+    #if 0
+    {
+        Uint32 *ucs4buf;
+        Uint32 *ucs4ptr;
+        char *utf8out;
+        Uint32 cp;
+        SDL_IOStream *io;
+
+        fdata = (Uint8 *) (fname ? SDL_LoadFile(fname, &fdatalen) : NULL);
+        if (!fdata) {
+            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Unable to load %s\n", fname);
+            return 1;
+        }
+
+        ucs4buf = (Uint32 *) SDL_malloc(fdatalen * 4);
+        ucs4ptr = ucs4buf;
+
+        fdataptr = fdata;
+        while ((cp = SDL_StepUTF8((const char **) &fdataptr, &fdatalen)) != 0) {
+            *(ucs4ptr++) = SDL_Swap32BE(cp);
+        }
+        *(ucs4ptr++) = 0;
+        utf8out = SDL_iconv_string("UTF-8", "UCS-4", (const char *) ucs4buf, (size_t) ((ucs4ptr - ucs4buf)) * 4);
+        io = SDL_IOFromFile("test_steputf8.txt", "wb");
+        SDL_WriteIO(io, utf8out, SDL_strlen(utf8out));
+        SDL_CloseIO(io);
+        SDL_free(ucs4buf);
+        SDL_free(utf8out);
+        SDL_free(fdata);
+    }
+    #endif
+
+    SDL_free(fname);
 
     SDL_LogInfo(SDL_LOG_CATEGORY_APPLICATION, "Total errors: %d\n", errors);
     SDL_Quit();