From b3576dd35409a3a3ad742bb74122e14fcda2d886 Mon Sep 17 00:00:00 2001 From: Rokas Kupstys Date: Wed, 23 Sep 2020 14:35:46 +0300 Subject: [PATCH] Replace UTF-8 decoder with branchless version by Christopher Wellons. Decoding performance increase ranges from 30-40%. Changes: * Errors handling near the end of string changed. If input does not contain enough bytes, decoder returns `IM_UNICODE_CODEPOINT_INVALID`, consuming all remaining bytes while old decoder consumed only one byte. Guarantees: * At least one byte is consumed, if input had at least one byte available. * Number of consumed bytes will never seek past end of string. Requirements: * `in_text` is a valid pointer. * String pointed by `in_text` must be zero-terminated, or `in_text_end` is not NULL. --- docs/CHANGELOG.txt | 3 ++ imgui.cpp | 107 +++++++++++++++++++++------------------------ 2 files changed, 53 insertions(+), 57 deletions(-) diff --git a/docs/CHANGELOG.txt b/docs/CHANGELOG.txt index 7c3f32c8..33e84373 100644 --- a/docs/CHANGELOG.txt +++ b/docs/CHANGELOG.txt @@ -59,6 +59,9 @@ Other Changes: - Tab Bar: Do not display a tooltip if the name already fits over a given tab. (#3521) - Drag and Drop: Fix drag and drop to tie same-size drop targets by choosen the later one. Fixes dragging into a full-window-sized dockspace inside a zero-padded window. (#3519, #2717) [@Black-Cat] +- Misc: Replaced UTF-8 decoder by branchless one by Christopher Wellons (30~40% faster). [@rokups] + Super minor fix handling incomplete UTF-8 contents: if input does not contain enough bytes, decoder + returns IM_UNICODE_CODEPOINT_INVALID and consume remaining bytes (vs old decoded consumed only 1 byte). - Backends: OpenGL3: Use glGetString(GL_VERSION) query instead of glGetIntegerv(GL_MAJOR_VERSION, ...) when the later returns zero (e.g. Desktop GL 2.x). (#3530) [@xndcn] - Backends: OpenGL3: Backup and restore GL_PRIMITIVE_RESTART state. (#3544) [@Xipiryon] diff --git a/imgui.cpp b/imgui.cpp index 0bb82d23..d8b7d5ed 100644 --- a/imgui.cpp +++ b/imgui.cpp @@ -1553,66 +1553,59 @@ void* ImFileLoadToMemory(const char* filename, const char* mode, size_t* out_f //----------------------------------------------------------------------------- // Convert UTF-8 to 32-bit character, process single character input. -// Based on stb_from_utf8() from github.com/nothings/stb/ +// Based on work of Christopher Wellons (https://github.com/skeeto/branchless-utf8) // We handle UTF-8 decoding error by skipping forward. int ImTextCharFromUtf8(unsigned int* out_char, const char* in_text, const char* in_text_end) { - unsigned int c = (unsigned int)-1; - const unsigned char* str = (const unsigned char*)in_text; - if (!(*str & 0x80)) - { - c = (unsigned int)(*str++); - *out_char = c; - return 1; - } - if ((*str & 0xe0) == 0xc0) - { - *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string - if (in_text_end && in_text_end - (const char*)str < 2) return 1; - if (*str < 0xc2) return 2; - c = (unsigned int)((*str++ & 0x1f) << 6); - if ((*str & 0xc0) != 0x80) return 2; - c += (*str++ & 0x3f); - *out_char = c; - return 2; - } - if ((*str & 0xf0) == 0xe0) - { - *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string - if (in_text_end && in_text_end - (const char*)str < 3) return 1; - if (*str == 0xe0 && (str[1] < 0xa0 || str[1] > 0xbf)) return 3; - if (*str == 0xed && str[1] > 0x9f) return 3; // str[1] < 0x80 is checked below - c = (unsigned int)((*str++ & 0x0f) << 12); - if ((*str & 0xc0) != 0x80) return 3; - c += (unsigned int)((*str++ & 0x3f) << 6); - if ((*str & 0xc0) != 0x80) return 3; - c += (*str++ & 0x3f); - *out_char = c; - return 3; - } - if ((*str & 0xf8) == 0xf0) - { - *out_char = IM_UNICODE_CODEPOINT_INVALID; // will be invalid but not end of string - if (in_text_end && in_text_end - (const char*)str < 4) return 1; - if (*str > 0xf4) return 4; - if (*str == 0xf0 && (str[1] < 0x90 || str[1] > 0xbf)) return 4; - if (*str == 0xf4 && str[1] > 0x8f) return 4; // str[1] < 0x80 is checked below - c = (unsigned int)((*str++ & 0x07) << 18); - if ((*str & 0xc0) != 0x80) return 4; - c += (unsigned int)((*str++ & 0x3f) << 12); - if ((*str & 0xc0) != 0x80) return 4; - c += (unsigned int)((*str++ & 0x3f) << 6); - if ((*str & 0xc0) != 0x80) return 4; - c += (*str++ & 0x3f); - // utf-8 encodings of values used in surrogate pairs are invalid - if ((c & 0xFFFFF800) == 0xD800) return 4; - // If codepoint does not fit in ImWchar, use replacement character U+FFFD instead - if (c > IM_UNICODE_CODEPOINT_MAX) c = IM_UNICODE_CODEPOINT_INVALID; - *out_char = c; - return 4; - } - *out_char = 0; - return 0; + static const char lengths[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 }; + static const int masks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 }; + static const uint32_t mins[] = { 4194304, 0, 128, 2048, 65536 }; + static const int shiftc[] = { 0, 18, 12, 6, 0 }; + static const int shifte[] = { 0, 6, 4, 2, 0 }; + unsigned char s[4]; + int len = lengths[*(const unsigned char*)in_text >> 3]; + + if (in_text_end == NULL) + in_text_end = in_text + len + !len; // Max length, nulls will be taken into account. + + // Copy at most 'len' bytes, stop copying at 0 or past in_text_end. + s[0] = in_text + 0 < in_text_end ? in_text[0] : 0; + s[1] = s[0] && in_text + 1 < in_text_end ? in_text[1] : 0; + s[2] = s[1] && in_text + 2 < in_text_end ? in_text[2] : 0; + s[3] = s[2] && in_text + 3 < in_text_end ? in_text[3] : 0; + + // Compute the pointer to the next character early so that the next + // iteration can start working on the next character. Neither Clang + // nor GCC figure out this reordering on their own. + //const unsigned char *next = s + len + !len; + + // No bytes are consumed when *in_text == 0 || in_text == in_text_end. + // One byte is consumed in case of invalid first byte of in_text. + // All available bytes (at most `len` bytes) are consumed on incomplete/invalid second to last bytes. + int consumed = !!s[0] + !!s[1] + !!s[2] + !!s[3]; + + // Assume a four-byte character and load four bytes. Unused bits are shifted out. + *out_char = (uint32_t)(s[0] & masks[len]) << 18; + *out_char |= (uint32_t)(s[1] & 0x3f) << 12; + *out_char |= (uint32_t)(s[2] & 0x3f) << 6; + *out_char |= (uint32_t)(s[3] & 0x3f) << 0; + *out_char >>= shiftc[len]; + + // Accumulate the various error conditions. + int e = 0; + e = (*out_char < mins[len]) << 6; // non-canonical encoding + e |= ((*out_char >> 11) == 0x1b) << 7; // surrogate half? + e |= (*out_char > IM_UNICODE_CODEPOINT_MAX) << 8; // out of range? + e |= (s[1] & 0xc0) >> 2; + e |= (s[2] & 0xc0) >> 4; + e |= (s[3] ) >> 6; + e ^= 0x2a; // top two bits of each tail byte correct? + e >>= shifte[len]; + + if (e) + *out_char = IM_UNICODE_CODEPOINT_INVALID; + + return consumed; } int ImTextStrFromUtf8(ImWchar* buf, int buf_size, const char* in_text, const char* in_text_end, const char** in_text_remaining)