From 63443f82d5d6b2488f79f6c61130a18f4bc38af5 Mon Sep 17 00:00:00 2001 From: patrick96 Date: Wed, 10 May 2023 14:57:37 +0200 Subject: [PATCH] Refactor utf8_to_ucs4 --- include/cairo/context.hpp | 2 +- include/utils/string.hpp | 4 +- src/utils/string.cpp | 87 +++++++++++++++++++++++-------- tests/unit_tests/utils/string.cpp | 4 +- 4 files changed, 70 insertions(+), 27 deletions(-) diff --git a/include/cairo/context.hpp b/include/cairo/context.hpp index 60f3a232..bdfefa0c 100644 --- a/include/cairo/context.hpp +++ b/include/cairo/context.hpp @@ -165,7 +165,7 @@ namespace cairo { string utf8 = string(t.contents); string_util::unicode_charlist chars; - string_util::utf8_to_ucs4((const unsigned char*)utf8.c_str(), chars); + string_util::utf8_to_ucs4(utf8.c_str(), chars); while (!chars.empty()) { auto remaining = chars.size(); diff --git a/include/utils/string.hpp b/include/utils/string.hpp index 176ee978..32ef01e9 100644 --- a/include/utils/string.hpp +++ b/include/utils/string.hpp @@ -46,7 +46,7 @@ struct unicode_character { /** * The numerical codepoint. 
Between U+0000 and U+10FFFF */ - unsigned long codepoint{0}; + uint32_t codepoint{0}; /** * Byte offset of this character in the original string */ @@ -88,7 +88,7 @@ string utf8_truncate(string&& value, size_t len); /** * @brief Create a UCS-4 codepoint from a utf-8 encoded string */ -bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list); +bool utf8_to_ucs4(const char* src, unicode_charlist& result_list); /** * @brief Convert a UCS-4 codepoint to a utf-8 encoded string diff --git a/src/utils/string.cpp b/src/utils/string.cpp index 681eae4d..74130979 100644 --- a/src/utils/string.cpp +++ b/src/utils/string.cpp @@ -8,6 +8,32 @@ POLYBAR_NS namespace string_util { + +/** + * Prefixes for the leading byte in a UTF8 codepoint + */ +static constexpr uint8_t UTF8_LEADING1_PREFIX = 0b00000000; +static constexpr uint8_t UTF8_LEADING2_PREFIX = 0b11000000; +static constexpr uint8_t UTF8_LEADING3_PREFIX = 0b11100000; +static constexpr uint8_t UTF8_LEADING4_PREFIX = 0b11110000; + +/** + * Masks to extract the prefix from the leading byte in a UTF8 codepoint + */ +static constexpr uint8_t UTF8_LEADING1_MASK = 0b10000000; +static constexpr uint8_t UTF8_LEADING2_MASK = 0b11100000; +static constexpr uint8_t UTF8_LEADING3_MASK = 0b11110000; +static constexpr uint8_t UTF8_LEADING4_MASK = 0b11111000; + +/** + * Prefix for UTF8 continuation bytes + */ +static constexpr uint8_t UTF8_CONTINUATION_PREFIX = 0b10000000; +/** + * Mask to extract the UTF8 continuation byte prefix + */ +static constexpr uint8_t UTF8_CONTINUATION_MASK = 0b11000000; + /** * Check if haystack contains needle */ @@ -224,42 +250,59 @@ string utf8_truncate(string&& value, size_t len) { return forward<string>(value); } +/** + * Given a leading byte of a UTF8 codepoint calculates the number of bytes taken up by the codepoint. + * + * @returns {len, result} The codepoint is len bytes and result contains the codepoint bits held in the leading byte. 
+ */ +static pair<int, uint32_t> utf8_get_len(uint8_t leading) { + if ((leading & UTF8_LEADING1_MASK) == UTF8_LEADING1_PREFIX) { + return {1, leading & ~UTF8_LEADING1_MASK}; + } else if ((leading & UTF8_LEADING2_MASK) == UTF8_LEADING2_PREFIX) { + return {2, leading & ~UTF8_LEADING2_MASK}; + } else if ((leading & UTF8_LEADING3_MASK) == UTF8_LEADING3_PREFIX) { + return {3, leading & ~UTF8_LEADING3_MASK}; + } else if ((leading & UTF8_LEADING4_MASK) == UTF8_LEADING4_PREFIX) { + return {4, leading & ~UTF8_LEADING4_MASK}; + } else { + return {-1, 0}; + } +} + /** * @brief Create a UCS-4 codepoint from a utf-8 encoded string */ -bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) { +bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) { if (!src) { return false; } - const unsigned char* first = src; + const auto* begin = reinterpret_cast<const uint8_t*>(src); + const auto* first = begin; while (*first) { - int len = 0; - unsigned long result = 0; - if ((*first >> 7) == 0) { - len = 1; - result = *first; - } else if ((*first >> 5) == 6) { - len = 2; - result = *first & 31; - } else if ((*first >> 4) == 14) { - len = 3; - result = *first & 15; - } else if ((*first >> 3) == 30) { - len = 4; - result = *first & 7; - } else { + // Number of bytes taken up by this codepoint and the bits contained in the leading byte. 
+ auto [len, result] = utf8_get_len(*first); + + // Invalid lengths + if (len <= 0 || len > 4) { return false; } - const unsigned char* next; - for (next = first + 1; *next && ((*next >> 6) == 2) && (next - first < len); next++) { + + const uint8_t* next = first + 1; + for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) { result = result << 6; - result |= *next & 63; + result |= *next & ~UTF8_CONTINUATION_MASK; } + unicode_character uc_char; uc_char.codepoint = result; - uc_char.offset = first - src; + uc_char.offset = first - begin; uc_char.length = next - first; result_list.push_back(uc_char); + + if (uc_char.length != len) { + return false; + } + first = next; } return true; @@ -268,7 +311,7 @@ bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) { /** * @brief Convert a UCS-4 codepoint to a utf-8 encoded string */ -size_t ucs4_to_utf8(char* utf8, unsigned int ucs) { +size_t ucs4_to_utf8(char* utf8, uint32_t ucs) { if (ucs <= 0x7f) { *utf8 = ucs; return 1; diff --git a/tests/unit_tests/utils/string.cpp b/tests/unit_tests/utils/string.cpp index e51922d8..465a2fbd 100644 --- a/tests/unit_tests/utils/string.cpp +++ b/tests/unit_tests/utils/string.cpp @@ -188,7 +188,7 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) { string_util::unicode_charlist result_list{}; string str = GetParam(); - bool success = string_util::utf8_to_ucs4((const unsigned char*)str.c_str(), result_list); + bool success = string_util::utf8_to_ucs4(str.c_str(), result_list); ASSERT_TRUE(success); ASSERT_EQ(str.size(), result_list.size()); @@ -231,7 +231,7 @@ TEST_P(Utf8ToUCS4SingleTest, correctness) { string_util::unicode_charlist result_list{}; const auto [str, codepoint] = GetParam(); - bool success = string_util::utf8_to_ucs4((const unsigned char*)str.c_str(), result_list); + bool success = string_util::utf8_to_ucs4(str.c_str(), result_list); ASSERT_TRUE(success); ASSERT_EQ(1, result_list.size());