utf8_to_ucs4: skip invalid UTF-8 sequences instead of aborting

* unicode_charlist is now a std::vector instead of a std::list, so the
  <list> include is dropped and tests can index the result directly.
* utf8_to_ucs4 no longer returns at the first malformed sequence. Bytes that
  are not a valid leading byte are skipped one at a time; sequences with too
  few continuation bytes are skipped as a whole. Only complete sequences are
  appended to result_list, and the return value reports whether any error
  was seen.
* The invalid-input test additionally checks that nothing was appended.

diff --git a/include/utils/string.hpp b/include/utils/string.hpp
index 62c6ee9d..328ef897 100644
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <list>
 #include <sstream>
 
 #include "common.hpp"
@@ -56,7 +55,7 @@ struct unicode_character {
    */
   int length{0};
 };
-using unicode_charlist = std::list<unicode_character>;
+using unicode_charlist = std::vector<unicode_character>;
 
 bool contains(const string& haystack, const string& needle);
 bool contains_ignore_case(const string& haystack, const string& needle);
diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index 808a6dc4..a66e00cb 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -272,39 +272,53 @@ static pair utf8_get_len(uint8_t leading) {
 
 /**
  * @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
+ *
+ * If invalid utf8 characters are encountered they are skipped until the next valid codepoint and the function will
+ * eventually return false.
+ *
+ * The result_list is always populated with all valid utf8 codepoints.
+ *
+ * @return Whether the string is completely valid utf8
  */
 bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
   assert(src);
+  bool has_errors = false;
   const auto* begin = reinterpret_cast<const uint8_t*>(src);
-  const auto* first = begin;
-  while (*first) {
-    // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
-    auto [len, result] = utf8_get_len(*first);
 
-    // Invalid lengths
+  const auto* current = begin;
+  while (*current) {
+    // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
+    auto [len, result] = utf8_get_len(*current);
+    auto offset = current - begin;
+
+    /*
+     * Invalid lengths, this byte is not a valid leading byte.
+     * Skip it.
+     */
     if (len <= 0 || len > 4) {
-      return false;
+      has_errors = true;
+      current++;
+      continue;
     }
 
-    const uint8_t* next = first + 1;
-    for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) {
+    const uint8_t* next = current + 1;
+    for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - current < len); next++) {
       result = result << 6;
       result |= *next & ~UTF8_CONTINUATION_MASK;
     }
 
-    unicode_character uc_char;
-    uc_char.codepoint = result;
-    uc_char.offset = first - begin;
-    uc_char.length = next - first;
-    result_list.push_back(uc_char);
+    auto actual_len = next - current;
+    current = next;
 
-    if (uc_char.length != len) {
-      return false;
+    if (actual_len != len) {
+      has_errors = true;
+      continue;
     }
 
-    first = next;
+    result_list.push_back(unicode_character{result, static_cast<int>(offset), static_cast<int>(actual_len)});
   }
-  return true;
+
+  return !has_errors;
 }
 
 /**
diff --git a/tests/unit_tests/utils/string.cpp b/tests/unit_tests/utils/string.cpp
index adfdfd63..e0f1257e 100644
--- a/tests/unit_tests/utils/string.cpp
+++ b/tests/unit_tests/utils/string.cpp
@@ -193,8 +193,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
 
   ASSERT_EQ(str.size(), result_list.size());
 
-  int i = 0;
-  for (const auto& unicode_char : result_list) {
+  for (size_t i = 0; i < str.size(); i++) {
+    const auto& unicode_char = result_list[i];
     auto c = str[i];
 
     // Matches the single byte character
@@ -203,8 +203,6 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
     EXPECT_EQ(i, unicode_char.offset);
     // Only takes a single byte
     EXPECT_EQ(1, unicode_char.length);
-
-    i++;
   }
 }
 
@@ -266,5 +264,6 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
   const auto str = GetParam();
   bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
   EXPECT_FALSE(success);
+  EXPECT_EQ(0, result_list.size());
 }
 // }}}