Patch summary
-------------
* src/utils/string.cpp: utf8_to_ucs4() no longer returns false for a null
  `src`; a non-null `src` is now a precondition checked with assert(src).
* tests/unit_tests/utils/string.cpp: corrects the doc comment of
  Utf8ToUCS4SingleTest and adds Utf8ToUCS4InvalidTest, which expects
  utf8_to_ucs4() to return false for each malformed input in
  utf8_to_ucs4_invalid_list.

NOTE(review): this copy of the patch appears to have lost everything that was
written between angle brackets (bare `#include` lines, `reinterpret_cast(src)`,
`vector`, `TestWithParam`, `pair`). The added include is presumably <cassert>
and the test parameter type presumably `string` -- confirm against the
repository and restore those tokens before applying. Indentation inside the
hunks was reconstructed and may need whitespace-insensitive application.

NOTE(review): the "2 byte code point with only leading byte" case used "\xa0",
which is a continuation byte (0b10100000) and therefore duplicated the "\x80"
case. It now uses "\xc2", a genuine two-byte lead byte (0b110xxxxx). The
post-image blob id on the tests `index` line predates this change.

diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index c9ed64a6..808a6dc4 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -1,6 +1,7 @@
 #include "utils/string.hpp"
 
 #include
+#include
 #include
 #include
 #include
@@ -273,9 +274,7 @@ static pair utf8_get_len(uint8_t leading) {
  * @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
  */
 bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
-  if (!src) {
-    return false;
-  }
+  assert(src);
   const auto* begin = reinterpret_cast(src);
   const auto* first = begin;
   while (*first) {
diff --git a/tests/unit_tests/utils/string.cpp b/tests/unit_tests/utils/string.cpp
index 465a2fbd..adfdfd63 100644
--- a/tests/unit_tests/utils/string.cpp
+++ b/tests/unit_tests/utils/string.cpp
@@ -225,7 +225,7 @@ const vector utf8_to_ucs4_single_list = {
 INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4SingleTest, testing::ValuesIn(utf8_to_ucs4_single_list));
 
 /**
- * Test that the conversion to ucs4 works correctly with pure ASCII strings.
+ * Test that the conversion to ucs4 works correctly with a single UTF8 character
  */
 TEST_P(Utf8ToUCS4SingleTest, correctness) {
   string_util::unicode_charlist result_list{};
@@ -244,4 +244,27 @@ TEST_P(Utf8ToUCS4SingleTest, correctness) {
   // Must match expected codepoint
   EXPECT_EQ(codepoint, unicode_char.codepoint);
 }
+
+class Utf8ToUCS4InvalidTest : public testing::TestWithParam {};
+
+const vector utf8_to_ucs4_invalid_list = {
+    "\x80",         // continuation byte without leading byte
+    "\xc2",         // 2 byte code point with only leading byte
+    "\xe0",         // 3 byte code point with only leading byte
+    "\xf0",         // 4 byte code point with only leading byte
+    "\xf0\x80\x80", // 4 byte code point with only 3 bytes
+    "\xe0\x70\x80", // 3 byte code point, 2nd byte has no continuation prefix
+};
+
+INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4InvalidTest, testing::ValuesIn(utf8_to_ucs4_invalid_list));
+
+/**
+ * Tests that the conversion correctly returns false for invalid strings.
+ */
+TEST_P(Utf8ToUCS4InvalidTest, correctness) {
+  string_util::unicode_charlist result_list{};
+  const auto str = GetParam();
+  bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
+  EXPECT_FALSE(success);
+}
 // }}}