From c86519f077cc48243fb5348ce33db9517aa94eef Mon Sep 17 00:00:00 2001
From: patrick96
Date: Wed, 10 May 2023 15:35:14 +0200
Subject: [PATCH] test: utf8_to_ucs4 with invalid strings
---
src/utils/string.cpp | 5 ++---
tests/unit_tests/utils/string.cpp | 25 ++++++++++++++++++++++++-
2 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index c9ed64a6..808a6dc4 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -1,6 +1,7 @@
#include "utils/string.hpp"
#include
+#include
#include
#include
#include
@@ -273,9 +274,7 @@ static pair utf8_get_len(uint8_t leading) {
* @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
*/
bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
- if (!src) {
- return false;
- }
+ assert(src);
const auto* begin = reinterpret_cast(src);
const auto* first = begin;
while (*first) {
diff --git a/tests/unit_tests/utils/string.cpp b/tests/unit_tests/utils/string.cpp
index 465a2fbd..adfdfd63 100644
--- a/tests/unit_tests/utils/string.cpp
+++ b/tests/unit_tests/utils/string.cpp
@@ -225,7 +225,7 @@ const vector utf8_to_ucs4_single_list = {
INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4SingleTest, testing::ValuesIn(utf8_to_ucs4_single_list));
/**
- * Test that the conversion to ucs4 works correctly with pure ASCII strings.
+ * Test that the conversion to ucs4 works correctly with a single UTF-8 character.
*/
TEST_P(Utf8ToUCS4SingleTest, correctness) {
string_util::unicode_charlist result_list{};
@@ -244,4 +244,27 @@ TEST_P(Utf8ToUCS4SingleTest, correctness) {
// Must match expected codepoint
EXPECT_EQ(codepoint, unicode_char.codepoint);
}
+
+class Utf8ToUCS4InvalidTest : public testing::TestWithParam<string> {}; // param: one malformed UTF-8 byte string
+
+const vector<string> utf8_to_ucs4_invalid_list = {
+    "\x80",         // continuation byte (10xxxxxx) without leading byte
+    "\xc2",         // 2 byte code point (110xxxxx) with only leading byte
+    "\xe0",         // 3 byte code point with only leading byte
+    "\xf0",         // 4 byte code point with only leading byte
+    "\xf0\x80\x80", // 4 byte code point with only 3 bytes
+    "\xe0\x70\x80", // 3 byte code point, 2nd byte has no continuation prefix
+};
+
+INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4InvalidTest, testing::ValuesIn(utf8_to_ucs4_invalid_list));
+
+/**
+ * Each malformed UTF-8 input must make utf8_to_ucs4 report failure (return false).
+ */
+TEST_P(Utf8ToUCS4InvalidTest, correctness) {
+  const auto& input = GetParam(); // bind by reference; no need to copy the parameter
+  string_util::unicode_charlist codepoints{};
+  const bool converted = string_util::utf8_to_ucs4(input.c_str(), codepoints);
+  EXPECT_FALSE(converted);
+}
// }}}