From 5e5a0a7c4d10c2f629e8b5794360c99447433347 Mon Sep 17 00:00:00 2001
From: patrick96
Date: Wed, 10 May 2023 16:11:52 +0200
Subject: [PATCH] Make unicode_charlist a vector
---
include/utils/string.hpp | 3 +-
src/utils/string.cpp | 49 ++++++++++++++++++++-----------
tests/unit_tests/utils/string.cpp | 7 ++---
3 files changed, 36 insertions(+), 23 deletions(-)
diff --git a/include/utils/string.hpp b/include/utils/string.hpp
index 62c6ee9d..328ef897 100644
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@@ -1,6 +1,5 @@
#pragma once
-#include
#include
#include "common.hpp"
@@ -56,7 +55,7 @@ struct unicode_character {
*/
int length{0};
};
-using unicode_charlist = std::list;
+using unicode_charlist = std::vector;
bool contains(const string& haystack, const string& needle);
bool contains_ignore_case(const string& haystack, const string& needle);
diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index 808a6dc4..a66e00cb 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -272,39 +272,54 @@ static pair utf8_get_len(uint8_t leading) {
/**
* @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
+ *
+ * Bytes that do not form a valid utf8 sequence are skipped and decoding resumes at the next byte that can start one;
+ * the function then returns false after the whole string has been processed.
+ *
+ * Every successfully decoded codepoint is appended to result_list, even when the function returns false.
+ *
+ * @return Whether the string is completely valid utf8
*/
bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
assert(src);
+ bool has_errors = false;
const auto* begin = reinterpret_cast(src);
- const auto* first = begin;
- while (*first) {
- // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
- auto [len, result] = utf8_get_len(*first);
- // Invalid lengths
+ const auto* current = begin;
+ while (*current) {
+ // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
+ auto [len, result] = utf8_get_len(*current);
+ auto offset = current - begin;
+
+ /*
+ * Invalid length: this byte cannot start a utf8 sequence.
+ * Record the error and skip only this byte.
+ */
if (len <= 0 || len > 4) {
- return false;
+ has_errors = true;
+ current++;
+ continue;
}
- const uint8_t* next = first + 1;
- for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) {
+ const uint8_t* next = current + 1;
+ for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - current < len); next++) {
result = result << 6;
result |= *next & ~UTF8_CONTINUATION_MASK;
}
- unicode_character uc_char;
- uc_char.codepoint = result;
- uc_char.offset = first - begin;
- uc_char.length = next - first;
- result_list.push_back(uc_char);
+ auto actual_len = next - current;
+ current = next;
- if (uc_char.length != len) {
- return false;
+ if (actual_len != len) {
+ has_errors = true;
+ continue;
}
- first = next;
+ result_list.push_back(unicode_character{result, static_cast(offset), static_cast(actual_len)});
+ current = next;
}
- return true;
+
+ return !has_errors;
}
/**
diff --git a/tests/unit_tests/utils/string.cpp b/tests/unit_tests/utils/string.cpp
index adfdfd63..e0f1257e 100644
--- a/tests/unit_tests/utils/string.cpp
+++ b/tests/unit_tests/utils/string.cpp
@@ -193,8 +193,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
ASSERT_EQ(str.size(), result_list.size());
- int i = 0;
- for (const auto& unicode_char : result_list) {
+ for (size_t i = 0; i < str.size(); i++) {
+ const auto& unicode_char = result_list[i];
auto c = str[i];
// Matches the single byte character
@@ -203,8 +203,6 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
EXPECT_EQ(i, unicode_char.offset);
// Only takes a single byte
EXPECT_EQ(1, unicode_char.length);
-
- i++;
}
}
@@ -266,5 +264,6 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
const auto str = GetParam();
bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
EXPECT_FALSE(success);
+ EXPECT_EQ(0, result_list.size());
}
// }}}