Make unicode_charlist a vector

This commit is contained in:
patrick96 2023-05-10 16:11:52 +02:00 committed by Patrick Ziegler
parent c86519f077
commit 5e5a0a7c4d
3 changed files with 36 additions and 23 deletions

View File

@ -1,6 +1,5 @@
#pragma once #pragma once
#include <list>
#include <sstream> #include <sstream>
#include "common.hpp" #include "common.hpp"
@ -56,7 +55,7 @@ struct unicode_character {
*/ */
int length{0}; int length{0};
}; };
using unicode_charlist = std::list<unicode_character>; using unicode_charlist = std::vector<unicode_character>;
bool contains(const string& haystack, const string& needle); bool contains(const string& haystack, const string& needle);
bool contains_ignore_case(const string& haystack, const string& needle); bool contains_ignore_case(const string& haystack, const string& needle);

View File

@ -272,39 +272,54 @@ static pair<int, uint32_t> utf8_get_len(uint8_t leading) {
/** /**
* @brief Create a list of UCS-4 codepoints from a utf-8 encoded string * @brief Create a list of UCS-4 codepoints from a utf-8 encoded string
*
* If invalid utf8 bytes are encountered, they are skipped until the next valid codepoint; processing continues
* to the end of the string, and the function then returns false.
*
* The result_list is always populated with all valid utf8 codepoints.
*
* @return Whether the string is completely valid utf8
*/ */
bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) { bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
assert(src); assert(src);
bool has_errors = false;
const auto* begin = reinterpret_cast<const uint8_t*>(src); const auto* begin = reinterpret_cast<const uint8_t*>(src);
const auto* first = begin;
while (*first) {
// Number of bytes taken up by this codepoint and the bits contained in the leading byte.
auto [len, result] = utf8_get_len(*first);
// Invalid lengths const auto* current = begin;
while (*current) {
// Number of bytes taken up by this codepoint and the bits contained in the leading byte.
auto [len, result] = utf8_get_len(*current);
auto offset = current - begin;
/*
* Invalid lengths, this byte is not a valid leading byte.
* Skip it.
*/
if (len <= 0 || len > 4) { if (len <= 0 || len > 4) {
return false; has_errors = true;
current++;
continue;
} }
const uint8_t* next = first + 1; const uint8_t* next = current + 1;
for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) { for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - current < len); next++) {
result = result << 6; result = result << 6;
result |= *next & ~UTF8_CONTINUATION_MASK; result |= *next & ~UTF8_CONTINUATION_MASK;
} }
unicode_character uc_char; auto actual_len = next - current;
uc_char.codepoint = result; current = next;
uc_char.offset = first - begin;
uc_char.length = next - first;
result_list.push_back(uc_char);
if (uc_char.length != len) { if (actual_len != len) {
return false; has_errors = true;
continue;
} }
first = next; result_list.push_back(unicode_character{result, static_cast<int>(offset), static_cast<int>(actual_len)});
current = next;
} }
return true;
return !has_errors;
} }
/** /**

View File

@ -193,8 +193,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
ASSERT_EQ(str.size(), result_list.size()); ASSERT_EQ(str.size(), result_list.size());
int i = 0; for (size_t i = 0; i < str.size(); i++) {
for (const auto& unicode_char : result_list) { const auto& unicode_char = result_list[i];
auto c = str[i]; auto c = str[i];
// Matches the single byte character // Matches the single byte character
@ -203,8 +203,6 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
EXPECT_EQ(i, unicode_char.offset); EXPECT_EQ(i, unicode_char.offset);
// Only takes a single byte // Only takes a single byte
EXPECT_EQ(1, unicode_char.length); EXPECT_EQ(1, unicode_char.length);
i++;
} }
} }
@ -266,5 +264,6 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
const auto str = GetParam(); const auto str = GetParam();
bool success = string_util::utf8_to_ucs4(str.c_str(), result_list); bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
EXPECT_FALSE(success); EXPECT_FALSE(success);
EXPECT_EQ(0, result_list.size());
} }
// }}} // }}}