Make unicode_charlist a vector
This commit is contained in:
parent
c86519f077
commit
5e5a0a7c4d
@ -1,6 +1,5 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <list>
|
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
@ -56,7 +55,7 @@ struct unicode_character {
|
|||||||
*/
|
*/
|
||||||
int length{0};
|
int length{0};
|
||||||
};
|
};
|
||||||
using unicode_charlist = std::list<unicode_character>;
|
using unicode_charlist = std::vector<unicode_character>;
|
||||||
|
|
||||||
bool contains(const string& haystack, const string& needle);
|
bool contains(const string& haystack, const string& needle);
|
||||||
bool contains_ignore_case(const string& haystack, const string& needle);
|
bool contains_ignore_case(const string& haystack, const string& needle);
|
||||||
|
@ -272,39 +272,54 @@ static pair<int, uint32_t> utf8_get_len(uint8_t leading) {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
|
* @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
|
||||||
|
*
|
||||||
|
* If invalid utf8 characters are encountered, they are skipped until the next valid codepoint and the function will
|
||||||
|
* eventually return false.
|
||||||
|
*
|
||||||
|
* The result_list is always populated with all valid utf8 codepoints.
|
||||||
|
*
|
||||||
|
* @return Whether the string is completely valid utf8
|
||||||
*/
|
*/
|
||||||
bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
|
bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
|
||||||
assert(src);
|
assert(src);
|
||||||
|
bool has_errors = false;
|
||||||
const auto* begin = reinterpret_cast<const uint8_t*>(src);
|
const auto* begin = reinterpret_cast<const uint8_t*>(src);
|
||||||
const auto* first = begin;
|
|
||||||
while (*first) {
|
|
||||||
// Number of bytes taken up by this codepoint and the bits contained in the leading byte.
|
|
||||||
auto [len, result] = utf8_get_len(*first);
|
|
||||||
|
|
||||||
// Invalid lengths
|
const auto* current = begin;
|
||||||
|
while (*current) {
|
||||||
|
// Number of bytes taken up by this codepoint and the bits contained in the leading byte.
|
||||||
|
auto [len, result] = utf8_get_len(*current);
|
||||||
|
auto offset = current - begin;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Invalid length: this byte is not a valid leading byte.
|
||||||
|
* Skip it.
|
||||||
|
*/
|
||||||
if (len <= 0 || len > 4) {
|
if (len <= 0 || len > 4) {
|
||||||
return false;
|
has_errors = true;
|
||||||
|
current++;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint8_t* next = first + 1;
|
const uint8_t* next = current + 1;
|
||||||
for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) {
|
for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - current < len); next++) {
|
||||||
result = result << 6;
|
result = result << 6;
|
||||||
result |= *next & ~UTF8_CONTINUATION_MASK;
|
result |= *next & ~UTF8_CONTINUATION_MASK;
|
||||||
}
|
}
|
||||||
|
|
||||||
unicode_character uc_char;
|
auto actual_len = next - current;
|
||||||
uc_char.codepoint = result;
|
current = next;
|
||||||
uc_char.offset = first - begin;
|
|
||||||
uc_char.length = next - first;
|
|
||||||
result_list.push_back(uc_char);
|
|
||||||
|
|
||||||
if (uc_char.length != len) {
|
if (actual_len != len) {
|
||||||
return false;
|
has_errors = true;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
first = next;
|
result_list.push_back(unicode_character{result, static_cast<int>(offset), static_cast<int>(actual_len)});
|
||||||
|
current = next;
|
||||||
}
|
}
|
||||||
return true;
|
|
||||||
|
return !has_errors;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -193,8 +193,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
|
|||||||
|
|
||||||
ASSERT_EQ(str.size(), result_list.size());
|
ASSERT_EQ(str.size(), result_list.size());
|
||||||
|
|
||||||
int i = 0;
|
for (size_t i = 0; i < str.size(); i++) {
|
||||||
for (const auto& unicode_char : result_list) {
|
const auto& unicode_char = result_list[i];
|
||||||
auto c = str[i];
|
auto c = str[i];
|
||||||
|
|
||||||
// Matches the single byte character
|
// Matches the single byte character
|
||||||
@ -203,8 +203,6 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
|
|||||||
EXPECT_EQ(i, unicode_char.offset);
|
EXPECT_EQ(i, unicode_char.offset);
|
||||||
// Only takes a single byte
|
// Only takes a single byte
|
||||||
EXPECT_EQ(1, unicode_char.length);
|
EXPECT_EQ(1, unicode_char.length);
|
||||||
|
|
||||||
i++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -266,5 +264,6 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
|
|||||||
const auto str = GetParam();
|
const auto str = GetParam();
|
||||||
bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
|
bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
|
||||||
EXPECT_FALSE(success);
|
EXPECT_FALSE(success);
|
||||||
|
EXPECT_EQ(0, result_list.size());
|
||||||
}
|
}
|
||||||
// }}}
|
// }}}
|
||||||
|
Loading…
Reference in New Issue
Block a user