This commit is contained in:
patrick96 2023-05-10 16:46:09 +02:00 committed by Patrick Ziegler
parent 32c78aa63a
commit 425d4dc338
3 changed files with 21 additions and 25 deletions

View File

@ -166,9 +166,10 @@ namespace cairo {
string utf8 = t.contents; string utf8 = t.contents;
string_util::unicode_charlist chars; string_util::unicode_charlist chars;
bool success = string_util::utf8_to_ucs4(utf8, chars); bool valid = string_util::utf8_to_ucs4(utf8, chars);
if (!success) { // The conversion already removed any invalid chunks. We should probably log a warning though.
if (!valid) {
sstream hex; sstream hex;
hex << std::hex << std::setw(2) << std::setfill('0'); hex << std::hex << std::setw(2) << std::setfill('0');

View File

@ -84,14 +84,7 @@ string trim(string&& value, const char& needle = ' ');
size_t char_len(const string& value); size_t char_len(const string& value);
string utf8_truncate(string&& value, size_t len); string utf8_truncate(string&& value, size_t len);
/**
* @brief Create a UCS-4 codepoint from a utf-8 encoded string
*/
[[nodiscard]] bool utf8_to_ucs4(const string& src, unicode_charlist& result_list); [[nodiscard]] bool utf8_to_ucs4(const string& src, unicode_charlist& result_list);
/**
* @brief Convert a UCS-4 codepoint to a utf-8 encoded string
*/
size_t ucs4_to_utf8(std::array<char, 5>& utf8, unsigned int ucs); size_t ucs4_to_utf8(std::array<char, 5>& utf8, unsigned int ucs);
string join(const vector<string>& strs, const string& delim); string join(const vector<string>& strs, const string& delim);

View File

@ -188,8 +188,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
string_util::unicode_charlist result_list{}; string_util::unicode_charlist result_list{};
string str = GetParam(); string str = GetParam();
bool success = string_util::utf8_to_ucs4(str, result_list); bool valid = string_util::utf8_to_ucs4(str, result_list);
ASSERT_TRUE(success); ASSERT_TRUE(valid);
ASSERT_EQ(str.size(), result_list.size()); ASSERT_EQ(str.size(), result_list.size());
@ -206,18 +206,20 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
} }
} }
// String containing a single codepoint and the expected numerical codepoint
using single_test_t = std::pair<string, uint32_t>; using single_test_t = std::pair<string, uint32_t>;
class Utf8ToUCS4SingleTest : public testing::TestWithParam<single_test_t> {}; class Utf8ToUCS4SingleTest : public testing::TestWithParam<single_test_t> {};
const vector<single_test_t> utf8_to_ucs4_single_list = { const vector<single_test_t> utf8_to_ucs4_single_list = {
{" ", 0x20}, {"\u007f", 0x7f}, // End of 1 byte range {" ", 0x20}, // Single ASCII character
{"\u0080", 0x80}, // Start of 2 byte range {"\u007f", 0x7f}, // End of 1 byte range
{"\u07ff", 0x7ff}, // End of 2 byte range {"\u0080", 0x80}, // Start of 2 byte range
{"\u0800", 0x800}, // Start of 3 byte range {"\u07ff", 0x7ff}, // End of 2 byte range
{"\uffff", 0xffff}, // End of 3 byte range {"\u0800", 0x800}, // Start of 3 byte range
{"\U00010000", 0x10000}, // Start of 4 byte range {"\uffff", 0xffff}, // End of 3 byte range
{"\U0010ffff", 0x10ffff}, // End of 4 byte range {"\U00010000", 0x10000}, // Start of 4 byte range
{"\U0001f600", 0x1f600}, // Grinning face emoji {"\U0010ffff", 0x10ffff}, // End of 4 byte range
{"\U0001f600", 0x1f600}, // Grinning face emoji
}; };
INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4SingleTest, testing::ValuesIn(utf8_to_ucs4_single_list)); INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4SingleTest, testing::ValuesIn(utf8_to_ucs4_single_list));
@ -229,8 +231,8 @@ TEST_P(Utf8ToUCS4SingleTest, correctness) {
string_util::unicode_charlist result_list{}; string_util::unicode_charlist result_list{};
const auto [str, codepoint] = GetParam(); const auto [str, codepoint] = GetParam();
bool success = string_util::utf8_to_ucs4(str, result_list); bool valid = string_util::utf8_to_ucs4(str, result_list);
ASSERT_TRUE(success); ASSERT_TRUE(valid);
ASSERT_EQ(1, result_list.size()); ASSERT_EQ(1, result_list.size());
@ -262,8 +264,8 @@ INSTANTIATE_TEST_SUITE_P(Inst, Utf8ToUCS4InvalidTest, testing::ValuesIn(utf8_to_
TEST_P(Utf8ToUCS4InvalidTest, correctness) { TEST_P(Utf8ToUCS4InvalidTest, correctness) {
string_util::unicode_charlist result_list{}; string_util::unicode_charlist result_list{};
const auto str = GetParam(); const auto str = GetParam();
bool success = string_util::utf8_to_ucs4(str, result_list); bool valid = string_util::utf8_to_ucs4(str, result_list);
EXPECT_FALSE(success); EXPECT_FALSE(valid);
EXPECT_EQ(0, result_list.size()); EXPECT_EQ(0, result_list.size());
} }
@ -273,8 +275,8 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
TEST(String, utf8ToUCS4Partial) { TEST(String, utf8ToUCS4Partial) {
string_util::unicode_charlist result_list{}; string_util::unicode_charlist result_list{};
string str = "\xe0\x70\x80"; // a valid ascii character between two invalid characters string str = "\xe0\x70\x80"; // a valid ascii character between two invalid characters
bool success = string_util::utf8_to_ucs4(str, result_list); bool valid = string_util::utf8_to_ucs4(str, result_list);
EXPECT_FALSE(success); EXPECT_FALSE(valid);
EXPECT_EQ(1, result_list.size()); EXPECT_EQ(1, result_list.size());
EXPECT_EQ(0x70, result_list[0].codepoint); EXPECT_EQ(0x70, result_list[0].codepoint);