Make unicode_charlist a vector

2023-05-10 16:11:52 +02:00 · 2023-05-10 16:11:52 +02:00 · 5e5a0a7c4d
commit 5e5a0a7c4d
parent c86519f077
3 changed files with 36 additions and 23 deletions
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@ -1,6 +1,5 @@
 #pragma once

-#include <list>
 #include <sstream>

 #include "common.hpp"
@ -56,7 +55,7 @@ struct unicode_character {
   */
  int length{0};
 };
-using unicode_charlist = std::list<unicode_character>;
+using unicode_charlist = std::vector<unicode_character>;

 bool contains(const string& haystack, const string& needle);
 bool contains_ignore_case(const string& haystack, const string& needle);
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@ -272,39 +272,54 @@ static pair<int, uint32_t> utf8_get_len(uint8_t leading) {

 /**
 * @brief Create a list of UCS-4 codepoint from a utf-8 encoded string
+ *
+ * Invalid utf8 bytes are skipped until the next valid codepoint is found. If any invalid bytes were encountered,
+ * the function returns false after the whole string has been processed.
+ *
+ * The result_list is always populated with all valid utf8 codepoints.
+ *
+ * @return Whether the string is completely valid utf8
 */
 bool utf8_to_ucs4(const char* src, unicode_charlist& result_list) {
  assert(src);
+  bool has_errors = false;
  const auto* begin = reinterpret_cast<const uint8_t*>(src);
-  const auto* first = begin;
-  while (*first) {
-    // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
-    auto [len, result] = utf8_get_len(*first);

-    // Invalid lengths
+  const auto* current = begin;
+  while (*current) {
+    // Number of bytes taken up by this codepoint and the bits contained in the leading byte.
+    auto [len, result] = utf8_get_len(*current);
+    auto offset = current - begin;
+
+    /*
+     * Invalid length: this byte is not a valid leading byte,
+     * so skip it.
+     */
    if (len <= 0 || len > 4) {
-      return false;
+      has_errors = true;
+      current++;
+      continue;
    }

-    const uint8_t* next = first + 1;
-    for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - first < len); next++) {
+    const uint8_t* next = current + 1;
+    for (; ((*next & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_PREFIX) && (next - current < len); next++) {
      result = result << 6;
      result |= *next & ~UTF8_CONTINUATION_MASK;
    }

-    unicode_character uc_char;
-    uc_char.codepoint = result;
-    uc_char.offset = first - begin;
-    uc_char.length = next - first;
-    result_list.push_back(uc_char);
+    auto actual_len = next - current;
+    current = next;

-    if (uc_char.length != len) {
-      return false;
+    if (actual_len != len) {
+      has_errors = true;
+      continue;
    }

-    first = next;
+    result_list.push_back(unicode_character{result, static_cast<int>(offset), static_cast<int>(actual_len)});
+    current = next;
  }
-  return true;
+
+  return !has_errors;
 }

 /**
--- a/tests/unit_tests/utils/string.cpp
+++ b/tests/unit_tests/utils/string.cpp
@ -193,8 +193,8 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {

  ASSERT_EQ(str.size(), result_list.size());

-  int i = 0;
-  for (const auto& unicode_char : result_list) {
+  for (size_t i = 0; i < str.size(); i++) {
+    const auto& unicode_char = result_list[i];
    auto c = str[i];

    // Matches the single byte character
@ -203,8 +203,6 @@ TEST_P(Utf8ToUCS4AsciiTest, correctness) {
    EXPECT_EQ(i, unicode_char.offset);
    // Only takes a single byte
    EXPECT_EQ(1, unicode_char.length);
-
-    i++;
  }
 }

@ -266,5 +264,6 @@ TEST_P(Utf8ToUCS4InvalidTest, correctness) {
  const auto str = GetParam();
  bool success = string_util::utf8_to_ucs4(str.c_str(), result_list);
  EXPECT_FALSE(success);
+  EXPECT_EQ(0, result_list.size());
 }
 // }}}