From 32c78aa63aabc0c724d76e0fbc408795569c1561 Mon Sep 17 00:00:00 2001
From: patrick96 <p.ziegler96@gmail.com>
Date: Wed, 10 May 2023 16:36:45 +0200
Subject: [PATCH] Clean up ucs4_to_utf8

---
 include/cairo/context.hpp | 15 ++++++++----
 include/utils/string.hpp  |  2 +-
 src/utils/string.cpp      | 50 ++++++++++++---------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)
diff --git a/include/cairo/context.hpp b/include/cairo/context.hpp
index af4807b4..b0589d0f 100644
--- a/include/cairo/context.hpp
+++ b/include/cairo/context.hpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <cmath>
 #include <deque>
+#include <iomanip>
 #include <iterator>
 
 #include "cairo/font.hpp"
@@ -168,8 +169,14 @@ namespace cairo {
       bool success = string_util::utf8_to_ucs4(utf8, chars);
 
       if (!success) {
-        m_log.warn("Dropping invalid UTF8 text '%s'", utf8);
-        return *this;
+        sstream hex;
+        hex << std::hex << std::setfill('0');  // sticky flags: set once for the whole dump
+
+        for (const char c : utf8) {
+          hex << std::setw(2) << (static_cast<int>(c) & 0xff) << " ";  // setw resets after each insertion
+        }
+
+        m_log.warn("Dropping invalid parts of UTF8 text '%s' %s", utf8, hex.to_string());
       }
 
       while (!chars.empty()) {
@@ -239,9 +246,9 @@ namespace cairo {
           continue;
         }
 
-        char unicode[6]{'\0'};
+        std::array<char, 5> unicode{};
         string_util::ucs4_to_utf8(unicode, chars.begin()->codepoint);
-        m_log.warn("Dropping unmatched character %s (U+%04x) in '%s'", unicode, chars.begin()->codepoint, t.contents);
+        m_log.warn("Dropping unmatched character '%s' (U+%04x) in '%s'", unicode.data(), chars.begin()->codepoint, t.contents);
         utf8.erase(chars.begin()->offset, chars.begin()->length);
         for (auto&& c : chars) {
           c.offset -= chars.begin()->length;
diff --git a/include/utils/string.hpp b/include/utils/string.hpp
index 3f8ed30a..7d0a3dc5 100644
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@@ -92,7 +92,7 @@ string utf8_truncate(string&& value, size_t len);
 /**
  * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
  */
-size_t ucs4_to_utf8(char* utf8, unsigned int ucs);
+size_t ucs4_to_utf8(std::array<char, 5>& utf8, unsigned int ucs);
 
 string join(const vector<string>& strs, const string& delim);
 vector<string> split(const string& s, char delim);
diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index 54325b4a..4e27b5dd 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -215,10 +215,8 @@ string trim(string&& value, const char& needle) {
 size_t char_len(const string& value) {
   // utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
   // simply count the number of bytes not of this form.
-  //
-  // 0xc0 = 11000000
-  // 0x80 = 10000000
-  return std::count_if(value.begin(), value.end(), [](char c) { return (c & 0xc0) != 0x80; });
+  return std::count_if(
+      value.begin(), value.end(), [](char c) { return (c & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_PREFIX; });
 }
 
 /**
@@ -235,16 +233,13 @@ string utf8_truncate(string&& value, size_t len) {
   // utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
   // simply jump forward to bytes not of that form and truncate starting
   // at that byte if we've counted too many codepoints
-  //
-  // 0xc0 = 11000000
-  // 0x80 = 10000000
   auto it = value.begin();
   auto end = value.end();
   for (size_t i = 0; i < len; ++i) {
     if (it == end)
       break;
     ++it;
-    it = std::find_if(it, end, [](char c) { return (c & 0xc0) != 0x80; });
+    it = std::find_if(it, end, [](char c) { return (c & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_PREFIX; });
   }
   value.erase(it, end);
 
@@ -325,40 +320,25 @@ bool utf8_to_ucs4(const string& src, unicode_charlist& result_list) {
 /**
  * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
  */
-size_t ucs4_to_utf8(char* utf8, uint32_t ucs) {
+size_t ucs4_to_utf8(std::array<char, 5>& utf8, uint32_t ucs) {
   if (ucs <= 0x7f) {
-    *utf8 = ucs;
+    utf8[0] = ucs;
     return 1;
   } else if (ucs <= 0x07ff) {
-    *(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;
-    *utf8 = (ucs & 0x3f) | 0x80;
+    utf8[0] = ((ucs >> 6) & ~UTF8_LEADING2_MASK) | UTF8_LEADING2_PREFIX;
+    utf8[1] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
     return 2;
   } else if (ucs <= 0xffff) {
-    *(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
-    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-    *utf8 = (ucs & 0x3f) | 0x80;
+    utf8[0] = ((ucs >> 12) & ~UTF8_LEADING3_MASK) | UTF8_LEADING3_PREFIX;
+    utf8[1] = ((ucs >> 6) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
+    utf8[2] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
     return 3;
-  } else if (ucs <= 0x1fffff) {
-    *(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
-    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-    *utf8 = (ucs & 0x3f) | 0x80;
+  } else if (ucs <= 0x10ffff) {
+    utf8[0] = ((ucs >> 18) & ~UTF8_LEADING4_MASK) | UTF8_LEADING4_PREFIX;
+    utf8[1] = ((ucs >> 12) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
+    utf8[2] = ((ucs >> 6) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
+    utf8[3] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
     return 4;
-  } else if (ucs <= 0x03ffffff) {
-    *(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
-    *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-    *utf8 = (ucs & 0x3f) | 0x80;
-    return 5;
-  } else if (ucs <= 0x7fffffff) {
-    *(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
-    *(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-    *utf8 = (ucs & 0x3f) | 0x80;
-    return 6;
   } else {
     return 0;
   }