Move utf8 conversion code to string utils

2023-05-10 13:01:30 +02:00 · 2023-05-10 13:01:30 +02:00 · 71c65447f8
commit 71c65447f8
parent 6e716296ff
6 changed files with 121 additions and 122 deletions
--- a/include/cairo/context.hpp
+++ b/include/cairo/context.hpp
@@ -164,8 +164,8 @@ namespace cairo {
      }

      string utf8 = string(t.contents);
-      utils::unicode_charlist chars;
-      utils::utf8_to_ucs4((const unsigned char*)utf8.c_str(), chars);
+      string_util::unicode_charlist chars;
+      string_util::utf8_to_ucs4((const unsigned char*)utf8.c_str(), chars);

      while (!chars.empty()) {
        auto remaining = chars.size();
@@ -235,7 +235,7 @@ namespace cairo {
        }

        char unicode[6]{'\0'};
-        utils::ucs4_to_utf8(unicode, chars.begin()->codepoint);
+        string_util::ucs4_to_utf8(unicode, chars.begin()->codepoint);
        m_log.warn("Dropping unmatched character %s (U+%04x) in '%s'", unicode, chars.begin()->codepoint, t.contents);
        utf8.erase(chars.begin()->offset, chars.begin()->length);
        for (auto&& c : chars) {
--- a/include/cairo/font.hpp
+++ b/include/cairo/font.hpp
@@ -39,8 +39,8 @@ class font {
    cairo_set_font_face(m_cairo, cairo_font_face_reference(m_font_face));
  }

-  virtual size_t match(utils::unicode_character& character) = 0;
-  virtual size_t match(utils::unicode_charlist& charlist) = 0;
+  virtual size_t match(string_util::unicode_character& character) = 0;
+  virtual size_t match(string_util::unicode_charlist& charlist) = 0;
  virtual size_t render(const string& text, double x = 0.0, double y = 0.0) = 0;
  virtual void textwidth(const string& text, cairo_text_extents_t* extents) = 0;

@@ -187,13 +187,13 @@ class font_fc : public font {
    cairo_set_scaled_font(m_cairo, m_scaled);
  }

-  size_t match(utils::unicode_character& character) override {
+  size_t match(string_util::unicode_character& character) override {
    auto lock = make_unique<utils::ft_face_lock>(m_scaled);
    auto face = static_cast<FT_Face>(*lock);
    return FT_Get_Char_Index(face, character.codepoint) ? 1 : 0;
  }

-  size_t match(utils::unicode_charlist& charlist) override {
+  size_t match(string_util::unicode_charlist& charlist) override {
    auto lock = make_unique<utils::ft_face_lock>(m_scaled);
    auto face = static_cast<FT_Face>(*lock);
    size_t available_chars = 0;
--- a/include/cairo/utils.hpp
+++ b/include/cairo/utils.hpp
@@ -2,8 +2,6 @@

 #include <cairo/cairo-ft.h>

-#include <list>
-
 #include "common.hpp"

 POLYBAR_NS
@@ -39,32 +37,10 @@ namespace utils {
    FT_Face m_face;
  };

-  /**
-   * @brief Unicode character containing converted codepoint
-   * and details on where its position in the source string
-   */
-  struct unicode_character {
-    explicit unicode_character();
-    unsigned long codepoint;
-    int offset;
-    int length;
-  };
-  using unicode_charlist = std::list<unicode_character>;
-
  /**
   * @see <cairo/cairo.h>
   */
  cairo_operator_t str2operator(const string& mode, cairo_operator_t fallback);
-
-  /**
-   * @brief Create a UCS-4 codepoint from a utf-8 encoded string
-   */
-  bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list);
-
-  /**
-   * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
-   */
-  size_t ucs4_to_utf8(char* utf8, unsigned int ucs);
 } // namespace utils
 } // namespace cairo

--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@@ -1,5 +1,6 @@
 #pragma once

+#include <list>
 #include <sstream>

 #include "common.hpp"
@@ -8,8 +9,6 @@ POLYBAR_NS

 class sstream {
 public:
-  sstream() : m_stream() {}
-
  template <typename T>
  sstream& operator<<(const T& object) {
    m_stream << object;
@@ -25,7 +24,7 @@ class sstream {
    return m_stream.str();
  }

-  const string to_string() const {
+   string to_string() const {
    return m_stream.str();
  }

@@ -39,6 +38,26 @@ namespace string_util {
 */
 using hash_type = unsigned long;

+/**
+ * @brief Unicode character containing the converted codepoint
+ * and details on its position in the source string
+ */
+struct unicode_character {
+  /**
+   * The numerical codepoint. Nominally between U+0000 and U+10FFFF (not range-checked)
+   */
+  unsigned long codepoint{0};
+  /**
+   * Byte offset of this character in the original string
+   */
+  int offset{0};
+  /**
+   * Number of bytes used by this character in the original string
+   */
+  int length{0};
+};
+using unicode_charlist = std::list<unicode_character>;
+
 bool contains(const string& haystack, const string& needle);
 bool contains_ignore_case(const string& haystack, const string& needle);
 bool ends_with(const string& haystack, const string& suffix);
@@ -66,6 +85,15 @@ string trim(string&& value, const char& needle = ' ');

 size_t char_len(const string& value);
 string utf8_truncate(string&& value, size_t len);
+/**
+ * @brief Decode a NUL-terminated utf-8 string into UCS-4 codepoints appended to result_list
+ */
+bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list);  // false if src is null or a lead byte is invalid
+
+/**
+ * @brief Encode a UCS-4 codepoint as utf-8 into the buffer utf8
+ */
+size_t ucs4_to_utf8(char* utf8, unsigned int ucs);  // returns bytes written (1-6), or 0 if ucs > 0x7fffffff

 string join(const vector<string>& strs, const string& delim);
 vector<string> split(const string& s, char delim);
--- a/src/cairo/utils.cpp
+++ b/src/cairo/utils.cpp
@@ -38,11 +38,6 @@ namespace utils {
    return m_face;
  }

-  // }}}
-  // implementation : unicode_character {{{
-
-  unicode_character::unicode_character() : codepoint(0), offset(0), length(0) {}
-
  // }}}

  /**
@@ -87,89 +82,6 @@ namespace utils {
    auto it = modes.find(mode);
    return it != modes.end() ? it->second : fallback;
  }
-
-  /**
-   * @brief Create a UCS-4 codepoint from a utf-8 encoded string
-   */
-  bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) {
-    if (!src) {
-      return false;
-    }
-    const unsigned char* first = src;
-    while (*first) {
-      int len = 0;
-      unsigned long result = 0;
-      if ((*first >> 7) == 0) {
-        len = 1;
-        result = *first;
-      } else if ((*first >> 5) == 6) {
-        len = 2;
-        result = *first & 31;
-      } else if ((*first >> 4) == 14) {
-        len = 3;
-        result = *first & 15;
-      } else if ((*first >> 3) == 30) {
-        len = 4;
-        result = *first & 7;
-      } else {
-        return false;
-      }
-      const unsigned char* next;
-      for (next = first + 1; *next && ((*next >> 6) == 2) && (next - first < len); next++) {
-        result = result << 6;
-        result |= *next & 63;
-      }
-      unicode_character uc_char;
-      uc_char.codepoint = result;
-      uc_char.offset = first - src;
-      uc_char.length = next - first;
-      result_list.push_back(uc_char);
-      first = next;
-    }
-    return true;
-  }
-
-  /**
-   * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
-   */
-  size_t ucs4_to_utf8(char* utf8, unsigned int ucs) {
-    if (ucs <= 0x7f) {
-      *utf8 = ucs;
-      return 1;
-    } else if (ucs <= 0x07ff) {
-      *(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;
-      *utf8 = (ucs & 0x3f) | 0x80;
-      return 2;
-    } else if (ucs <= 0xffff) {
-      *(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
-      *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-      *utf8 = (ucs & 0x3f) | 0x80;
-      return 3;
-    } else if (ucs <= 0x1fffff) {
-      *(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
-      *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-      *utf8 = (ucs & 0x3f) | 0x80;
-      return 4;
-    } else if (ucs <= 0x03ffffff) {
-      *(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
-      *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-      *utf8 = (ucs & 0x3f) | 0x80;
-      return 5;
-    } else if (ucs <= 0x7fffffff) {
-      *(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
-      *(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
-      *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
-      *utf8 = (ucs & 0x3f) | 0x80;
-      return 6;
-    } else {
-      return 0;
-    }
-  }
 } // namespace utils
 } // namespace cairo

--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -224,6 +224,89 @@ string utf8_truncate(string&& value, size_t len) {
  return forward<string>(value);
 }

+/**
+ * @brief Decode a NUL-terminated utf-8 string into UCS-4 codepoints, appending one entry per character to result_list
+ */
+bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) {
+  if (!src) {
+    return false;
+  }
+  const unsigned char* first = src;
+  while (*first) {  // stop at the terminating NUL byte
+    int len = 0;  // sequence length announced by the lead byte
+    unsigned long result = 0;
+    if ((*first >> 7) == 0) {  // 0xxxxxxx: single byte
+      len = 1;
+      result = *first;
+    } else if ((*first >> 5) == 6) {  // 110xxxxx: lead byte of a 2-byte sequence
+      len = 2;
+      result = *first & 31;
+    } else if ((*first >> 4) == 14) {  // 1110xxxx: lead byte of a 3-byte sequence
+      len = 3;
+      result = *first & 15;
+    } else if ((*first >> 3) == 30) {  // 11110xxx: lead byte of a 4-byte sequence
+      len = 4;
+      result = *first & 7;
+    } else {
+      return false;  // not a valid lead byte; entries appended so far stay in result_list
+    }
+    const unsigned char* next;  // ends one past the last byte consumed for this character
+    for (next = first + 1; *next && ((*next >> 6) == 2) && (next - first < len); next++) {
+      result = result << 6;  // each 10xxxxxx continuation byte contributes 6 payload bits
+      result |= *next & 63;
+    }
+    unicode_character uc_char;
+    uc_char.codepoint = result;
+    uc_char.offset = first - src;
+    uc_char.length = next - first;  // bytes actually consumed; less than len if the sequence is cut short
+    result_list.push_back(uc_char);
+    first = next;
+  }
+  return true;
+}
+
+/**
+ * @brief Encode a UCS-4 codepoint as utf-8 into the buffer utf8; returns the number of bytes written (0 if unencodable)
+ */
+size_t ucs4_to_utf8(char* utf8, unsigned int ucs) {  // writes 1-6 bytes, never a terminating NUL
+  if (ucs <= 0x7f) {  // up to 7 bits: single byte
+    *utf8 = ucs;
+    return 1;
+  } else if (ucs <= 0x07ff) {  // up to 11 bits: 2 bytes
+    *(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;  // ucs >> 6 is at most 0x1f here, so the wide mask is harmless
+    *utf8 = (ucs & 0x3f) | 0x80;
+    return 2;
+  } else if (ucs <= 0xffff) {  // up to 16 bits: 3 bytes
+    *(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
+    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+    *utf8 = (ucs & 0x3f) | 0x80;
+    return 3;
+  } else if (ucs <= 0x1fffff) {  // up to 21 bits: 4 bytes
+    *(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
+    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+    *utf8 = (ucs & 0x3f) | 0x80;
+    return 4;
+  } else if (ucs <= 0x03ffffff) {  // up to 26 bits: 5 bytes
+    *(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
+    *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+    *utf8 = (ucs & 0x3f) | 0x80;
+    return 5;
+  } else if (ucs <= 0x7fffffff) {  // up to 31 bits: 6 bytes
+    *(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
+    *(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+    *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+    *utf8 = (ucs & 0x3f) | 0x80;
+    return 6;
+  } else {
+    return 0;  // nothing is written for values above 0x7fffffff
+  }
+}
+
 /**
 * Join all strings in vector into a single string separated by delim
 */