From 71c65447f829769707324113e9847565bbdeec6e Mon Sep 17 00:00:00 2001
From: patrick96
Date: Wed, 10 May 2023 13:01:30 +0200
Subject: [PATCH] Move utf8 conversion code to string utils
---
include/cairo/context.hpp | 6 +--
include/cairo/font.hpp | 8 ++--
include/cairo/utils.hpp | 24 -----------
include/utils/string.hpp | 34 +++++++++++++--
src/cairo/utils.cpp | 88 ---------------------------------------
src/utils/string.cpp | 83 ++++++++++++++++++++++++++++++++++++
6 files changed, 121 insertions(+), 122 deletions(-)
diff --git a/include/cairo/context.hpp b/include/cairo/context.hpp
index 23167dfe..60f3a232 100644
--- a/include/cairo/context.hpp
+++ b/include/cairo/context.hpp
@@ -164,8 +164,8 @@ namespace cairo {
}
string utf8 = string(t.contents);
- utils::unicode_charlist chars;
- utils::utf8_to_ucs4((const unsigned char*)utf8.c_str(), chars);
+ string_util::unicode_charlist chars;
+ string_util::utf8_to_ucs4((const unsigned char*)utf8.c_str(), chars);
while (!chars.empty()) {
auto remaining = chars.size();
@@ -235,7 +235,7 @@ namespace cairo {
}
char unicode[6]{'\0'};
- utils::ucs4_to_utf8(unicode, chars.begin()->codepoint);
+ string_util::ucs4_to_utf8(unicode, chars.begin()->codepoint);
m_log.warn("Dropping unmatched character %s (U+%04x) in '%s'", unicode, chars.begin()->codepoint, t.contents);
utf8.erase(chars.begin()->offset, chars.begin()->length);
for (auto&& c : chars) {
diff --git a/include/cairo/font.hpp b/include/cairo/font.hpp
index 5b396480..d7a14da9 100644
--- a/include/cairo/font.hpp
+++ b/include/cairo/font.hpp
@@ -39,8 +39,8 @@ class font {
cairo_set_font_face(m_cairo, cairo_font_face_reference(m_font_face));
}
- virtual size_t match(utils::unicode_character& character) = 0;
- virtual size_t match(utils::unicode_charlist& charlist) = 0;
+ virtual size_t match(string_util::unicode_character& character) = 0;
+ virtual size_t match(string_util::unicode_charlist& charlist) = 0;
virtual size_t render(const string& text, double x = 0.0, double y = 0.0) = 0;
virtual void textwidth(const string& text, cairo_text_extents_t* extents) = 0;
@@ -187,13 +187,13 @@ class font_fc : public font {
cairo_set_scaled_font(m_cairo, m_scaled);
}
- size_t match(utils::unicode_character& character) override {
+ size_t match(string_util::unicode_character& character) override {
auto lock = make_unique(m_scaled);
auto face = static_cast(*lock);
return FT_Get_Char_Index(face, character.codepoint) ? 1 : 0;
}
- size_t match(utils::unicode_charlist& charlist) override {
+ size_t match(string_util::unicode_charlist& charlist) override {
auto lock = make_unique(m_scaled);
auto face = static_cast(*lock);
size_t available_chars = 0;
diff --git a/include/cairo/utils.hpp b/include/cairo/utils.hpp
index bd0497c1..9b5220ad 100644
--- a/include/cairo/utils.hpp
+++ b/include/cairo/utils.hpp
@@ -2,8 +2,6 @@
#include
-#include
-
#include "common.hpp"
POLYBAR_NS
@@ -39,32 +37,10 @@ namespace utils {
FT_Face m_face;
};
- /**
- * @brief Unicode character containing converted codepoint
- * and details on where its position in the source string
- */
- struct unicode_character {
- explicit unicode_character();
- unsigned long codepoint;
- int offset;
- int length;
- };
- using unicode_charlist = std::list;
-
/**
* @see
*/
cairo_operator_t str2operator(const string& mode, cairo_operator_t fallback);
-
- /**
- * @brief Create a UCS-4 codepoint from a utf-8 encoded string
- */
- bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list);
-
- /**
- * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
- */
- size_t ucs4_to_utf8(char* utf8, unsigned int ucs);
} // namespace utils
} // namespace cairo
diff --git a/include/utils/string.hpp b/include/utils/string.hpp
index 68399933..176ee978 100644
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@@ -1,5 +1,6 @@
#pragma once
+#include
#include
#include "common.hpp"
@@ -8,8 +9,6 @@ POLYBAR_NS
class sstream {
public:
- sstream() : m_stream() {}
-
template
sstream& operator<<(const T& object) {
m_stream << object;
@@ -25,7 +24,7 @@ class sstream {
return m_stream.str();
}
- const string to_string() const {
+ string to_string() const {
return m_stream.str();
}
@@ -39,6 +38,26 @@ namespace string_util {
*/
using hash_type = unsigned long;
+/**
+ * @brief Unicode character containing the converted codepoint
+ * and details on its position in the source string
+ */
+struct unicode_character {
+ /**
+ * The numerical codepoint. Nominally between U+0000 and U+10FFFF (the range is not validated by utf8_to_ucs4)
+ */
+ unsigned long codepoint{0};
+ /**
+ * Byte offset of this character in the original string
+ */
+ int offset{0};
+ /**
+ * Number of bytes used by this character in the original string
+ */
+ int length{0};
+};
+using unicode_charlist = std::list;
+
bool contains(const string& haystack, const string& needle);
bool contains_ignore_case(const string& haystack, const string& needle);
bool ends_with(const string& haystack, const string& suffix);
@@ -66,6 +85,15 @@ string trim(string&& value, const char& needle = ' ');
size_t char_len(const string& value);
string utf8_truncate(string&& value, size_t len);
+/**
+ * @brief Decode a NUL-terminated utf-8 string into UCS-4 codepoints appended to result_list; returns false if src is null or a lead byte is invalid
+ */
+bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list);
+
+/**
+ * @brief Encode a single UCS-4 codepoint as utf-8 into the buffer utf8 (up to 6 bytes, no NUL terminator); returns the number of bytes written, or 0 if ucs is above 0x7fffffff
+ */
+size_t ucs4_to_utf8(char* utf8, unsigned int ucs);
string join(const vector& strs, const string& delim);
vector split(const string& s, char delim);
diff --git a/src/cairo/utils.cpp b/src/cairo/utils.cpp
index f5649130..b7cc5e06 100644
--- a/src/cairo/utils.cpp
+++ b/src/cairo/utils.cpp
@@ -38,11 +38,6 @@ namespace utils {
return m_face;
}
- // }}}
- // implementation : unicode_character {{{
-
- unicode_character::unicode_character() : codepoint(0), offset(0), length(0) {}
-
// }}}
/**
@@ -87,89 +82,6 @@ namespace utils {
auto it = modes.find(mode);
return it != modes.end() ? it->second : fallback;
}
-
- /**
- * @brief Create a UCS-4 codepoint from a utf-8 encoded string
- */
- bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) {
- if (!src) {
- return false;
- }
- const unsigned char* first = src;
- while (*first) {
- int len = 0;
- unsigned long result = 0;
- if ((*first >> 7) == 0) {
- len = 1;
- result = *first;
- } else if ((*first >> 5) == 6) {
- len = 2;
- result = *first & 31;
- } else if ((*first >> 4) == 14) {
- len = 3;
- result = *first & 15;
- } else if ((*first >> 3) == 30) {
- len = 4;
- result = *first & 7;
- } else {
- return false;
- }
- const unsigned char* next;
- for (next = first + 1; *next && ((*next >> 6) == 2) && (next - first < len); next++) {
- result = result << 6;
- result |= *next & 63;
- }
- unicode_character uc_char;
- uc_char.codepoint = result;
- uc_char.offset = first - src;
- uc_char.length = next - first;
- result_list.push_back(uc_char);
- first = next;
- }
- return true;
- }
-
- /**
- * @brief Convert a UCS-4 codepoint to a utf-8 encoded string
- */
- size_t ucs4_to_utf8(char* utf8, unsigned int ucs) {
- if (ucs <= 0x7f) {
- *utf8 = ucs;
- return 1;
- } else if (ucs <= 0x07ff) {
- *(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;
- *utf8 = (ucs & 0x3f) | 0x80;
- return 2;
- } else if (ucs <= 0xffff) {
- *(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
- *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
- *utf8 = (ucs & 0x3f) | 0x80;
- return 3;
- } else if (ucs <= 0x1fffff) {
- *(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
- *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
- *utf8 = (ucs & 0x3f) | 0x80;
- return 4;
- } else if (ucs <= 0x03ffffff) {
- *(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
- *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
- *utf8 = (ucs & 0x3f) | 0x80;
- return 5;
- } else if (ucs <= 0x7fffffff) {
- *(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
- *(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
- *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
- *utf8 = (ucs & 0x3f) | 0x80;
- return 6;
- } else {
- return 0;
- }
- }
} // namespace utils
} // namespace cairo
diff --git a/src/utils/string.cpp b/src/utils/string.cpp
index 86f7f95d..681eae4d 100644
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@@ -224,6 +224,89 @@ string utf8_truncate(string&& value, size_t len) {
return forward(value);
}
+/**
+ * @brief Decode a NUL-terminated utf-8 string into UCS-4 codepoints appended to result_list; returns false if src is null or a lead byte is invalid
+ */
+bool utf8_to_ucs4(const unsigned char* src, unicode_charlist& result_list) {
+ if (!src) {  // null input: nothing to decode
+ return false;
+ }
+ const unsigned char* first = src;  // lead byte of the character currently being decoded
+ while (*first) {  // walk the string until the terminating NUL byte
+ int len = 0;  // expected sequence length in bytes, derived from the lead byte
+ unsigned long result = 0;  // codepoint bits accumulated so far
+ if ((*first >> 7) == 0) {  // 0xxxxxxx: single-byte character
+ len = 1;
+ result = *first;
+ } else if ((*first >> 5) == 6) {  // 110xxxxx: lead byte of a 2-byte sequence
+ len = 2;
+ result = *first & 31;  // keep the low 5 payload bits
+ } else if ((*first >> 4) == 14) {  // 1110xxxx: lead byte of a 3-byte sequence
+ len = 3;
+ result = *first & 15;  // keep the low 4 payload bits
+ } else if ((*first >> 3) == 30) {  // 11110xxx: lead byte of a 4-byte sequence
+ len = 4;
+ result = *first & 7;  // keep the low 3 payload bits
+ } else {  // 10xxxxxx or 11111xxx lead byte: give up; entries already appended stay in result_list
+ return false;
+ }
+ const unsigned char* next;
+ for (next = first + 1; *next && ((*next >> 6) == 2) && (next - first < len); next++) {  // consume at most len-1 continuation bytes (10xxxxxx); stops early at NUL or any other byte
+ result = result << 6;
+ result |= *next & 63;  // each continuation byte contributes its low 6 bits
+ }
+ unicode_character uc_char;
+ uc_char.codepoint = result;
+ uc_char.offset = first - src;  // byte offset of the lead byte from the start of src
+ uc_char.length = next - first;  // bytes actually consumed; less than len if the sequence was cut short
+ result_list.push_back(uc_char);
+ first = next;  // continue with the byte after this character
+ }
+ return true;
+}
+
+/**
+ * @brief Encode a single UCS-4 codepoint as utf-8 into the buffer utf8 (up to 6 bytes, no NUL terminator); returns the number of bytes written, or 0 if ucs is above 0x7fffffff
+ */
+size_t ucs4_to_utf8(char* utf8, unsigned int ucs) {
+ if (ucs <= 0x7f) {  // 1 byte: 0xxxxxxx
+ *utf8 = ucs;
+ return 1;
+ } else if (ucs <= 0x07ff) {  // 2 bytes: 110xxxxx 10xxxxxx
+ *(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;  // ucs >> 6 is at most 0x1f in this branch, so the 0xff mask drops nothing
+ *utf8 = (ucs & 0x3f) | 0x80;
+ return 2;
+ } else if (ucs <= 0xffff) {  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+ *(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
+ *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+ *utf8 = (ucs & 0x3f) | 0x80;
+ return 3;
+ } else if (ucs <= 0x1fffff) {  // 4 bytes: 11110xxx followed by three 10xxxxxx bytes
+ *(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
+ *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+ *utf8 = (ucs & 0x3f) | 0x80;
+ return 4;
+ } else if (ucs <= 0x03ffffff) {  // 5 bytes: 111110xx lead; NOTE(review): 5- and 6-byte forms are not part of RFC 3629 utf-8 - confirm whether any caller needs them
+ *(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
+ *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+ *utf8 = (ucs & 0x3f) | 0x80;
+ return 5;
+ } else if (ucs <= 0x7fffffff) {  // 6 bytes: 1111110x lead; fills a 6-byte buffer completely, leaving no room for a terminator
+ *(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
+ *(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
+ *(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
+ *utf8 = (ucs & 0x3f) | 0x80;
+ return 6;
+ } else {  // ucs > 0x7fffffff: not encodable, nothing is written
+ return 0;
+ }
+}
+
/**
* Join all strings in vector into a single string separated by delim
*/