Cleanup ucs4_to_utf8
This commit is contained in:
parent
40bc8c7955
commit
32c78aa63a
@ -5,6 +5,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <deque>
|
#include <deque>
|
||||||
|
#include <iomanip>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
||||||
#include "cairo/font.hpp"
|
#include "cairo/font.hpp"
|
||||||
@ -168,8 +169,14 @@ namespace cairo {
|
|||||||
bool success = string_util::utf8_to_ucs4(utf8, chars);
|
bool success = string_util::utf8_to_ucs4(utf8, chars);
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
m_log.warn("Dropping invalid UTF8 text '%s'", utf8);
|
sstream hex;
|
||||||
return *this;
|
hex << std::hex << std::setw(2) << std::setfill('0');
|
||||||
|
|
||||||
|
for(const char& c: utf8) {
|
||||||
|
hex << (static_cast<int>(c) & 0xff) << " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
m_log.warn("Dropping invalid parts of UTF8 text '%s' %s", utf8, hex.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
while (!chars.empty()) {
|
while (!chars.empty()) {
|
||||||
@ -239,9 +246,9 @@ namespace cairo {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
char unicode[6]{'\0'};
|
std::array<char, 5> unicode{};
|
||||||
string_util::ucs4_to_utf8(unicode, chars.begin()->codepoint);
|
string_util::ucs4_to_utf8(unicode, chars.begin()->codepoint);
|
||||||
m_log.warn("Dropping unmatched character %s (U+%04x) in '%s'", unicode, chars.begin()->codepoint, t.contents);
|
m_log.warn("Dropping unmatched character '%s' (U+%04x) in '%s'", unicode.data(), chars.begin()->codepoint, t.contents);
|
||||||
utf8.erase(chars.begin()->offset, chars.begin()->length);
|
utf8.erase(chars.begin()->offset, chars.begin()->length);
|
||||||
for (auto&& c : chars) {
|
for (auto&& c : chars) {
|
||||||
c.offset -= chars.begin()->length;
|
c.offset -= chars.begin()->length;
|
||||||
|
@ -92,7 +92,7 @@ string utf8_truncate(string&& value, size_t len);
|
|||||||
/**
|
/**
|
||||||
* @brief Convert a UCS-4 codepoint to a utf-8 encoded string
|
* @brief Convert a UCS-4 codepoint to a utf-8 encoded string
|
||||||
*/
|
*/
|
||||||
size_t ucs4_to_utf8(char* utf8, unsigned int ucs);
|
size_t ucs4_to_utf8(std::array<char, 5>& utf8, unsigned int ucs);
|
||||||
|
|
||||||
string join(const vector<string>& strs, const string& delim);
|
string join(const vector<string>& strs, const string& delim);
|
||||||
vector<string> split(const string& s, char delim);
|
vector<string> split(const string& s, char delim);
|
||||||
|
@ -215,10 +215,8 @@ string trim(string&& value, const char& needle) {
|
|||||||
size_t char_len(const string& value) {
|
size_t char_len(const string& value) {
|
||||||
// utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
|
// utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
|
||||||
// simply count the number of bytes not of this form.
|
// simply count the number of bytes not of this form.
|
||||||
//
|
return std::count_if(
|
||||||
// 0xc0 = 11000000
|
value.begin(), value.end(), [](char c) { return (c & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_PREFIX; });
|
||||||
// 0x80 = 10000000
|
|
||||||
return std::count_if(value.begin(), value.end(), [](char c) { return (c & 0xc0) != 0x80; });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -235,16 +233,13 @@ string utf8_truncate(string&& value, size_t len) {
|
|||||||
// utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
|
// utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
|
||||||
// simply jump forward to bytes not of that form and truncate starting
|
// simply jump forward to bytes not of that form and truncate starting
|
||||||
// at that byte if we've counted too many codepoints
|
// at that byte if we've counted too many codepoints
|
||||||
//
|
|
||||||
// 0xc0 = 11000000
|
|
||||||
// 0x80 = 10000000
|
|
||||||
auto it = value.begin();
|
auto it = value.begin();
|
||||||
auto end = value.end();
|
auto end = value.end();
|
||||||
for (size_t i = 0; i < len; ++i) {
|
for (size_t i = 0; i < len; ++i) {
|
||||||
if (it == end)
|
if (it == end)
|
||||||
break;
|
break;
|
||||||
++it;
|
++it;
|
||||||
it = std::find_if(it, end, [](char c) { return (c & 0xc0) != 0x80; });
|
it = std::find_if(it, end, [](char c) { return (c & UTF8_CONTINUATION_MASK) != UTF8_CONTINUATION_PREFIX; });
|
||||||
}
|
}
|
||||||
value.erase(it, end);
|
value.erase(it, end);
|
||||||
|
|
||||||
@ -325,40 +320,25 @@ bool utf8_to_ucs4(const string& src, unicode_charlist& result_list) {
|
|||||||
/**
|
/**
|
||||||
* @brief Convert a UCS-4 codepoint to a utf-8 encoded string
|
* @brief Convert a UCS-4 codepoint to a utf-8 encoded string
|
||||||
*/
|
*/
|
||||||
size_t ucs4_to_utf8(char* utf8, uint32_t ucs) {
|
size_t ucs4_to_utf8(std::array<char, 5>& utf8, uint32_t ucs) {
|
||||||
if (ucs <= 0x7f) {
|
if (ucs <= 0x7f) {
|
||||||
*utf8 = ucs;
|
utf8[0] = ucs;
|
||||||
return 1;
|
return 1;
|
||||||
} else if (ucs <= 0x07ff) {
|
} else if (ucs <= 0x07ff) {
|
||||||
*(utf8++) = ((ucs >> 6) & 0xff) | 0xc0;
|
utf8[0] = ((ucs >> 6) & ~UTF8_LEADING2_MASK) | UTF8_LEADING2_PREFIX;
|
||||||
*utf8 = (ucs & 0x3f) | 0x80;
|
utf8[1] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
return 2;
|
return 2;
|
||||||
} else if (ucs <= 0xffff) {
|
} else if (ucs <= 0xffff) {
|
||||||
*(utf8++) = ((ucs >> 12) & 0x0f) | 0xe0;
|
utf8[0] = ((ucs >> 12) & ~UTF8_LEADING3_MASK) | UTF8_LEADING3_PREFIX;
|
||||||
*(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
|
utf8[1] = ((ucs >> 6) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
*utf8 = (ucs & 0x3f) | 0x80;
|
utf8[2] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
return 3;
|
return 3;
|
||||||
} else if (ucs <= 0x1fffff) {
|
} else if (ucs <= 0x10ffff) {
|
||||||
*(utf8++) = ((ucs >> 18) & 0x07) | 0xf0;
|
utf8[0] = ((ucs >> 18) & ~UTF8_LEADING4_MASK) | UTF8_LEADING4_PREFIX;
|
||||||
*(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
|
utf8[1] = ((ucs >> 12) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
*(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
|
utf8[2] = ((ucs >> 6) & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
*utf8 = (ucs & 0x3f) | 0x80;
|
utf8[3] = (ucs & ~UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_PREFIX;
|
||||||
return 4;
|
return 4;
|
||||||
} else if (ucs <= 0x03ffffff) {
|
|
||||||
*(utf8++) = ((ucs >> 24) & 0x03) | 0xf8;
|
|
||||||
*(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
|
|
||||||
*(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
|
|
||||||
*(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
|
|
||||||
*utf8 = (ucs & 0x3f) | 0x80;
|
|
||||||
return 5;
|
|
||||||
} else if (ucs <= 0x7fffffff) {
|
|
||||||
*(utf8++) = ((ucs >> 30) & 0x01) | 0xfc;
|
|
||||||
*(utf8++) = ((ucs >> 24) & 0x3f) | 0x80;
|
|
||||||
*(utf8++) = ((ucs >> 18) & 0x3f) | 0x80;
|
|
||||||
*(utf8++) = ((ucs >> 12) & 0x3f) | 0x80;
|
|
||||||
*(utf8++) = ((ucs >> 6) & 0x3f) | 0x80;
|
|
||||||
*utf8 = (ucs & 0x3f) | 0x80;
|
|
||||||
return 6;
|
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user