fix(label): Truncate label replacements based on codepoint count
This helps ensure that a string is never truncated in the middle of a UTF-8 multi-byte sequence. The codepoint count does not correspond 100% to user-perceived characters, but it should be pretty close in most cases.
This commit is contained in:
parent
1d06df25a9
commit
73faa18cf0
@ -81,6 +81,9 @@ namespace string_util {
|
||||
string rtrim(string&& value, const char& needle = ' ');
|
||||
string trim(string&& value, const char& needle = ' ');
|
||||
|
||||
// Number of utf8 codepoints in value: every byte that is not a 10xxxxxx continuation byte is counted.
size_t char_len(const string& value);
|
||||
// Keeps at most len codepoints of value, cutting only at a byte that is not a utf8 continuation byte.
string utf8_truncate(string&& value, size_t len);
|
||||
|
||||
string join(const vector<string>& strs, const string& delim);
|
||||
vector<string>& split_into(const string& s, char delim, vector<string>& container);
|
||||
vector<string> split(const string& s, char delim);
|
||||
|
@ -48,8 +48,8 @@ namespace drawtypes {
|
||||
|
||||
for (auto&& tok : m_tokens) {
|
||||
if (token == tok.token) {
|
||||
if (tok.max != 0_z && replacement.length() > tok.max) {
|
||||
replacement = replacement.erase(tok.max) + tok.suffix;
|
||||
if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
|
||||
replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
|
||||
} else if (tok.min != 0_z && replacement.length() < tok.min) {
|
||||
replacement.insert(0_z, tok.min - replacement.length(), ' ');
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
@ -145,6 +146,48 @@ namespace string_util {
|
||||
return rtrim(ltrim(forward<string>(value), needle), needle);
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts the number of codepoints in a utf8 encoded string.
|
||||
*/
|
||||
size_t char_len(const string& value) {
  // A codepoint is encoded as exactly one non-continuation byte optionally
  // followed by continuation bytes of the form 10xxxxxx. Counting every
  // byte that is NOT a continuation byte therefore yields the number of
  // codepoints.
  //
  // mask 0xc0 = 11000000 keeps the two top bits
  // tag  0x80 = 10000000 is what those bits look like on a continuation byte
  size_t codepoints = 0;
  for (const char byte : value) {
    if ((byte & 0xc0) == 0x80) {
      continue;
    }
    ++codepoints;
  }
  return codepoints;
}
|
||||
|
||||
/**
|
||||
* Truncates a utf8 string to at most len codepoints. The codepoint count
* does not exactly match the user-perceived character count, but it is
* close enough and avoids having to pull in something like ICU to count
* actual grapheme clusters.
|
||||
*/
|
||||
string utf8_truncate(string&& value, size_t len) {
  if (value.empty()) {
    return "";
  }

  // A byte of the form 10xxxxxx continues the codepoint started by an
  // earlier byte; any other byte begins a new codepoint.
  //
  // mask 0xc0 = 11000000
  // tag  0x80 = 10000000
  const auto starts_codepoint = [](char byte) { return (byte & 0xc0) != 0x80; };

  const auto last = value.end();
  auto cut = value.begin();

  // Each round steps over one codepoint: move past its first byte, then
  // skip ahead to the next byte that is not a continuation byte. Stop once
  // len codepoints were kept or the string is exhausted.
  size_t kept = 0;
  while (kept < len && cut != last) {
    cut = std::find_if(cut + 1, last, starts_codepoint);
    ++kept;
  }

  // Drop everything from the cut position onwards; the cut never lands on
  // a continuation byte.
  value.erase(cut, last);

  return forward<string>(value);
}
|
||||
|
||||
/**
|
||||
* Join all strings in vector into a single string separated by delim
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user