fix(label): Truncate label replacements based on codepoint count
This helps ensure that a truncated string is never cut in the middle of a UTF-8 multi-byte sequence. Codepoints don't correspond 100% to user-perceived characters, but the result should be pretty close in most cases.
This commit is contained in:
parent
1d06df25a9
commit
73faa18cf0
@ -81,6 +81,9 @@ namespace string_util {
|
|||||||
string rtrim(string&& value, const char& needle = ' ');
|
string rtrim(string&& value, const char& needle = ' ');
|
||||||
string trim(string&& value, const char& needle = ' ');
|
string trim(string&& value, const char& needle = ' ');
|
||||||
|
|
||||||
|
// Counts the number of codepoints in a utf8 encoded string.
size_t char_len(const string& value);
|
||||||
|
// Truncates a utf8 encoded string at len number of codepoints.
string utf8_truncate(string&& value, size_t len);
|
||||||
|
|
||||||
string join(const vector<string>& strs, const string& delim);
|
string join(const vector<string>& strs, const string& delim);
|
||||||
vector<string>& split_into(const string& s, char delim, vector<string>& container);
|
vector<string>& split_into(const string& s, char delim, vector<string>& container);
|
||||||
vector<string> split(const string& s, char delim);
|
vector<string> split(const string& s, char delim);
|
||||||
|
@ -48,8 +48,8 @@ namespace drawtypes {
|
|||||||
|
|
||||||
for (auto&& tok : m_tokens) {
|
for (auto&& tok : m_tokens) {
|
||||||
if (token == tok.token) {
|
if (token == tok.token) {
|
||||||
if (tok.max != 0_z && replacement.length() > tok.max) {
|
if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
|
||||||
replacement = replacement.erase(tok.max) + tok.suffix;
|
replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
|
||||||
} else if (tok.min != 0_z && replacement.length() < tok.min) {
|
} else if (tok.min != 0_z && replacement.length() < tok.min) {
|
||||||
replacement.insert(0_z, tok.min - replacement.length(), ' ');
|
replacement.insert(0_z, tok.min - replacement.length(), ' ');
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include <algorithm>
#include <cstring>
#include <iomanip>
#include <sstream>
#include <utility>
|
||||||
@ -145,6 +146,48 @@ namespace string_util {
|
|||||||
return rtrim(ltrim(forward<string>(value), needle), needle);
|
return rtrim(ltrim(forward<string>(value), needle), needle);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns how many codepoints a utf8 encoded string holds.
 */
size_t char_len(const string& value) {
  // Each codepoint contributes exactly one byte that is not a
  // continuation byte (bit pattern 10xxxxxx), so tallying those bytes
  // gives the codepoint count.
  //
  // mask 0xc0 = 11000000 keeps the two high bits
  // tag  0x80 = 10000000 is what a continuation byte looks like once masked
  size_t codepoints = 0;
  for (const char byte : value) {
    if ((byte & 0xc0) != 0x80) {
      ++codepoints;
    }
  }
  return codepoints;
}
|
||||||
|
|
||||||
|
/**
 * Truncates a utf8 encoded string so that at most len codepoints remain.
 *
 * The cut is always placed in front of a byte that starts a codepoint, so
 * a multi-byte sequence is never split. Codepoints do not match
 * user-perceived characters 100%, but this is close enough and avoids
 * having to pull in something like ICU to count actual grapheme clusters.
 *
 * @param value string to shorten; it is consumed by the call
 * @param len maximum number of codepoints to keep
 * @return the shortened string; a string holding len or fewer codepoints
 *         (including the empty string) is returned unchanged
 */
string utf8_truncate(string&& value, size_t len) {
  // utf-8 bytes of the form 10xxxxxx are continuation bytes, so every
  // other byte marks the start of a codepoint.
  //
  // 0xc0 = 11000000
  // 0x80 = 10000000
  const auto starts_codepoint = [](char c) { return (c & 0xc0) != 0x80; };

  auto it = value.begin();
  const auto end = value.end();

  // Each pass consumes one codepoint: step over its first byte, then
  // skip ahead to the next byte that is not a continuation byte.
  for (size_t i = 0; i < len && it != end; ++i) {
    ++it;
    it = std::find_if(it, end, starts_codepoint);
  }

  // No-op when the string ran out before len codepoints were consumed.
  value.erase(it, end);

  // value is a plain rvalue reference, not a forwarding reference, so
  // std::move (rather than std::forward) is the matching cast.
  return std::move(value);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Join all strings in vector into a single string separated by delim
|
* Join all strings in vector into a single string separated by delim
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user