fix(label): Truncate label replacements based on codepoint count

This helps ensure that when a string is truncated it is not done in the middle of a utf8 multi-byte sequence. This doesn't 100% correspond to user-perceived characters, but it should be pretty close in most cases.
2017-02-14 13:39:07 -06:00 · 2017-02-14 13:39:07 -06:00 · 73faa18cf0
commit 73faa18cf0
parent 1d06df25a9
3 changed files with 48 additions and 2 deletions
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@ -81,6 +81,9 @@ namespace string_util {
  string rtrim(string&& value, const char& needle = ' ');
  string trim(string&& value, const char& needle = ' ');
  size_t char_len(const string& value);
  string utf8_truncate(string&& value, size_t len);
  string join(const vector<string>& strs, const string& delim);
  vector<string>& split_into(const string& s, char delim, vector<string>& container);
  vector<string> split(const string& s, char delim);
--- a/src/drawtypes/label.cpp
+++ b/src/drawtypes/label.cpp
@ -48,8 +48,8 @@ namespace drawtypes {
    for (auto&& tok : m_tokens) {
      if (token == tok.token) {
-        if (tok.max != 0_z && replacement.length() > tok.max) {
+        if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
-          replacement = replacement.erase(tok.max) + tok.suffix;
+          replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
        } else if (tok.min != 0_z && replacement.length() < tok.min) {
          replacement.insert(0_z, tok.min - replacement.length(), ' ');
        }
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@ -1,3 +1,4 @@
 #include <algorithm>
 #include <cstring>
 #include <iomanip>
 #include <sstream>
@ -145,6 +146,48 @@ namespace string_util {
    return rtrim(ltrim(forward<string>(value), needle), needle);
  }
  /**
   * Returns the number of codepoints held by a utf8 encoded string.
   *
   * Each codepoint is encoded as one leading byte optionally followed by
   * continuation bytes, so the result is the number of bytes in value that
   * are not continuation bytes.
   */
  size_t char_len(const string& value) {
    size_t codepoints = 0;
    for (const char byte : value) {
      // A continuation byte has the bit pattern 10xxxxxx: masking with
      // 0xc0 (11000000) keeps the two high bits, which then equal
      // 0x80 (10000000).
      const bool is_continuation = (byte & 0xc0) == 0x80;
      if (!is_continuation) {
        ++codepoints;
      }
    }
    return codepoints;
  }
  /**
   * Shortens a utf8 string so that it keeps at most len codepoints and
   * returns the result; the cut is always made in front of a byte that is
   * not a continuation byte.
   *
   * Codepoints are only an approximation of user-perceived characters, but
   * counting them is close enough here and avoids depending on something
   * like ICU to find real grapheme cluster boundaries.
   */
  string utf8_truncate(string&& value, size_t len) {
    if (value.empty()) {
      return "";
    }

    auto cut = value.begin();
    const auto last = value.end();

    // Each pass consumes one codepoint: step over its first byte, then skip
    // every continuation byte (10xxxxxx, i.e. (byte & 0xc0) == 0x80) that
    // follows it. The walk stops early if the string runs out of bytes.
    for (size_t kept = 0; kept < len && cut != last; ++kept) {
      do {
        ++cut;
      } while (cut != last && (*cut & 0xc0) == 0x80);
    }

    // Drop everything from the cut position onwards.
    value.erase(cut, last);
    return forward<string>(value);
  }
  /**
   * Join all strings in vector into a single string separated by delim
   */