From 3030bf4274313d8b9724b0a8f2c920dfe1f08bf4 Mon Sep 17 00:00:00 2001
From: LinFor <linfor@gmail.com>
Date: Wed, 11 Nov 2020 23:14:39 +0300
Subject: [PATCH] Handle UTF in long filenames (#20087)

Co-authored-by: Scott Lahteine <thinkyhead@users.noreply.github.com>
---
 Marlin/Configuration_adv.h       |   4 ++
 Marlin/src/lcd/fontutils.cpp     | 107 +++++++++++++------------------
 Marlin/src/sd/SdBaseFile.cpp     |  58 +++++++++++++++--
 Marlin/src/sd/SdFatConfig.h      |   7 +-
 buildroot/share/fonts/genpages.c |  48 +++++---------
 5 files changed, 124 insertions(+), 100 deletions(-)

diff --git a/Marlin/Configuration_adv.h b/Marlin/Configuration_adv.h
index ba56862d5db..b308b05e19b 100644
--- a/Marlin/Configuration_adv.h
+++ b/Marlin/Configuration_adv.h
@@ -1256,6 +1256,10 @@
                                       // Note: Only affects SCROLL_LONG_FILENAMES with SDSORT_CACHE_NAMES but not SDSORT_DYNAMIC_RAM.
   #endif
 
+  // Allow international symbols in long filenames. To display correctly, the
+  // LCD's font must contain the characters. Check your selected LCD language.
+  #define UTF_FILENAME_SUPPORT
+
   // This allows hosts to request long names for files and folders with M33
   //#define LONG_FILENAME_HOST_SUPPORT
 
diff --git a/Marlin/src/lcd/fontutils.cpp b/Marlin/src/lcd/fontutils.cpp
index 22b54c72de7..4aaf6218442 100644
--- a/Marlin/src/lcd/fontutils.cpp
+++ b/Marlin/src/lcd/fontutils.cpp
@@ -9,6 +9,8 @@
 
 #include "../inc/MarlinConfig.h"
 
+#define MAX_UTF8_CHAR_SIZE 4
+
 #if HAS_WIRED_LCD
   #include "marlinui.h"
   #include "../MarlinCore.h"
@@ -79,6 +81,8 @@ uint8_t* get_utf8_value_cb(uint8_t *pstart, read_byte_cb_t cb_read_byte, wchar_t
   uint32_t val = 0;
   uint8_t *p = pstart;
 
+  #define NEXT_6_BITS() do{ val <<= 6; p++; valcur = cb_read_byte(p); val |= (valcur & 0x3F); }while(0)
+
   uint8_t valcur = cb_read_byte(p);
   if (0 == (0x80 & valcur)) {
     val = valcur;
@@ -86,74 +90,51 @@ uint8_t* get_utf8_value_cb(uint8_t *pstart, read_byte_cb_t cb_read_byte, wchar_t
   }
   else if (0xC0 == (0xE0 & valcur)) {
     val = valcur & 0x1F;
-    val <<= 6;
-    p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    p++;
-  }
-  else if (0xE0 == (0xF0 & valcur)) {
-    val = valcur & 0x0F;
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    p++;
-  }
-  else if (0xF0 == (0xF8 & valcur)) {
-    val = valcur & 0x07;
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    p++;
-  }
-  else if (0xF8 == (0xFC & valcur)) {
-    val = valcur & 0x03;
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    p++;
-  }
-  else if (0xFC == (0xFE & valcur)) {
-    val = valcur & 0x01;
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
-    val <<= 6; p++;
-    valcur = cb_read_byte(p);
-    val |= (valcur & 0x3F);
+    NEXT_6_BITS();
     p++;
   }
+  #if MAX_UTF8_CHAR_SIZE >= 3
+    else if (0xE0 == (0xF0 & valcur)) {
+      val = valcur & 0x0F;
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      p++;
+    }
+  #endif
+  #if MAX_UTF8_CHAR_SIZE >= 4
+    else if (0xF0 == (0xF8 & valcur)) {
+      val = valcur & 0x07;
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      p++;
+    }
+  #endif
+  #if MAX_UTF8_CHAR_SIZE >= 5
+    else if (0xF8 == (0xFC & valcur)) {
+      val = valcur & 0x03;
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      p++;
+    }
+  #endif
+  #if MAX_UTF8_CHAR_SIZE >= 6
+    else if (0xFC == (0xFE & valcur)) {
+      val = valcur & 0x01;
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      NEXT_6_BITS();
+      p++;
+    }
+  #endif
   else if (0x80 == (0xC0 & valcur))
     for (; 0x80 == (0xC0 & valcur); ) { p++; valcur = cb_read_byte(p); }
   else
-    for (; ((0xFE & valcur) > 0xFC); ) { p++; valcur = cb_read_byte(p); }
+    for (; 0xFC < (0xFE & valcur); ) { p++; valcur = cb_read_byte(p); }
 
   if (pval) *pval = val;
 
diff --git a/Marlin/src/sd/SdBaseFile.cpp b/Marlin/src/sd/SdBaseFile.cpp
index 46ed9372ab6..acc5ba17f27 100644
--- a/Marlin/src/sd/SdBaseFile.cpp
+++ b/Marlin/src/sd/SdBaseFile.cpp
@@ -1103,19 +1103,67 @@ int8_t SdBaseFile::readDir(dir_t* dir, char* longFilename) {
         if (WITHIN(seq, 1, MAX_VFAT_ENTRIES)) {
           // TODO: Store the filename checksum to verify if a long-filename-unaware system modified the file table.
           n = (seq - 1) * (FILENAME_LENGTH);
-          LOOP_L_N(i, FILENAME_LENGTH)
-            longFilename[n + i] = (i < 5) ? VFAT->name1[i] : (i < 11) ? VFAT->name2[i - 5] : VFAT->name3[i - 11];
+          LOOP_L_N(i, FILENAME_LENGTH) {
+            uint16_t utf16_ch = (i < 5) ? VFAT->name1[i] : (i < 11) ? VFAT->name2[i - 5] : VFAT->name3[i - 11];
+            #if ENABLED(UTF_FILENAME_SUPPORT)
+              // We can't convert to UTF-8 here because UTF-8 is a variable-size encoding, but joining LFN blocks
+              // needs fixed byte offsets. So just store the full UTF-16LE words here and convert them later.
+              uint16_t idx = (n + i) * 2; // This is fixed as FAT LFN always contain UTF-16LE encoding
+              longFilename[idx] = utf16_ch & 0xFF;
+              longFilename[idx+1] = (utf16_ch >> 8) & 0xFF;
+            #else
+              // Replace every character that does not fit in a single byte with '_'
+              longFilename[n + i] = (utf16_ch > 0xFF) ? '_' : (utf16_ch & 0xFF);
+            #endif
+          }
           // If this VFAT entry is the last one, add a NUL terminator at the end of the string
-          if (VFAT->sequenceNumber & 0x40) longFilename[n + FILENAME_LENGTH] = '\0';
+          if (VFAT->sequenceNumber & 0x40) longFilename[(n + FILENAME_LENGTH) * LONG_FILENAME_CHARSIZE] = '\0';
         }
       }
     }
+
     // Return if normal file or subdirectory
-    if (DIR_IS_FILE_OR_SUBDIR(dir)) return n;
+    if (DIR_IS_FILE_OR_SUBDIR(dir)) {
+      #if ENABLED(UTF_FILENAME_SUPPORT)
+        // Convert filename from utf-16 to utf-8 as Marlin expects
+        #if LONG_FILENAME_CHARSIZE > 2
+          // Warn developers that 3-byte sequences are not supported yet: converting a series of 2-byte
+          // code units to 3-byte UTF-8 in-place would overwrite the rest of the filename.
+          #error "Currently filename re-encoding is done in-place. It may break the remaining chars to use 3-byte codepoints."
+        #endif
+        uint16_t currentPos = 0; // Write index of the in-place UTF-8 output; it never passes the word being read
+        LOOP_L_N(i, (LONG_FILENAME_LENGTH / 2)) {
+          const uint16_t idx = i * 2; // Byte offset of the i-th stored word (FAT LFN is always UTF-16LE)
+          // Assemble from unsigned bytes: a plain (possibly signed) char >= 0x80 would sign-extend and corrupt the word
+          const uint16_t utf16_ch = uint16_t(uint8_t(longFilename[idx])) | (uint16_t(uint8_t(longFilename[idx + 1])) << 8);
+          if (0xD800 == (utf16_ch & 0xF800))                                    // Surrogate (half of a pair) - encode as '_'
+            longFilename[currentPos++] = '_';
+          else if (0 == (utf16_ch & 0xFF80))                                    // Encode as 1-byte utf-8 char (also copies the NUL)
+            longFilename[currentPos++] = utf16_ch & 0x007F;
+          else if (0 == (utf16_ch & 0xF800)) {                                  // Encode as 2-byte utf-8 char
+            longFilename[currentPos++] = 0xC0 | ((utf16_ch >> 6) & 0x1F);
+            longFilename[currentPos++] = 0x80 | (utf16_ch & 0x3F);
+          }
+          else {
+            #if LONG_FILENAME_CHARSIZE > 2                                      // Encode as 3-byte utf-8 char
+              longFilename[currentPos++] = 0xE0 | ((utf16_ch >> 12) & 0x0F);
+              longFilename[currentPos++] = 0x80 | ((utf16_ch >> 6) & 0x3F);     // Continuation bytes are 10xxxxxx
+              longFilename[currentPos++] = 0x80 | (utf16_ch & 0x3F);
+            #else                                                               // Encode as '_'
+              longFilename[currentPos++] = '_';
+            #endif
+          }
+
+          if (0 == utf16_ch) break; // End of filename (the terminator has already been written above)
+        }
+        return currentPos;
+      #else
+        return n;
+      #endif
+    }
   }
 }
 
-
 // Read next directory entry into the cache
 // Assumes file is correctly positioned
 dir_t* SdBaseFile::readDirCache() {
diff --git a/Marlin/src/sd/SdFatConfig.h b/Marlin/src/sd/SdFatConfig.h
index 8f0596c5ddb..13ac3a74878 100644
--- a/Marlin/src/sd/SdFatConfig.h
+++ b/Marlin/src/sd/SdFatConfig.h
@@ -103,5 +103,10 @@
 
 #define FILENAME_LENGTH 13 // Number of UTF-16 characters per entry
 
+// UTF-8 may use up to 3 bytes to represent a single UTF-16 code unit.
+// We discard 3-byte characters, allowing only 2 bytes per character,
+// or 1 byte if UTF_FILENAME_SUPPORT is disabled.
+#define LONG_FILENAME_CHARSIZE TERN(UTF_FILENAME_SUPPORT, 2, 1)
+
 // Total bytes needed to store a single long filename
-#define LONG_FILENAME_LENGTH (FILENAME_LENGTH * MAX_VFAT_ENTRIES + 1)
+#define LONG_FILENAME_LENGTH (FILENAME_LENGTH * LONG_FILENAME_CHARSIZE * MAX_VFAT_ENTRIES + 1)
diff --git a/buildroot/share/fonts/genpages.c b/buildroot/share/fonts/genpages.c
index 2a87b19d47f..c855ceac509 100644
--- a/buildroot/share/fonts/genpages.c
+++ b/buildroot/share/fonts/genpages.c
@@ -71,63 +71,49 @@ uint8_t* get_utf8_value(uint8_t *pstart, wchar_t *pval) {
 
   assert(NULL != pstart);
 
+  #define NEXT_6_BITS() do{ val <<= 6; p++; val |= (*p & 0x3F); }while(0)
+
   if (0 == (0x80 & *p)) {
     val = (size_t)*p;
     p++;
   }
   else if (0xC0 == (0xE0 & *p)) {
     val = *p & 0x1F;
-    val <<= 6;
-    p++;
-    val |= (*p & 0x3F);
+    NEXT_6_BITS();
     p++;
     assert((wchar_t)val == get_val_utf82uni(pstart));
   }
   else if (0xE0 == (0xF0 & *p)) {
     val = *p & 0x0F;
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
+    NEXT_6_BITS();
+    NEXT_6_BITS();
     p++;
     assert((wchar_t)val == get_val_utf82uni(pstart));
   }
   else if (0xF0 == (0xF8 & *p)) {
     val = *p & 0x07;
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
     p++;
     assert((wchar_t)val == get_val_utf82uni(pstart));
   }
   else if (0xF8 == (0xFC & *p)) {
     val = *p & 0x03;
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
     p++;
     assert((wchar_t)val == get_val_utf82uni(pstart));
   }
   else if (0xFC == (0xFE & *p)) {
     val = *p & 0x01;
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
-    val <<= 6; p++;
-    val |= (*p & 0x3F);
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
+    NEXT_6_BITS();
     p++;
     assert((wchar_t)val == get_val_utf82uni(pstart));
   }