build: Add source file encoding check

Source files are checked using a small utility in src/build-utils. This is done to prevent build and localization bugs caused by non-UTF-8 encodings, which MSVC interprets according to the local codepage rather than as UTF-8.
2019-08-19 12:25:18 +02:00 · 2019-08-19 12:25:18 +02:00 · 0ded335488
commit 0ded335488
parent f937209619
7 changed files with 168 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,5 +1,6 @@
 project(PrusaSlicer-native)

+# build-utils defines the encoding_check() function that subdirectories
+# added below (e.g. avrdude) call, so it has to be added before them.
+add_subdirectory(build-utils)
 add_subdirectory(admesh)
 add_subdirectory(avrdude)
 # boost/nowide
--- a/src/avrdude/CMakeLists.txt
+++ b/src/avrdude/CMakeLists.txt
@ -100,6 +100,9 @@ add_dependencies(avrdude gen_conf_h)
 add_executable(avrdude-slic3r main-standalone.cpp)
 target_link_libraries(avrdude-slic3r avrdude)

+encoding_check(avrdude)
+encoding_check(avrdude-slic3r)
+
 if (WIN32)
    target_compile_definitions(avrdude PRIVATE WIN32NATIVE=1)
    if(MSVC)
--- a/src/build-utils/CMakeLists.txt
+++ b/src/build-utils/CMakeLists.txt
@ -0,0 +1,39 @@
+
+# The checker binary that encoding_check() runs on every source file.
+add_executable(encoding-check encoding-check.cpp)
+
+# Aggregate target with no commands of its own. encoding_check() makes it
+# depend on every per-target check and makes every checked target depend
+# on it, so all encoding checks run before any source of a checked target
+# is compiled and a bad file fails the build as early as possible.
+add_custom_target(global-encoding-check ALL DEPENDS encoding-check)
+
+# encoding_check(<target>)
+#
+# Adds a source file encoding check to <target> using the encoding-check
+# binary built above. Every file listed in the SOURCES property of <target>
+# is passed to the checker; a file that fails the check fails the build.
+#
+# Arguments:
+#   TARGET - name of an existing target whose sources are already set.
+#
+# Example:
+#   add_library(semver STATIC semver.c semver.h)
+#   encoding_check(semver)
+function(encoding_check TARGET)
+    # Obtain target source files
+    get_target_property(T_SOURCES ${TARGET} SOURCES)
+
+    # get_target_property() yields T_SOURCES-NOTFOUND when the property is
+    # not set; stop here instead of passing that string on as a file name.
+    if(NOT T_SOURCES)
+        message(FATAL_ERROR
+            "encoding_check: target '${TARGET}' has no SOURCES to check")
+    endif()
+
+    # Define top-level encoding check target for this ${TARGET}
+    add_custom_target(encoding-check-${TARGET}
+        DEPENDS encoding-check ${T_SOURCES}
+        COMMENT "Checking source files encodings for target ${TARGET}"
+    )
+
+    # Add checking of each source file as a subcommand of encoding-check-${TARGET}.
+    # The commands run from the directory of the calling CMakeLists.txt so that
+    # relative source paths resolve; VERBATIM keeps paths containing spaces or
+    # special characters intact on every platform.
+    foreach(src_file IN LISTS T_SOURCES)
+        add_custom_command(TARGET encoding-check-${TARGET} POST_BUILD
+            COMMAND $<TARGET_FILE:encoding-check> "${TARGET}" "${src_file}"
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            VERBATIM
+        )
+    endforeach()
+
+    # This adds dependency on encoding-check-${TARGET} to ${TARGET}
+    # via the global-encoding-check
+    add_dependencies(global-encoding-check encoding-check-${TARGET})
+    add_dependencies(${TARGET} global-encoding-check)
+endfunction()
--- a/src/build-utils/encoding-check.cpp
+++ b/src/build-utils/encoding-check.cpp
@ -0,0 +1,119 @@
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * I wrote this code mainly as a specification of functionality; there
+ * are no doubt performance optimizations possible for certain CPUs.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+
+// Scans the '\0'-terminated byte string at s and returns a pointer to the
+// first byte of the first sequence that is not acceptable UTF-8 (malformed,
+// overlong, a surrogate, U+FFFE/U+FFFF, or above U+10FFFF), or NULL when the
+// whole string is acceptable.
+unsigned char *utf8_check(unsigned char *s)
+{
+    // True when b has the form 10xxxxxx, i.e. is a continuation byte.
+    const auto is_trail = [](unsigned char b) { return (b & 0xc0) == 0x80; };
+
+    for (; *s != 0; ) {
+        const unsigned char lead = s[0];
+
+        // 0xxxxxxx
+        if (lead < 0x80) {
+            s += 1;
+            continue;
+        }
+
+        // 110xxxxx 10xxxxxx
+        if ((lead & 0xe0) == 0xc0) {
+            const bool overlong = (lead & 0xfe) == 0xc0;
+            if (!is_trail(s[1]) || overlong) {
+                return s;
+            }
+            s += 2;
+            continue;
+        }
+
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        if ((lead & 0xf0) == 0xe0) {
+            // Continuation bytes are tested in order so that nothing past
+            // the terminating '\0' is ever read.
+            if (!is_trail(s[1]) || !is_trail(s[2])) {
+                return s;
+            }
+            const bool overlong  = lead == 0xe0 && (s[1] & 0xe0) == 0x80;
+            const bool surrogate = lead == 0xed && (s[1] & 0xe0) == 0xa0;
+            const bool non_char  = lead == 0xef && s[1] == 0xbf &&
+                                   (s[2] & 0xfe) == 0xbe;   // U+FFFE or U+FFFF
+            if (overlong || surrogate || non_char) {
+                return s;
+            }
+            s += 3;
+            continue;
+        }
+
+        // 11110xxX 10xxxxxx 10xxxxxx 10xxxxxx
+        if ((lead & 0xf8) == 0xf0) {
+            if (!is_trail(s[1]) || !is_trail(s[2]) || !is_trail(s[3])) {
+                return s;
+            }
+            const bool overlong  = lead == 0xf0 && (s[1] & 0xf0) == 0x80;
+            const bool too_large = (lead == 0xf4 && s[1] > 0x8f) || lead > 0xf4; // > U+10FFFF
+            if (overlong || too_large) {
+                return s;
+            }
+            s += 4;
+            continue;
+        }
+
+        // Stray continuation byte or an invalid lead byte.
+        return s;
+    }
+
+    return NULL;
+}
+
+
+// Entry point: encoding-check <program/library> <file>
+//
+// Verifies that <file> is valid UTF-8 without a byte order mark. The first
+// argument is only used to name the owning target in error messages.
+//
+// Exit status: 0 when the file passes (an empty file passes trivially),
+// -1 on wrong usage, -2 when the file cannot be read or fails a check.
+int main(int argc, char const *argv[])
+{
+    if (argc != 3) {
+        std::cerr << "Usage: " << argv[0] << " <program/library> <file>" << std::endl;
+        return -1;
+    }
+
+    const char* target = argv[1];
+    const char* filename = argv[2];
+
+    // Prints the diagnostic and terminates the process; never returns.
+    const auto error_exit = [=](const char* error) {
+        std::cerr << "\n\tError: " << error << ": " << filename << "\n"
+            << "\tTarget: " << target << "\n"
+            << std::endl;
+        std::exit(-2);
+    };
+
+    // Open positioned at the end so that tellg() yields the file size.
+    std::ifstream file(filename, std::ios::binary | std::ios::ate);
+    if (!file.is_open()) {
+        error_exit("Could not read source file");
+    }
+
+    // tellg() reports -1 on failure; that value must never reach the
+    // buffer allocation below.
+    const std::streamoff size = file.tellg();
+    if (size < 0) {
+        error_exit("Could not read source file");
+    }
+
+    if (size == 0) {
+        return 0;
+    }
+
+    file.seekg(0, std::ios::beg);
+    std::vector<char> buffer(static_cast<std::size_t>(size));
+
+    if (file.read(buffer.data(), size)) {
+        // utf8_check() expects a '\0'-terminated string.
+        // NOTE(review): it also stops at the first '\0' byte, so content
+        // after an embedded '\0' in the file is not validated.
+        buffer.push_back('\0');
+
+        // Check UTF-8 validity
+        if (utf8_check(reinterpret_cast<unsigned char*>(buffer.data())) != nullptr) {
+            error_exit("Source file does not contain (valid) UTF-8");
+        }
+
+        // Check against a BOM mark
+        if (buffer.size() >= 3
+            && buffer[0] == '\xef'
+            && buffer[1] == '\xbb'
+            && buffer[2] == '\xbf') {
+            error_exit("Source file is valid UTF-8 but contains a BOM mark");
+        }
+    } else {
+        error_exit("Could not read source file");
+    }
+
+    return 0;
+}
--- a/src/libslic3r/CMakeLists.txt
+++ b/src/libslic3r/CMakeLists.txt
@ -187,6 +187,8 @@ add_library(libslic3r STATIC
    SLA/SLARasterWriter.cpp
 )

+encoding_check(libslic3r)
+
 if (SLIC3R_PCH AND NOT SLIC3R_SYNTAXONLY)
    add_precompiled_header(libslic3r pchheader.hpp FORCEINCLUDE)
 endif ()
--- a/src/semver/CMakeLists.txt
+++ b/src/semver/CMakeLists.txt
@ -5,3 +5,5 @@ add_library(semver STATIC
    semver.c
    semver.h
 )
+
+encoding_check(semver)
--- a/src/slic3r/CMakeLists.txt
+++ b/src/slic3r/CMakeLists.txt
@ -163,6 +163,8 @@ endif ()

 add_library(libslic3r_gui STATIC ${SLIC3R_GUI_SOURCES})

+encoding_check(libslic3r_gui)
+
 target_link_libraries(libslic3r_gui libslic3r avrdude cereal imgui ${GLEW_LIBRARIES})
 if (SLIC3R_PCH AND NOT SLIC3R_SYNTAXONLY)
    add_precompiled_header(libslic3r_gui pchheader.hpp FORCEINCLUDE)