PrusaSlicer-NonPlainar/src/build-utils/encoding-check.cpp
Vojtech Kral 0ded335488 build: Add source file encoding check
Source files are checked using a small utility in src/build-utils
This is done to prevent bugs in build and localization caused
by weird non-UTF-8 encodings interpreted by MSVC in terms of local codepages
rather than UTF-8.
2019-08-19 17:04:14 +02:00

120 lines
3.7 KiB
C++

#include <vector>
#include <iostream>
#include <fstream>
#include <cstdlib>
/*
* The utf8_check() function scans the '\0'-terminated string starting
* at s. It returns a pointer to the first byte of the first malformed
* or overlong UTF-8 sequence found, or NULL if the string contains
* only correct UTF-8. It also spots UTF-8 sequences that could cause
* trouble if converted to UTF-16, namely surrogate characters
* (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
* routine is very likely to find a malformed sequence if the input
* uses any other encoding than UTF-8. It therefore can be used as a
* very effective heuristic for distinguishing between UTF-8 and other
* encodings.
*
* I wrote this code mainly as a specification of functionality; there
* are no doubt performance optimizations possible for certain CPUs.
*
* Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
* License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
*/
unsigned char *utf8_check(unsigned char *s)
{
while (*s) {
if (*s < 0x80) {
// 0xxxxxxx
s++;
} else if ((s[0] & 0xe0) == 0xc0) {
// 110xxxxx 10xxxxxx
if ((s[1] & 0xc0) != 0x80 ||
(s[0] & 0xfe) == 0xc0) { // overlong?
return s;
} else {
s += 2;
}
} else if ((s[0] & 0xf0) == 0xe0) {
// 1110xxxx 10xxxxxx 10xxxxxx
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || // overlong?
(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || // surrogate?
(s[0] == 0xef && s[1] == 0xbf &&
(s[2] & 0xfe) == 0xbe)) { // U+FFFE or U+FFFF?
return s;
} else {
s += 3;
}
} else if ((s[0] & 0xf8) == 0xf0) {
// 11110xxX 10xxxxxx 10xxxxxx 10xxxxxx
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || // overlong?
(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) { // > U+10FFFF?
return s;
} else {
s += 4;
}
} else {
return s;
}
}
return NULL;
}
int main(int argc, char const *argv[])
{
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <program/library> <file>" << std::endl;
return -1;
}
const char* target = argv[1];
const char* filename = argv[2];
const auto error_exit = [=](const char* error) {
std::cerr << "\n\tError: " << error << ": " << filename << "\n"
<< "\tTarget: " << target << "\n"
<< std::endl;
std::exit(-2);
};
std::ifstream file(filename, std::ios::binary | std::ios::ate);
const auto size = file.tellg();
if (size == 0) {
return 0;
}
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (file.read(buffer.data(), size)) {
buffer.push_back('\0');
// Check UTF-8 validity
if (utf8_check(reinterpret_cast<unsigned char*>(buffer.data())) != nullptr) {
error_exit("Source file does not contain (valid) UTF-8");
}
// Check against a BOM mark
if (buffer.size() >= 3
&& buffer[0] == '\xef'
&& buffer[1] == '\xbb'
&& buffer[2] == '\xbf') {
error_exit("Source file is valid UTF-8 but contains a BOM mark");
}
} else {
error_exit("Could not read source file");
}
return 0;
}