Rewrite encoding detection
Now feeds all data to uchardet, when uchardet is available. The file size limit is removed. When uchardet is not available, we look for UTF-8 BOM. This should make loading UTF-8-BOM files faster. Because Aegisub always save file in UTF-8-BOM, this should also guarentee Aegisub will load large (>100MB) file saved by itself. See Aegisub/Aegisub#110
This commit is contained in:
parent
4c6d370d51
commit
f733297499
1 changed files with 21 additions and 25 deletions
|
@ -29,37 +29,33 @@ namespace agi { namespace charset {
|
||||||
std::string Detect(agi::fs::path const& file) {
|
std::string Detect(agi::fs::path const& file) {
|
||||||
agi::read_file_mapping fp(file);
|
agi::read_file_mapping fp(file);
|
||||||
|
|
||||||
|
#ifdef WITH_UCHARDET
|
||||||
|
agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
|
||||||
|
for (uint64_t offset = 0; offset < fp.size(); ) {
|
||||||
|
auto read = std::min<uint64_t>(65536, fp.size() - offset);
|
||||||
|
auto buf = fp.read(offset, read);
|
||||||
|
uchardet_handle_data(ud, buf, read);
|
||||||
|
offset += read;
|
||||||
|
}
|
||||||
|
uchardet_data_end(ud);
|
||||||
|
std::string encoding = uchardet_get_charset(ud);
|
||||||
|
return encoding.empty() ? "binary" : encoding;
|
||||||
|
#else
|
||||||
|
|
||||||
|
// Look for utf-8 BOM
|
||||||
|
if (fp.size() >= 3) {
|
||||||
|
const char* buf = fp.read(0, 3);
|
||||||
|
if (!strncmp(buf, "\xef\xbb\xbf", 3))
|
||||||
|
return "utf-8";
|
||||||
|
}
|
||||||
|
|
||||||
// If it's over 100 MB it's either binary or big enough that we won't
|
// If it's over 100 MB it's either binary or big enough that we won't
|
||||||
// be able to do anything useful with it anyway
|
// be able to do anything useful with it anyway
|
||||||
if (fp.size() > 100 * 1024 * 1024)
|
if (fp.size() > 100 * 1024 * 1024)
|
||||||
return "binary";
|
return "binary";
|
||||||
|
|
||||||
uint64_t binaryish = 0;
|
uint64_t binaryish = 0;
|
||||||
|
auto read = std::min<uint64_t>(65536, fp.size());
|
||||||
#ifdef WITH_UCHARDET
|
|
||||||
agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
|
|
||||||
for (uint64_t offset = 0; offset < fp.size(); ) {
|
|
||||||
auto read = std::min<uint64_t>(4096, fp.size() - offset);
|
|
||||||
auto buf = fp.read(offset, read);
|
|
||||||
uchardet_handle_data(ud, buf, read);
|
|
||||||
uchardet_data_end(ud);
|
|
||||||
if (*uchardet_get_charset(ud))
|
|
||||||
return uchardet_get_charset(ud);
|
|
||||||
|
|
||||||
offset += read;
|
|
||||||
|
|
||||||
// A dumb heuristic to detect binary files
|
|
||||||
for (size_t i = 0; i < read; ++i) {
|
|
||||||
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
|
|
||||||
++binaryish;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (binaryish > offset / 8)
|
|
||||||
return "binary";
|
|
||||||
}
|
|
||||||
return uchardet_get_charset(ud);
|
|
||||||
#else
|
|
||||||
auto read = std::min<uint64_t>(4096, fp.size());
|
|
||||||
auto buf = fp.read(0, read);
|
auto buf = fp.read(0, read);
|
||||||
for (size_t i = 0; i < read; ++i) {
|
for (size_t i = 0; i < read; ++i) {
|
||||||
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
|
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
|
||||||
|
|
Loading…
Reference in a new issue