Always feed the entire file into uchardet when detection is needed

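uchardet reports a file as "ascii" when the data it has been given contains no
bytes > 127. Finishing detection after the first 4096-byte page therefore
misidentifies files whose first non-ASCII byte appears later, so feed the whole
file to uchardet before calling uchardet_data_end() and reading the charset.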
This commit is contained in:
Thomas Goyne 2019-09-28 13:56:56 -07:00
parent 8d17a0e88a
commit 6ca879938d

View file

@ -59,9 +59,6 @@ std::string Detect(agi::fs::path const& file) {
auto read = std::min<uint64_t>(4096, fp.size() - offset); auto read = std::min<uint64_t>(4096, fp.size() - offset);
auto buf = fp.read(offset, read); auto buf = fp.read(offset, read);
uchardet_handle_data(ud, buf, read); uchardet_handle_data(ud, buf, read);
uchardet_data_end(ud);
if (*uchardet_get_charset(ud))
return uchardet_get_charset(ud);
offset += read; offset += read;
@ -74,6 +71,7 @@ std::string Detect(agi::fs::path const& file) {
if (binaryish > offset / 8) if (binaryish > offset / 8)
return "binary"; return "binary";
} }
uchardet_data_end(ud);
return uchardet_get_charset(ud); return uchardet_get_charset(ud);
#else #else
auto read = std::min<uint64_t>(4096, fp.size()); auto read = std::min<uint64_t>(4096, fp.size());