Always feed the entire file into uchardet when detection is needed
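uchardet reports a file as "ascii" when the data it has been given contains no bytes above 127. Finalizing detection after only the first 4096-byte chunk therefore misidentifies files whose first non-ASCII byte appears later, so we now feed the entire file to uchardet before asking it for the charset.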
This commit is contained in:
parent
8d17a0e88a
commit
6ca879938d
1 changed file with 1 addition and 3 deletions
|
@ -59,9 +59,6 @@ std::string Detect(agi::fs::path const& file) {
|
||||||
auto read = std::min<uint64_t>(4096, fp.size() - offset);
|
auto read = std::min<uint64_t>(4096, fp.size() - offset);
|
||||||
auto buf = fp.read(offset, read);
|
auto buf = fp.read(offset, read);
|
||||||
uchardet_handle_data(ud, buf, read);
|
uchardet_handle_data(ud, buf, read);
|
||||||
uchardet_data_end(ud);
|
|
||||||
if (*uchardet_get_charset(ud))
|
|
||||||
return uchardet_get_charset(ud);
|
|
||||||
|
|
||||||
offset += read;
|
offset += read;
|
||||||
|
|
||||||
|
@ -74,6 +71,7 @@ std::string Detect(agi::fs::path const& file) {
|
||||||
if (binaryish > offset / 8)
|
if (binaryish > offset / 8)
|
||||||
return "binary";
|
return "binary";
|
||||||
}
|
}
|
||||||
|
uchardet_data_end(ud);
|
||||||
return uchardet_get_charset(ud);
|
return uchardet_get_charset(ud);
|
||||||
#else
|
#else
|
||||||
auto read = std::min<uint64_t>(4096, fp.size());
|
auto read = std::min<uint64_t>(4096, fp.size());
|
||||||
|
|
Loading…
Reference in a new issue