Always feed the entire file into uchardet when detection is needed
uchardet will report that a file is "ascii" if the first page has no bytes > 127, so we need to look at the entire file in case the first non-ASCII byte appears later in the file.
This commit is contained in:
parent
8d17a0e88a
commit
6ca879938d
1 changed file with 1 addition and 3 deletions
|
@ -59,9 +59,6 @@ std::string Detect(agi::fs::path const& file) {
|
|||
auto read = std::min<uint64_t>(4096, fp.size() - offset);
|
||||
auto buf = fp.read(offset, read);
|
||||
uchardet_handle_data(ud, buf, read);
|
||||
uchardet_data_end(ud);
|
||||
if (*uchardet_get_charset(ud))
|
||||
return uchardet_get_charset(ud);
|
||||
|
||||
offset += read;
|
||||
|
||||
|
@ -74,6 +71,7 @@ std::string Detect(agi::fs::path const& file) {
|
|||
if (binaryish > offset / 8)
|
||||
return "binary";
|
||||
}
|
||||
uchardet_data_end(ud);
|
||||
return uchardet_get_charset(ud);
|
||||
#else
|
||||
auto read = std::min<uint64_t>(4096, fp.size());
|
||||
|
|
Loading…
Reference in a new issue