Add some dumb heuristics to detect binary files in the charset detector to avoid taking forever feeding hundreds of MB through it. Closes #1438.
Originally committed to SVN as r6405.
This commit is contained in:
parent
d68a395499
commit
983ffc1e83
1 changed file with 33 additions and 10 deletions
|
@@ -20,11 +20,8 @@
|
||||||
|
|
||||||
#include "charset_ucd.h"
|
#include "charset_ucd.h"
|
||||||
|
|
||||||
#ifndef LAGI_PRE
|
|
||||||
#include <memory>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "libaegisub/io.h"
|
#include "libaegisub/io.h"
|
||||||
|
#include "libaegisub/scoped_ptr.h"
|
||||||
|
|
||||||
#include "../../universalchardet/nsCharSetProber.h"
|
#include "../../universalchardet/nsCharSetProber.h"
|
||||||
|
|
||||||
|
@@ -34,13 +31,39 @@ namespace agi {
|
||||||
|
|
||||||
UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) {
|
UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) {
|
||||||
{
|
{
|
||||||
std::auto_ptr<std::ifstream> fp(io::Open(file, true));
|
agi::scoped_ptr<std::ifstream> fp(io::Open(file, true));
|
||||||
|
|
||||||
while (!mDone && !fp->eof()) {
|
// If it's over 100 MB it's either binary or big enough that we won't
|
||||||
char buf[512];
|
// be able to do anything useful with it anyway
|
||||||
fp->read(buf, 512);
|
fp->seekg(0, std::ios::end);
|
||||||
size_t bytes = (size_t)fp->gcount();
|
if (fp->tellg() > 100 * 1024 * 1024) {
|
||||||
HandleData(buf, (PRUint32)bytes);
|
list.insert(CLDPair(1.f, "binary"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
fp->seekg(0, std::ios::beg);
|
||||||
|
|
||||||
|
std::streamsize binaryish = 0;
|
||||||
|
std::streamsize bytes = 0;
|
||||||
|
|
||||||
|
while (!mDone && *fp) {
|
||||||
|
char buf[4096];
|
||||||
|
fp->read(buf, sizeof(buf));
|
||||||
|
std::streamsize read = fp->gcount();
|
||||||
|
HandleData(buf, (PRUint32)read);
|
||||||
|
|
||||||
|
// A dumb heuristic to detect binary files
|
||||||
|
if (!mDone) {
|
||||||
|
bytes += read;
|
||||||
|
for (std::streamsize i = 0; i < read; ++i) {
|
||||||
|
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
|
||||||
|
++binaryish;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (binaryish > bytes / 8) {
|
||||||
|
list.insert(CLDPair(1.f, "binary"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue