2010-05-28 07:40:21 +00:00
|
|
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
|
|
// copyright notice and this permission notice appear in all copies.
|
|
|
|
//
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
|
|
|
|
/// @file charset_ucd.cpp
|
|
|
|
/// @brief Character set detection using Universalchardet
|
|
|
|
/// @ingroup libaegisub
|
|
|
|
|
2010-06-17 00:23:44 +00:00
|
|
|
#include "charset_ucd.h"
|
2010-05-28 07:40:21 +00:00
|
|
|
|
|
|
|
#include "libaegisub/io.h"
|
2012-01-31 04:03:55 +00:00
|
|
|
#include "libaegisub/scoped_ptr.h"
|
2010-05-28 07:40:21 +00:00
|
|
|
|
2010-12-08 14:32:30 +00:00
|
|
|
#include "../../universalchardet/nsCharSetProber.h"
|
|
|
|
|
2010-05-28 07:40:21 +00:00
|
|
|
namespace agi {
|
|
|
|
namespace charset {
|
|
|
|
|
2012-11-13 06:44:33 -08:00
|
|
|
/// @brief Run character set detection over the contents of a file
/// @param file Path of the file to inspect; it is opened via io::Open
///
/// Populates `list` with (confidence, charset name) pairs:
///  - a single ("binary", 1.0) entry when the file is larger than 100 MB or
///    when more than one eighth of the bytes read so far are control
///    characters other than CR, LF and tab;
///  - a single (1.0, mDetectedCharset) entry when the detector settled on a
///    charset;
///  - a single (1.0, "US-ASCII") entry when the input state is ePureAscii;
///  - otherwise one entry per prober whose confidence exceeds 0.01, sorted by
///    descending confidence.
///
/// @throws UnknownCharset if the input state is neither eHighbyte nor
///         ePureAscii, or if no prober reported a confidence above 0.01
UCDetect::UCDetect(const std::string &file)
: nsUniversalDetector(NS_FILTER_ALL)
{
	{
		// Scoped so the stream is released before the results are evaluated
		agi::scoped_ptr<std::ifstream> stream(io::Open(file, true));

		// Anything over 100 MB is either binary or so large that nothing
		// useful could be done with it anyway
		stream->seekg(0, std::ios::end);
		const bool too_large = stream->tellg() > 100 * 1024 * 1024;
		if (too_large) {
			list.emplace_back(1.f, "binary");
			return;
		}
		stream->seekg(0, std::ios::beg);

		std::streamsize control_bytes = 0;
		std::streamsize total_bytes = 0;

		while (!mDone && *stream) {
			char chunk[4096];
			stream->read(chunk, sizeof(chunk));
			const std::streamsize got = stream->gcount();
			HandleData(chunk, static_cast<PRUint32>(got));

			// Once mDone is set the loop condition fails as well, so there is
			// no point in running the binary heuristic on this chunk
			if (mDone)
				break;

			// A dumb heuristic to detect binary files: count control
			// characters that do not normally appear in text
			total_bytes += got;
			for (const char *p = chunk; p != chunk + got; ++p) {
				const unsigned char c = static_cast<unsigned char>(*p);
				if (c < 32 && c != '\r' && c != '\n' && c != '\t')
					++control_bytes;
			}

			if (control_bytes > total_bytes / 8) {
				list.emplace_back(1.f, "binary");
				return;
			}
		}
	}

	DataEnd();

	// A definite answer needs no further ranking
	if (mDetectedCharset) {
		list.emplace_back(1.f, mDetectedCharset);
		return;
	}

	if (mInputState == ePureAscii)
		list.emplace_back(1.f, "US-ASCII");
	else if (mInputState == eHighbyte) {
		// Collect every prober that reports a non-negligible confidence
		for (PRInt32 idx = 0; idx < NUM_OF_CHARSET_PROBERS; ++idx) {
			if (mCharSetProbers[idx]) {
				const float confidence = mCharSetProbers[idx]->GetConfidence();
				if (confidence > 0.01f)
					list.emplace_back(confidence, mCharSetProbers[idx]->GetCharSetName());
			}
		}

		if (list.empty())
			throw UnknownCharset("Unknown character set.");
	}
	else
		throw UnknownCharset("Unknown character set.");

	// Most confident candidate first
	using entry = std::pair<float, std::string> const&;
	std::sort(list.begin(), list.end(), [](entry lhs, entry rhs) {
		return lhs.first > rhs.first;
	});
}
|
|
|
|
|
2012-11-13 06:44:33 -08:00
|
|
|
std::string UCDetect::Single() const {
|
|
|
|
return list.front().second;
|
|
|
|
}
|
2010-05-28 07:40:21 +00:00
|
|
|
|
|
|
|
} // namespace charset
|
|
|
|
} // namespace agi
|