2010-05-28 09:40:21 +02:00
|
|
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
|
|
// copyright notice and this permission notice appear in all copies.
|
|
|
|
//
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
|
|
|
|
/// @file charset.cpp
|
|
|
|
/// @brief Character set detection and manipulation utilities.
|
|
|
|
/// @ingroup libaegisub
|
|
|
|
|
2013-01-04 16:01:50 +01:00
|
|
|
#include "libaegisub/charset.h"
|
2010-05-28 09:40:21 +02:00
|
|
|
|
2014-03-21 16:06:41 +01:00
|
|
|
#include "libaegisub/file_mapping.h"
|
2010-05-28 09:40:21 +02:00
|
|
|
|
2013-01-04 16:01:50 +01:00
|
|
|
#include <algorithm>
#include <string>
|
|
|
|
|
|
|
|
#ifndef _WIN32
|
|
|
|
#define _X86_ 1
|
|
|
|
#endif
|
|
|
|
|
2014-03-10 18:22:28 +01:00
|
|
|
#include "../../vendor/universalchardet/nscore.h"
|
|
|
|
#include "../../vendor/universalchardet/nsUniversalDetector.h"
|
|
|
|
#include "../../vendor/universalchardet/nsMBCSGroupProber.h"
|
|
|
|
#include "../../vendor/universalchardet/nsCharSetProber.h"
|
2013-01-04 16:01:50 +01:00
|
|
|
|
|
|
|
namespace {
|
|
|
|
using namespace agi::charset;
|
|
|
|
|
2014-03-13 02:39:07 +01:00
|
|
|
class UCDetect final : public nsUniversalDetector {
|
2013-01-04 16:01:50 +01:00
|
|
|
/// List of detected character sets
|
|
|
|
CharsetListDetected list;
|
|
|
|
|
2013-11-21 18:13:36 +01:00
|
|
|
void Report(const char*) override {}
|
2013-01-04 16:01:50 +01:00
|
|
|
|
|
|
|
public:
|
|
|
|
/// @brief Detect character set of a file using UniversalCharDetect
|
|
|
|
/// @param file File to check
|
|
|
|
UCDetect(agi::fs::path const& file)
|
|
|
|
: nsUniversalDetector(NS_FILTER_ALL)
|
|
|
|
{
|
|
|
|
{
|
2014-03-21 16:06:41 +01:00
|
|
|
agi::read_file_mapping fp(file);
|
2013-01-04 16:01:50 +01:00
|
|
|
|
|
|
|
// If it's over 100 MB it's either binary or big enough that we won't
|
|
|
|
// be able to do anything useful with it anyway
|
2014-03-21 16:06:41 +01:00
|
|
|
if (fp.size() > 100 * 1024 * 1024) {
|
2013-01-04 16:01:50 +01:00
|
|
|
list.emplace_back(1.f, "binary");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-03-21 16:06:41 +01:00
|
|
|
uint64_t binaryish = 0;
|
|
|
|
for (uint64_t offset = 0; !mDone && offset < fp.size(); ) {
|
|
|
|
auto read = std::min<uint64_t>(4096, fp.size() - offset);
|
|
|
|
auto buf = fp.read(offset, read);
|
2013-01-04 16:01:50 +01:00
|
|
|
HandleData(buf, (PRUint32)read);
|
2014-03-21 16:06:41 +01:00
|
|
|
offset += read;
|
2010-05-28 09:40:21 +02:00
|
|
|
|
2013-01-04 16:01:50 +01:00
|
|
|
// A dumb heuristic to detect binary files
|
|
|
|
if (!mDone) {
|
2014-03-21 16:06:41 +01:00
|
|
|
for (size_t i = 0; i < read; ++i) {
|
2013-01-04 16:01:50 +01:00
|
|
|
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
|
|
|
|
++binaryish;
|
|
|
|
}
|
|
|
|
|
2014-03-21 16:06:41 +01:00
|
|
|
if (binaryish > offset / 8) {
|
2013-01-04 16:01:50 +01:00
|
|
|
list.emplace_back(1.f, "binary");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
DataEnd();
|
|
|
|
|
|
|
|
if (mDetectedCharset)
|
|
|
|
list.emplace_back(1.f, mDetectedCharset);
|
|
|
|
else {
|
|
|
|
switch (mInputState) {
|
2013-11-21 18:13:36 +01:00
|
|
|
case eHighbyte:
|
|
|
|
for (auto& elem : mCharSetProbers) {
|
|
|
|
if (!elem) continue;
|
2013-01-04 16:01:50 +01:00
|
|
|
|
2013-11-21 18:13:36 +01:00
|
|
|
float conf = elem->GetConfidence();
|
2013-01-04 16:01:50 +01:00
|
|
|
if (conf > 0.01f)
|
2013-11-21 18:13:36 +01:00
|
|
|
list.emplace_back(conf, elem->GetCharSetName());
|
2013-01-04 16:01:50 +01:00
|
|
|
}
|
|
|
|
break;
|
2013-11-21 18:13:36 +01:00
|
|
|
|
2013-01-04 16:01:50 +01:00
|
|
|
case ePureAscii:
|
|
|
|
list.emplace_back(1.f, "US-ASCII");
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
throw UnknownCharset("Unknown character set.");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (list.empty() && (mInputState == eHighbyte))
|
|
|
|
throw UnknownCharset("Unknown character set.");
|
|
|
|
|
|
|
|
typedef std::pair<float, std::string> const& result;
|
|
|
|
sort(begin(list), end(list), [](result lft, result rgt) { return lft.first > rgt.first; });
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// @brief Detect character set of a file using UniversalCharDet
|
|
|
|
CharsetListDetected List() const { return list; }
|
|
|
|
};
|
2010-05-28 09:40:21 +02:00
|
|
|
}
|
|
|
|
|
2013-01-04 16:01:50 +01:00
|
|
|
namespace agi { namespace charset {
|
|
|
|
/// @brief Detect the character set of a file
/// @param file File to check
/// @return Name stored in the first entry of the list produced by DetectAll()
///
/// Relies on DetectAll() never returning an empty list.
std::string Detect(agi::fs::path const& file) {
	auto const candidates = DetectAll(file);
	return candidates.front().second;
}
|
|
|
|
|
|
|
|
/// @brief Get every candidate character set for a file
/// @param file File to check
/// @return The list built by running UCDetect on the file
CharsetListDetected DetectAll(agi::fs::path const& file) {
	UCDetect const detector(file);
	return detector.List();
}
|
|
|
|
} }
|