340cc1d7f1
Originally committed to SVN as r4504.
97 lines
2.4 KiB
C++
97 lines
2.4 KiB
C++
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
|
//
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
// copyright notice and this permission notice appear in all copies.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
//
|
|
// $Id$
|
|
|
|
/// @file charset_ucd.cpp
|
|
/// @brief Character set detection using Universalchardet
|
|
/// @ingroup libaegisub
|
|
|
|
|
|
#ifndef LAGI_PRE
|
|
#include <fstream>
|
|
#include <string>
|
|
#include <map>
|
|
|
|
#include "../../universalchardet/nsCharSetProber.h"
|
|
#endif
|
|
|
|
#include "libaegisub/charset.h"
|
|
#include "charset_ucd.h"
|
|
#include "libaegisub/io.h"
|
|
|
|
namespace agi {
|
|
namespace charset {
|
|
|
|
/// @brief Detect the character set of a file using Universalchardet.
/// @param file Name of the file to examine; it is handed to io::Open.
/// @throws UnknownCharset if the detector ends in a state for which no
///         candidate character set can be reported.
///
/// The file is fed to the detector in 512-byte chunks until the detector
/// signals it is done or the stream can no longer be read.  Afterwards the
/// `list` member is filled with (confidence, charset name) pairs:
///  - a single entry with confidence 1 when the detector settled on a charset,
///  - one entry per prober whose confidence exceeds 0.01 for high-byte input,
///  - a single "US-ASCII" entry for pure ASCII input.
UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
	{
		// Releases the stream on every exit path from this scope, including
		// exceptions.  Previously the stream was never deleted.
		// NOTE(review): assumes io::Open hands ownership of a heap-allocated
		// stream to the caller -- confirm against the io::Open declaration.
		struct StreamOwner {
			std::ifstream *stream;
			explicit StreamOwner(std::ifstream *s): stream(s) {}
			~StreamOwner() { delete stream; }
		private:
			// Non-copyable: exactly one owner may delete the stream.
			StreamOwner(const StreamOwner&);
			StreamOwner& operator=(const StreamOwner&);
		};

		StreamOwner fp(io::Open(file));
		char buf[512];

		// Loop on good() rather than !eof(): a stream that fails without
		// reaching EOF (failbit/badbit) would otherwise never terminate.
		while (!mDone && fp.stream->good()) {
			fp.stream->read(buf, sizeof(buf));
			// A short final read sets eofbit but still delivers gcount() bytes.
			size_t bytes = fp.stream->gcount();
			HandleData(buf, bytes);
		}
	}

	DataEnd();

	if (mDetectedCharset) {
		// The detector is certain; report that charset with full confidence.
		list.insert(CLDPair(1, mDetectedCharset));
	} else {

		switch (mInputState) {
			case eHighbyte: {
				// No definite answer: collect every prober that has a
				// non-negligible confidence in its own guess.
				for (PRInt32 i=0; i<NUM_OF_CHARSET_PROBERS; i++) {
					if (mCharSetProbers[i]) {
						float conf = mCharSetProbers[i]->GetConfidence();
						if (conf > 0.01f) {
							list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName()));
						}
					}
				}

				// High-byte data was seen but no prober produced a usable guess.
				if (list.empty())
					throw UnknownCharset("Unknown character set.");

				break;
			}
			case ePureAscii:
				list.insert(CLDPair(1, "US-ASCII"));
				break;

			default:
				throw UnknownCharset("Unknown character set.");
		}

	} // if mDetectedCharset else
}
|
|
|
|
std::string UCDetect::Single() {
|
|
/// @todo Add a debug log here since this shouldn't happen.
|
|
if (list.empty()) {
|
|
throw UnknownCharset("Unknown chararacter set.");
|
|
}
|
|
|
|
CharsetListDetected::const_iterator i_lst = list.begin();
|
|
return i_lst->second;
|
|
}
|
|
|
|
|
|
|
|
} // namespace charset
|
|
} // namespace agi
|