forked from mia/Aegisub
Add support for detecting character sets using UniversalChardet -- this supersedes character_detect.cpp.
Originally committed to SVN as r4367.
This commit is contained in:
parent
8dab221f8b
commit
e408fe49c9
6 changed files with 238 additions and 0 deletions
|
@ -20,6 +20,8 @@ DISTCLEANFILES += lagi_pre.h.gch
|
||||||
endif
|
endif
|
||||||
|
|
||||||
libaegisub_2_2_la_SOURCES = \
|
libaegisub_2_2_la_SOURCES = \
|
||||||
|
common/charset.cpp \
|
||||||
|
common/charset_ucd.cpp \
|
||||||
common/mru.cpp \
|
common/mru.cpp \
|
||||||
common/option.cpp \
|
common/option.cpp \
|
||||||
common/option_visit.cpp \
|
common/option_visit.cpp \
|
||||||
|
|
39
aegisub/libaegisub/common/charset.cpp
Normal file
39
aegisub/libaegisub/common/charset.cpp
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
//
|
||||||
|
// $Id$
|
||||||
|
|
||||||
|
/// @file charset.cpp
|
||||||
|
/// @brief Character set detection and manipulation utilities.
|
||||||
|
/// @ingroup libaegisub
|
||||||
|
|
||||||
|
|
||||||
|
#include "libaegisub/charset.h"
|
||||||
|
#include "charset_ucd.h"
|
||||||
|
|
||||||
|
namespace agi {
|
||||||
|
namespace charset {
|
||||||
|
|
||||||
|
const std::string Detect(const std::string file) {
|
||||||
|
UCDetect ucd(file);
|
||||||
|
return ucd.Single();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fill @a list with every character set the detector recorded for a file.
/// The detector is only needed long enough to copy its results out, so a
/// temporary is used.
void DetectAll(const std::string file, CharsetListDetected &list) {
	UCDetect(file).List(list);
}
|
||||||
|
|
||||||
|
} // namespace charset
|
||||||
|
} // namespace agi
|
87
aegisub/libaegisub/common/charset_ucd.cpp
Normal file
87
aegisub/libaegisub/common/charset_ucd.cpp
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
//
|
||||||
|
// $Id$
|
||||||
|
|
||||||
|
/// @file charset_ucd.cpp
|
||||||
|
/// @brief Character set detection using Universalchardet
|
||||||
|
/// @ingroup libaegisub
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef LAGI_PRE
|
||||||
|
#include "../../universalchardet/nsCharSetProber.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "libaegisub/charset.h"
|
||||||
|
#include "charset_ucd.h"
|
||||||
|
#include "libaegisub/io.h"
|
||||||
|
|
||||||
|
namespace agi {
|
||||||
|
namespace charset {
|
||||||
|
|
||||||
|
|
||||||
|
/// @brief Run Universalchardet over a file and record the results.
/// @param file File to check
///
/// The file is fed to the detector in 512-byte chunks until the detector
/// flags itself as done (mDone) or the stream can no longer be read.
/// If the detector settled on a charset (mDetectedCharset) that name is
/// stored in @c charset. Otherwise:
///  - high-byte input: every prober whose confidence exceeds 0.01 is added
///    to @c list, and the first entry of the list becomes @c charset;
///  - pure ASCII input: @c charset is set to "US-ASCII";
///  - any other input state is treated as unknown.
///
/// @throws UnknownCharset if no character set could be determined.
UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
	{
		// NOTE(review): assumes io::Open() returns a heap-allocated stream
		// owned by the caller -- confirm against libaegisub/io.h.
		std::ifstream *fp = io::Open(file);

		try {
			char buf[512];

			// Test good() rather than !eof(): a read failure that does not
			// set eofbit would otherwise keep this loop spinning forever.
			while (!mDone && fp->good()) {
				fp->read(buf, sizeof(buf));
				size_t bytes = fp->gcount();
				HandleData(buf, bytes);
			}
		} catch (...) {
			// Release the stream before propagating the error.
			delete fp;
			throw;
		}

		delete fp;
	}

	DataEnd();

	// The detector reached a definite answer; nothing else to do.
	if (mDetectedCharset) {
		charset.assign(mDetectedCharset);
		return;
	}

	switch (mInputState) {
		case eHighbyte: {
			// No definite answer: collect every prober that has at least
			// some confidence in its guess.
			for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
				if (!mCharSetProbers[i])
					continue;

				float conf = mCharSetProbers[i]->GetConfidence();
				if (conf > 0.01f)
					list.insert(std::pair<float, std::string>(conf, mCharSetProbers[i]->GetCharSetName()));
			}

			// The list is ordered by its comparator, so the first entry is
			// the preferred candidate.
			if (!list.empty())
				charset.assign(list.begin()->second);
			break;
		}

		case ePureAscii:
			charset.assign("US-ASCII");
			break;

		default:
			throw UnknownCharset("Unknown chararacter set.");
	}

	// Covers both "no prober was confident enough" (charset never assigned)
	// and a prober reporting an empty name.
	if (charset.empty())
		throw UnknownCharset("Unknown chararacter set.");
}
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace charset
|
||||||
|
} // namespace agi
|
57
aegisub/libaegisub/common/charset_ucd.h
Normal file
57
aegisub/libaegisub/common/charset_ucd.h
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
//
|
||||||
|
// $Id$
|
||||||
|
|
||||||
|
/// @file charset_ucd.h
|
||||||
|
/// @brief Character set detection using Universalchardet
|
||||||
|
/// @ingroup libaegisub
|
||||||
|
|
||||||
|
#ifndef LAGI_PRE
|
||||||
|
#include "../../universalchardet/nscore.h"
|
||||||
|
#include "../../universalchardet/nsUniversalDetector.h"
|
||||||
|
#include "../../universalchardet/nsMBCSGroupProber.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace agi {
|
||||||
|
namespace charset {
|
||||||
|
|
||||||
|
class UCDetect : public nsUniversalDetector {
|
||||||
|
|
||||||
|
/// Character set
|
||||||
|
std::string charset;
|
||||||
|
|
||||||
|
/// List of detected character sets.
|
||||||
|
CharsetListDetected list;
|
||||||
|
|
||||||
|
/// Stub.
|
||||||
|
void Report(const char* aCharset) {};
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
/// @brief Detect character set of a file using UniversalCharDetect
|
||||||
|
/// @param file File to check
|
||||||
|
UCDetect(const std::string file);
|
||||||
|
|
||||||
|
/// @brief Detect character set of a file using UniversalCharDet
|
||||||
|
/// @param out[out] Map to load list into ordered by confidence
|
||||||
|
void List(CharsetListDetected &out) { out = list; }
|
||||||
|
|
||||||
|
/// @brief Return a single character set (highest confidence)
|
||||||
|
/// @return Character set
|
||||||
|
std::string Single() { return charset; }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace charset
|
||||||
|
} // namespace agi
|
47
aegisub/libaegisub/include/libaegisub/charset.h
Normal file
47
aegisub/libaegisub/include/libaegisub/charset.h
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
//
|
||||||
|
// $Id$
|
||||||
|
|
||||||
|
/// @file charset.h
|
||||||
|
/// @brief Character set detection and manipulation utilities.
|
||||||
|
/// @ingroup libaegisub
|
||||||
|
|
||||||
|
#ifndef LAGI_PRE
|
||||||
|
#endif
|
||||||
|
#include <map>
|
||||||
|
#include <libaegisub/exception.h>
|
||||||
|
|
||||||
|
namespace agi {
|
||||||
|
namespace charset {
|
||||||
|
|
||||||
|
/// Base type for character set related errors.
/// NOTE(review): presumably the macro declares CharsetError as a subclass of
/// agi::Exception -- confirm in libaegisub/exception.h.
DEFINE_BASE_EXCEPTION_NOINNER(CharsetError, agi::Exception)
|
||||||
|
/// Thrown when the character set of a file cannot be determined.
/// NOTE(review): "charset/unknown" looks like the exception's identifying
/// name -- confirm in libaegisub/exception.h.
DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown")
|
||||||
|
|
||||||
|
/// List of detected encodings.
|
||||||
|
// Ordered highest confidence first. std::greater is a strict weak ordering,
// which associative containers require of their comparator (std::greater_equal
// is not one); the multimap keeps candidates that tie on confidence.
typedef std::multimap<float, std::string, std::greater<float> > CharsetListDetected;
|
||||||
|
|
||||||
|
/// @brief Return a complete list of detected character sets ordered by precedence.
|
||||||
|
/// @param file File to check
|
||||||
|
/// @param[out] list Map to load detected list into.
|
||||||
|
/// @throws UnknownCharset if no character set could be determined.
void DetectAll(const std::string file, CharsetListDetected &list);
|
||||||
|
|
||||||
|
/// @brief Returns the character set with the highest confidence
|
||||||
|
/// @param file File to check
|
||||||
|
/// @return Detected character set.
|
||||||
|
/// @throws UnknownCharset if no character set could be determined.
const std::string Detect(const std::string file);
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace charset
|
||||||
|
} // namespace agi
|
|
@ -30,3 +30,9 @@
|
||||||
#include "libaegisub/cajun/visitor.h"
|
#include "libaegisub/cajun/visitor.h"
|
||||||
#include "libaegisub/cajun/writer.h"
|
#include "libaegisub/cajun/writer.h"
|
||||||
|
|
||||||
|
// Universalchardet
|
||||||
|
#include "../universalchardet/nscore.h"
|
||||||
|
#include "../universalchardet/nsUniversalDetector.h"
|
||||||
|
#include "../universalchardet/nsMBCSGroupProber.h"
|
||||||
|
#include "../universalchardet/nsCharSetProber.h"
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue