Make UCDetect a little less insane

This commit is contained in:
Thomas Goyne 2012-11-13 06:44:33 -08:00
parent 2dd1da8333
commit 47c678bd63
5 changed files with 31 additions and 43 deletions

View file

@ -25,8 +25,8 @@ std::string Detect(const std::string &file) {
return UCDetect(file).Single(); return UCDetect(file).Single();
} }
void DetectAll(const std::string& file, CharsetListDetected &list) { CharsetListDetected DetectAll(const std::string& file) {
UCDetect(file).List(list); return UCDetect(file).List();
} }
} // namespace util } // namespace util

View file

@ -23,11 +23,14 @@
#include "../../universalchardet/nsCharSetProber.h" #include "../../universalchardet/nsCharSetProber.h"
#include <boost/range/algorithm.hpp>
namespace agi { namespace agi {
namespace charset { namespace charset {
UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) { UCDetect::UCDetect(const std::string &file)
: nsUniversalDetector(NS_FILTER_ALL)
{
{ {
agi::scoped_ptr<std::ifstream> fp(io::Open(file, true)); agi::scoped_ptr<std::ifstream> fp(io::Open(file, true));
@ -35,7 +38,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
// be able to do anything useful with it anyway // be able to do anything useful with it anyway
fp->seekg(0, std::ios::end); fp->seekg(0, std::ios::end);
if (fp->tellg() > 100 * 1024 * 1024) { if (fp->tellg() > 100 * 1024 * 1024) {
list.insert(CLDPair(1.f, "binary")); list.emplace_back(1.f, "binary");
return; return;
} }
fp->seekg(0, std::ios::beg); fp->seekg(0, std::ios::beg);
@ -58,7 +61,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
} }
if (binaryish > bytes / 8) { if (binaryish > bytes / 8) {
list.insert(CLDPair(1.f, "binary")); list.emplace_back(1.f, "binary");
return; return;
} }
} }
@ -67,49 +70,40 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
DataEnd(); DataEnd();
if (mDetectedCharset) { if (mDetectedCharset)
list.insert(CLDPair(1.f, mDetectedCharset)); list.emplace_back(1.f, mDetectedCharset);
} else { else {
switch (mInputState) { switch (mInputState) {
case eHighbyte: { case eHighbyte: {
for (PRInt32 i=0; i<NUM_OF_CHARSET_PROBERS; i++) { for (PRInt32 i=0; i<NUM_OF_CHARSET_PROBERS; i++) {
if (mCharSetProbers[i]) { if (!mCharSetProbers[i]) continue;
float conf = mCharSetProbers[i]->GetConfidence();
if (conf > 0.01f) { float conf = mCharSetProbers[i]->GetConfidence();
list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName())); if (conf > 0.01f)
} list.emplace_back(conf, mCharSetProbers[i]->GetCharSetName());
}
} }
break; break;
} }
case ePureAscii: case ePureAscii:
list.insert(CLDPair(1.f, "US-ASCII")); list.emplace_back(1.f, "US-ASCII");
break; break;
default: default:
throw UnknownCharset("Unknown chararacter set."); throw UnknownCharset("Unknown character set.");
} }
if (list.empty() && (mInputState == eHighbyte)) if (list.empty() && (mInputState == eHighbyte))
throw UnknownCharset("Unknown chararacter set."); throw UnknownCharset("Unknown character set.");
typedef std::pair<float, std::string> const& result;
} // if mDetectedCharset else boost::sort(list, [](result lft, result rgt) { return lft.first > rgt.first; });
}
std::string UCDetect::Single() {
/// @todo Add a debug log here since this shouldn't happen.
if (list.empty()) {
throw UnknownCharset("Unknown chararacter set.");
} }
CharsetListDetected::const_iterator i_lst = list.begin();
return i_lst->second;
} }
std::string UCDetect::Single() const {
return list.front().second;
}
} // namespace util } // namespace util
} // namespace agi } // namespace agi

View file

@ -34,10 +34,6 @@ namespace agi {
namespace charset { namespace charset {
class UCDetect : public nsUniversalDetector { class UCDetect : public nsUniversalDetector {
/// For insertion into CharsetListDetected
typedef std::pair<float, std::string> CLDPair;
/// List of detected character sets. /// List of detected character sets.
CharsetListDetected list; CharsetListDetected list;
@ -51,12 +47,11 @@ public:
UCDetect(const std::string &file); UCDetect(const std::string &file);
/// @brief Detect character set of a file using UniversalCharDet /// @brief Detect character set of a file using UniversalCharDet
/// @param out[out] Map to load list into ordered by confidence CharsetListDetected List() const { return list; }
void List(CharsetListDetected &out) { out = list; }
/// @brief Return a single character set (highest confidence) /// @brief Return a single character set (highest confidence)
/// @return Character set /// @return Character set
std::string Single(); std::string Single() const;
}; };
} // namespace util } // namespace util

View file

@ -19,8 +19,8 @@
#ifndef LAGI_PRE #ifndef LAGI_PRE
#include <fstream> #include <fstream>
#include <functional> #include <functional>
#include <map>
#include <string> #include <string>
#include <vector>
#endif #endif
#include <libaegisub/exception.h> #include <libaegisub/exception.h>
@ -32,18 +32,17 @@ DEFINE_BASE_EXCEPTION_NOINNER(CharsetError, agi::Exception)
DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown") DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown")
/// List of detected encodings. /// List of detected encodings.
typedef std::map<float, std::string, std::greater_equal<float> > CharsetListDetected; typedef std::vector<std::pair<float, std::string>> CharsetListDetected;
/// @brief Return a complete list of detected character sets ordered by precedence. /// @brief Return a complete list of detected character sets ordered by precedence.
/// @param file File to check /// @param file File to check
/// @param[out] list Map to load detected list into. /// @return List of possible charsets sorted by probability
void DetectAll(const std::string &file, CharsetListDetected &list); CharsetListDetected DetectAll(std::string const& file);
/// @brief Returns the character set with the highest confidence /// @brief Returns the character set with the highest confidence
/// @param file File to check /// @param file File to check
/// @return Detected character set. /// @return Detected character set.
std::string Detect(const std::string &file); std::string Detect(const std::string &file);
} // namespace util } // namespace util
} // namespace agi } // namespace agi

View file

@ -55,7 +55,7 @@ wxString GetEncoding(wxString const& filename) {
agi::charset::CharsetListDetected list; agi::charset::CharsetListDetected list;
try { try {
agi::charset::DetectAll(from_wx(filename), list); list = agi::charset::DetectAll(from_wx(filename));
} catch (const agi::charset::UnknownCharset&) { } catch (const agi::charset::UnknownCharset&) {
/// @todo If the charset is unknown we need to display a complete list of character sets. /// @todo If the charset is unknown we need to display a complete list of character sets.
} }