From 47c678bd63c56df6142ad618a3926f442b43fb35 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Tue, 13 Nov 2012 06:44:33 -0800 Subject: [PATCH] Make UCDetect a little less insane --- aegisub/libaegisub/common/charset.cpp | 4 +- aegisub/libaegisub/common/charset_ucd.cpp | 50 ++++++++----------- aegisub/libaegisub/common/charset_ucd.h | 9 +--- .../libaegisub/include/libaegisub/charset.h | 9 ++-- aegisub/src/charset_detect.cpp | 2 +- 5 files changed, 31 insertions(+), 43 deletions(-) diff --git a/aegisub/libaegisub/common/charset.cpp b/aegisub/libaegisub/common/charset.cpp index 079fefee1..9dc1f92be 100644 --- a/aegisub/libaegisub/common/charset.cpp +++ b/aegisub/libaegisub/common/charset.cpp @@ -25,8 +25,8 @@ std::string Detect(const std::string &file) { return UCDetect(file).Single(); } -void DetectAll(const std::string& file, CharsetListDetected &list) { - UCDetect(file).List(list); +CharsetListDetected DetectAll(const std::string& file) { + return UCDetect(file).List(); } } // namespace util diff --git a/aegisub/libaegisub/common/charset_ucd.cpp b/aegisub/libaegisub/common/charset_ucd.cpp index c2a533f15..0cf6886f1 100644 --- a/aegisub/libaegisub/common/charset_ucd.cpp +++ b/aegisub/libaegisub/common/charset_ucd.cpp @@ -23,11 +23,14 @@ #include "../../universalchardet/nsCharSetProber.h" +#include namespace agi { namespace charset { -UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) { +UCDetect::UCDetect(const std::string &file) +: nsUniversalDetector(NS_FILTER_ALL) +{ { agi::scoped_ptr fp(io::Open(file, true)); @@ -35,7 +38,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) // be able to do anything useful with it anyway fp->seekg(0, std::ios::end); if (fp->tellg() > 100 * 1024 * 1024) { - list.insert(CLDPair(1.f, "binary")); + list.emplace_back(1.f, "binary"); return; } fp->seekg(0, std::ios::beg); @@ -58,7 +61,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) } if (binaryish > bytes / 8) { - list.insert(CLDPair(1.f, "binary")); + list.emplace_back(1.f, "binary"); return; } } @@ -67,49 +70,40 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) DataEnd(); - if (mDetectedCharset) { - list.insert(CLDPair(1.f, mDetectedCharset)); - } else { - + if (mDetectedCharset) + list.emplace_back(1.f, mDetectedCharset); + else { switch (mInputState) { case eHighbyte: { for (PRInt32 i=0; iGetConfidence(); - if (conf > 0.01f) { - list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName())); - } - } + if (!mCharSetProbers[i]) continue; + + float conf = mCharSetProbers[i]->GetConfidence(); + if (conf > 0.01f) + list.emplace_back(conf, mCharSetProbers[i]->GetCharSetName()); } break; } case ePureAscii: - list.insert(CLDPair(1.f, "US-ASCII")); + list.emplace_back(1.f, "US-ASCII"); break; default: - throw UnknownCharset("Unknown chararacter set."); + throw UnknownCharset("Unknown character set."); } if (list.empty() && (mInputState == eHighbyte)) - throw UnknownCharset("Unknown chararacter set."); + throw UnknownCharset("Unknown character set."); - - } // if mDetectedCharset else -} - -std::string UCDetect::Single() { - /// @todo Add a debug log here since this shouldn't happen. - if (list.empty()) { - throw UnknownCharset("Unknown chararacter set."); + typedef std::pair const& result; + boost::sort(list, [](result lft, result rgt) { return lft.first > rgt.first; }); } - - CharsetListDetected::const_iterator i_lst = list.begin(); - return i_lst->second; } - +std::string UCDetect::Single() const { + return list.front().second; +} } // namespace util } // namespace agi diff --git a/aegisub/libaegisub/common/charset_ucd.h b/aegisub/libaegisub/common/charset_ucd.h index ed7c92825..134d1e51d 100644 --- a/aegisub/libaegisub/common/charset_ucd.h +++ b/aegisub/libaegisub/common/charset_ucd.h @@ -34,10 +34,6 @@ namespace agi { namespace charset { class UCDetect : public nsUniversalDetector { - - /// For insertion into CharsetListDetected - typedef std::pair CLDPair; - /// List of detected character sets. CharsetListDetected list; @@ -51,12 +47,11 @@ public: UCDetect(const std::string &file); /// @brief Detect character set of a file using UniversalCharDet - /// @param out[out] Map to load list into ordered by confidence - void List(CharsetListDetected &out) { out = list; } + CharsetListDetected List() const { return list; } /// @brief Return a single character set (highest confidence) /// @return Character set - std::string Single(); + std::string Single() const; }; } // namespace util diff --git a/aegisub/libaegisub/include/libaegisub/charset.h b/aegisub/libaegisub/include/libaegisub/charset.h index a1c652c85..4c521538d 100644 --- a/aegisub/libaegisub/include/libaegisub/charset.h +++ b/aegisub/libaegisub/include/libaegisub/charset.h @@ -19,8 +19,8 @@ #ifndef LAGI_PRE #include #include -#include #include +#include #endif #include @@ -32,18 +32,17 @@ DEFINE_BASE_EXCEPTION_NOINNER(CharsetError, agi::Exception) DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown") /// List of detected encodings. -typedef std::map > CharsetListDetected; +typedef std::vector> CharsetListDetected; /// @brief Return a complete list of detected character sets ordered by precedence. /// @param file File to check -/// @param[out] list Map to load detected list into. -void DetectAll(const std::string &file, CharsetListDetected &list); +/// @return List of possible charsets sorted by probability +CharsetListDetected DetectAll(std::string const& file); /// @brief Returns the character set with the highest confidence /// @param file File to check /// @return Detected character set. std::string Detect(const std::string &file); - } // namespace util } // namespace agi diff --git a/aegisub/src/charset_detect.cpp b/aegisub/src/charset_detect.cpp index b48a851ce..9ecd766ec 100644 --- a/aegisub/src/charset_detect.cpp +++ b/aegisub/src/charset_detect.cpp @@ -55,7 +55,7 @@ wxString GetEncoding(wxString const& filename) { agi::charset::CharsetListDetected list; try { - agi::charset::DetectAll(from_wx(filename), list); + list = agi::charset::DetectAll(from_wx(filename)); } catch (const agi::charset::UnknownCharset&) { /// @todo If the charset is unknown we need to display a complete list of character sets. }