forked from mia/Aegisub
Make UCDetect a little less insane
This commit is contained in:
parent
2dd1da8333
commit
47c678bd63
|
@ -25,8 +25,8 @@ std::string Detect(const std::string &file) {
|
||||||
return UCDetect(file).Single();
|
return UCDetect(file).Single();
|
||||||
}
|
}
|
||||||
|
|
||||||
void DetectAll(const std::string& file, CharsetListDetected &list) {
|
CharsetListDetected DetectAll(const std::string& file) {
|
||||||
UCDetect(file).List(list);
|
return UCDetect(file).List();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
|
|
|
@ -23,11 +23,14 @@
|
||||||
|
|
||||||
#include "../../universalchardet/nsCharSetProber.h"
|
#include "../../universalchardet/nsCharSetProber.h"
|
||||||
|
|
||||||
|
#include <boost/range/algorithm.hpp>
|
||||||
|
|
||||||
namespace agi {
|
namespace agi {
|
||||||
namespace charset {
|
namespace charset {
|
||||||
|
|
||||||
UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL) {
|
UCDetect::UCDetect(const std::string &file)
|
||||||
|
: nsUniversalDetector(NS_FILTER_ALL)
|
||||||
|
{
|
||||||
{
|
{
|
||||||
agi::scoped_ptr<std::ifstream> fp(io::Open(file, true));
|
agi::scoped_ptr<std::ifstream> fp(io::Open(file, true));
|
||||||
|
|
||||||
|
@ -35,7 +38,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
|
||||||
// be able to do anything useful with it anyway
|
// be able to do anything useful with it anyway
|
||||||
fp->seekg(0, std::ios::end);
|
fp->seekg(0, std::ios::end);
|
||||||
if (fp->tellg() > 100 * 1024 * 1024) {
|
if (fp->tellg() > 100 * 1024 * 1024) {
|
||||||
list.insert(CLDPair(1.f, "binary"));
|
list.emplace_back(1.f, "binary");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
fp->seekg(0, std::ios::beg);
|
fp->seekg(0, std::ios::beg);
|
||||||
|
@ -58,7 +61,7 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (binaryish > bytes / 8) {
|
if (binaryish > bytes / 8) {
|
||||||
list.insert(CLDPair(1.f, "binary"));
|
list.emplace_back(1.f, "binary");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -67,49 +70,40 @@ UCDetect::UCDetect(const std::string &file): nsUniversalDetector(NS_FILTER_ALL)
|
||||||
|
|
||||||
DataEnd();
|
DataEnd();
|
||||||
|
|
||||||
if (mDetectedCharset) {
|
if (mDetectedCharset)
|
||||||
list.insert(CLDPair(1.f, mDetectedCharset));
|
list.emplace_back(1.f, mDetectedCharset);
|
||||||
} else {
|
else {
|
||||||
|
|
||||||
switch (mInputState) {
|
switch (mInputState) {
|
||||||
case eHighbyte: {
|
case eHighbyte: {
|
||||||
for (PRInt32 i=0; i<NUM_OF_CHARSET_PROBERS; i++) {
|
for (PRInt32 i=0; i<NUM_OF_CHARSET_PROBERS; i++) {
|
||||||
if (mCharSetProbers[i]) {
|
if (!mCharSetProbers[i]) continue;
|
||||||
float conf = mCharSetProbers[i]->GetConfidence();
|
|
||||||
if (conf > 0.01f) {
|
float conf = mCharSetProbers[i]->GetConfidence();
|
||||||
list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName()));
|
if (conf > 0.01f)
|
||||||
}
|
list.emplace_back(conf, mCharSetProbers[i]->GetCharSetName());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ePureAscii:
|
case ePureAscii:
|
||||||
list.insert(CLDPair(1.f, "US-ASCII"));
|
list.emplace_back(1.f, "US-ASCII");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw UnknownCharset("Unknown chararacter set.");
|
throw UnknownCharset("Unknown character set.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (list.empty() && (mInputState == eHighbyte))
|
if (list.empty() && (mInputState == eHighbyte))
|
||||||
throw UnknownCharset("Unknown chararacter set.");
|
throw UnknownCharset("Unknown character set.");
|
||||||
|
|
||||||
|
typedef std::pair<float, std::string> const& result;
|
||||||
} // if mDetectedCharset else
|
boost::sort(list, [](result lft, result rgt) { return lft.first > rgt.first; });
|
||||||
}
|
|
||||||
|
|
||||||
std::string UCDetect::Single() {
|
|
||||||
/// @todo Add a debug log here since this shouldn't happen.
|
|
||||||
if (list.empty()) {
|
|
||||||
throw UnknownCharset("Unknown chararacter set.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
CharsetListDetected::const_iterator i_lst = list.begin();
|
|
||||||
return i_lst->second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string UCDetect::Single() const {
|
||||||
|
return list.front().second;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
} // namespace agi
|
} // namespace agi
|
||||||
|
|
|
@ -34,10 +34,6 @@ namespace agi {
|
||||||
namespace charset {
|
namespace charset {
|
||||||
|
|
||||||
class UCDetect : public nsUniversalDetector {
|
class UCDetect : public nsUniversalDetector {
|
||||||
|
|
||||||
/// For insertion into CharsetListDetected
|
|
||||||
typedef std::pair<float, std::string> CLDPair;
|
|
||||||
|
|
||||||
/// List of detected character sets.
|
/// List of detected character sets.
|
||||||
CharsetListDetected list;
|
CharsetListDetected list;
|
||||||
|
|
||||||
|
@ -51,12 +47,11 @@ public:
|
||||||
UCDetect(const std::string &file);
|
UCDetect(const std::string &file);
|
||||||
|
|
||||||
/// @brief Detect character set of a file using UniversalCharDet
|
/// @brief Detect character set of a file using UniversalCharDet
|
||||||
/// @param out[out] Map to load list into ordered by confidence
|
CharsetListDetected List() const { return list; }
|
||||||
void List(CharsetListDetected &out) { out = list; }
|
|
||||||
|
|
||||||
/// @brief Return a single character set (highest confidence)
|
/// @brief Return a single character set (highest confidence)
|
||||||
/// @return Character set
|
/// @return Character set
|
||||||
std::string Single();
|
std::string Single() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
|
|
|
@ -19,8 +19,8 @@
|
||||||
#ifndef LAGI_PRE
|
#ifndef LAGI_PRE
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <map>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
#endif
|
#endif
|
||||||
#include <libaegisub/exception.h>
|
#include <libaegisub/exception.h>
|
||||||
|
|
||||||
|
@ -32,18 +32,17 @@ DEFINE_BASE_EXCEPTION_NOINNER(CharsetError, agi::Exception)
|
||||||
DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown")
|
DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown")
|
||||||
|
|
||||||
/// List of detected encodings.
|
/// List of detected encodings.
|
||||||
typedef std::map<float, std::string, std::greater_equal<float> > CharsetListDetected;
|
typedef std::vector<std::pair<float, std::string>> CharsetListDetected;
|
||||||
|
|
||||||
/// @brief Return a complete list of detected character sets ordered by precedence.
|
/// @brief Return a complete list of detected character sets ordered by precedence.
|
||||||
/// @param file File to check
|
/// @param file File to check
|
||||||
/// @param[out] list Map to load detected list into.
|
/// @return List of possible charsets sorted by probability
|
||||||
void DetectAll(const std::string &file, CharsetListDetected &list);
|
CharsetListDetected DetectAll(std::string const& file);
|
||||||
|
|
||||||
/// @brief Returns the character set with the highest confidence
|
/// @brief Returns the character set with the highest confidence
|
||||||
/// @param file File to check
|
/// @param file File to check
|
||||||
/// @return Detected character set.
|
/// @return Detected character set.
|
||||||
std::string Detect(const std::string &file);
|
std::string Detect(const std::string &file);
|
||||||
|
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
} // namespace agi
|
} // namespace agi
|
||||||
|
|
|
@ -55,7 +55,7 @@ wxString GetEncoding(wxString const& filename) {
|
||||||
agi::charset::CharsetListDetected list;
|
agi::charset::CharsetListDetected list;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
agi::charset::DetectAll(from_wx(filename), list);
|
list = agi::charset::DetectAll(from_wx(filename));
|
||||||
} catch (const agi::charset::UnknownCharset&) {
|
} catch (const agi::charset::UnknownCharset&) {
|
||||||
/// @todo If the charset is unknown we need to display a complete list of character sets.
|
/// @todo If the charset is unknown we need to display a complete list of character sets.
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue