Don't bother storing a single charset in a std::string; instead, insert it into the std::map and fix Single() to return the first element. This keeps things simple and also ensures that DetectAll() will always return at least one element, which wasn't being done before.
Originally committed to SVN as r4369.
This commit is contained in:
parent
6736f5e292
commit
9d854b69f3
2 changed files with 18 additions and 12 deletions
|
@ -30,7 +30,6 @@
|
||||||
namespace agi {
|
namespace agi {
|
||||||
namespace charset {
|
namespace charset {
|
||||||
|
|
||||||
|
|
||||||
UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
|
UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
|
||||||
{
|
{
|
||||||
std::ifstream *fp;
|
std::ifstream *fp;
|
||||||
|
@ -47,7 +46,7 @@ UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
|
||||||
DataEnd();
|
DataEnd();
|
||||||
|
|
||||||
if (mDetectedCharset) {
|
if (mDetectedCharset) {
|
||||||
charset.assign(mDetectedCharset);
|
list.insert(CLDPair(1, mDetectedCharset));
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
switch (mInputState) {
|
switch (mInputState) {
|
||||||
|
@ -56,32 +55,39 @@ UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
|
||||||
if (mCharSetProbers[i]) {
|
if (mCharSetProbers[i]) {
|
||||||
float conf = mCharSetProbers[i]->GetConfidence();
|
float conf = mCharSetProbers[i]->GetConfidence();
|
||||||
if (conf > 0.01f) {
|
if (conf > 0.01f) {
|
||||||
list.insert(std::pair<float, std::string>(conf, mCharSetProbers[i]->GetCharSetName()));
|
list.insert(CLDPair(conf, mCharSetProbers[i]->GetCharSetName()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!list.empty()) {
|
|
||||||
CharsetListDetected::const_iterator i_lst = list.begin();
|
|
||||||
charset.assign(i_lst->second);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ePureAscii:
|
case ePureAscii:
|
||||||
charset.assign("US-ASCII");
|
list.insert(CLDPair(1, "US-ASCII"));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw UnknownCharset("Unknown chararacter set.");
|
throw UnknownCharset("Unknown chararacter set.");
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((list.empty() && (mInputState == eHighbyte)) || charset.empty())
|
if (list.empty() && (mInputState == eHighbyte))
|
||||||
throw UnknownCharset("Unknown chararacter set.");
|
throw UnknownCharset("Unknown chararacter set.");
|
||||||
|
|
||||||
|
|
||||||
} // if mDetectedCharset else
|
} // if mDetectedCharset else
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string UCDetect::Single() {
|
||||||
|
/// @todo Add a debug log here since this shouldn't happen.
|
||||||
|
if (list.empty()) {
|
||||||
|
throw UnknownCharset("Unknown chararacter set.");
|
||||||
|
}
|
||||||
|
|
||||||
|
CharsetListDetected::const_iterator i_lst = list.begin();
|
||||||
|
return i_lst->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
} // namespace agi
|
} // namespace agi
|
||||||
|
|
|
@ -29,8 +29,8 @@ namespace agi {
|
||||||
|
|
||||||
class UCDetect : public nsUniversalDetector {
|
class UCDetect : public nsUniversalDetector {
|
||||||
|
|
||||||
/// Character set
|
/// For insertion into CharsetListDetected
|
||||||
std::string charset;
|
typedef std::pair<float, std::string> CLDPair;
|
||||||
|
|
||||||
/// List of detected character sets.
|
/// List of detected character sets.
|
||||||
CharsetListDetected list;
|
CharsetListDetected list;
|
||||||
|
@ -50,7 +50,7 @@ public:
|
||||||
|
|
||||||
/// @brief Return a single character set (highest confidence)
|
/// @brief Return a single character set (highest confidence)
|
||||||
/// @return Character set
|
/// @return Character set
|
||||||
std::string Single() { return charset; }
|
std::string Single();
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace util
|
} // namespace util
|
||||||
|
|
Loading…
Reference in a new issue