diff --git a/aegisub/libaegisub/Makefile.am b/aegisub/libaegisub/Makefile.am index af8c1cb15..b788abaf7 100644 --- a/aegisub/libaegisub/Makefile.am +++ b/aegisub/libaegisub/Makefile.am @@ -20,6 +20,8 @@ DISTCLEANFILES += lagi_pre.h.gch endif libaegisub_2_2_la_SOURCES = \ + common/charset.cpp \ + common/charset_ucd.cpp \ common/mru.cpp \ common/option.cpp \ common/option_visit.cpp \ diff --git a/aegisub/libaegisub/common/charset.cpp b/aegisub/libaegisub/common/charset.cpp new file mode 100644 index 000000000..9a5fefbea --- /dev/null +++ b/aegisub/libaegisub/common/charset.cpp @@ -0,0 +1,39 @@ +// Copyright (c) 2010, Amar Takhar +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset.cpp +/// @brief Character set detection and manipulation utilities. 
+/// @ingroup libaegisub
+
+
+#include "libaegisub/charset.h"
+#include "charset_ucd.h"
+
+namespace agi {
+	namespace charset {
+
+/// Run detection on @p file and hand back the detector's single best guess.
+const std::string Detect(const std::string file) {
+	return UCDetect(file).Single();
+}
+
+/// Run detection on @p file and copy the detector's full result list into @p list.
+void DetectAll(const std::string file, CharsetListDetected &list) {
+	UCDetect(file).List(list);
+}
+
+	} // namespace charset
+} // namespace agi
diff --git a/aegisub/libaegisub/common/charset_ucd.cpp b/aegisub/libaegisub/common/charset_ucd.cpp
new file mode 100644
index 000000000..57d841402
--- /dev/null
+++ b/aegisub/libaegisub/common/charset_ucd.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2010, Amar Takhar
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file charset_ucd.cpp
+/// @brief Character set detection using Universalchardet
+/// @ingroup libaegisub
+
+
+#ifndef LAGI_PRE
+#include "../../universalchardet/nsCharSetProber.h"
+#endif
+
+#include "libaegisub/charset.h"
+#include "charset_ucd.h"
+#include "libaegisub/io.h"
+
+namespace agi {
+	namespace charset {
+
+/// @brief Run the detector over a file and record what it finds.
+/// @param file File to check
+/// @throws UnknownCharset if no character set could be determined.
+UCDetect::UCDetect(const std::string file): nsUniversalDetector(NS_FILTER_ALL) {
+	{
+		// NOTE(review): assumes io::Open() hands ownership of the stream to the caller -- confirm in libaegisub/io.h.
+		std::ifstream *fp = io::Open(file);
+
+		// Loop on good() rather than !eof() so a read error that never reaches EOF cannot spin forever.
+		while (!mDone && fp->good()) {
+			char buf[512];
+			fp->read(buf, sizeof(buf));
+			size_t bytes = fp->gcount();
+			HandleData(buf, bytes);
+		}
+
+		delete fp;
+	}
+
+	DataEnd();
+
+	if (mDetectedCharset) {
+		charset.assign(mDetectedCharset);
+	} else {
+		switch (mInputState) {
+			case eHighbyte: {
+				// No outright winner: collect every prober that is at least marginally confident.
+				for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
+					if (!mCharSetProbers[i]) continue;
+					float conf = mCharSetProbers[i]->GetConfidence();
+					if (conf > 0.01f)
+						list.insert(CharsetListDetected::value_type(conf, mCharSetProbers[i]->GetCharSetName()));
+				}
+
+				// The first entry of the list is taken as the best candidate.
+				if (!list.empty())
+					charset.assign(list.begin()->second);
+				break;
+			}
+			case ePureAscii:
+				charset.assign("US-ASCII");
+				break;
+
+			default:
+				throw UnknownCharset("Unknown chararacter set.");
+		}
+
+		if ((list.empty() && (mInputState == eHighbyte)) || charset.empty())
+			throw UnknownCharset("Unknown chararacter set.");
+	} // if mDetectedCharset else
+}
+
+
+	} // namespace charset
+} // namespace agi
diff --git a/aegisub/libaegisub/common/charset_ucd.h b/aegisub/libaegisub/common/charset_ucd.h
new file mode 100644
index 000000000..496516467
--- /dev/null
+++ b/aegisub/libaegisub/common/charset_ucd.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2010, Amar Takhar
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file charset_ucd.h
+/// @brief Character set detection using Universalchardet
+/// @ingroup libaegisub
+
+#ifndef LAGI_PRE
+#include "../../universalchardet/nscore.h"
+#include "../../universalchardet/nsUniversalDetector.h"
+#include "../../universalchardet/nsMBCSGroupProber.h"
+#endif
+
+namespace agi {
+	namespace charset {
+
+/// Runs Universalchardet over a file on construction and keeps the result for later retrieval.
+class UCDetect : public nsUniversalDetector {
+	/// Character set
+	std::string charset;
+
+	/// List of detected character sets.
+	CharsetListDetected list;
+
+	/// Stub. NOTE(review): presumably satisfies a virtual hook of nsUniversalDetector -- confirm.
+	void Report(const char* aCharset) {}
+
+public:
+	/// @brief Detect character set of a file using UniversalCharDetect
+	/// @param file File to check
+	/// @throws UnknownCharset if no character set could be determined.
+	UCDetect(const std::string file);
+
+	/// @brief Copy out every character set that was detected.
+	/// @param[out] out Map to load list into ordered by confidence
+	void List(CharsetListDetected &out) const { out = list; }
+
+	/// @brief Return a single character set (highest confidence)
+	/// @return Character set
+	std::string Single() const { return charset; }
+};
+
+	} // namespace charset
+} // namespace agi
diff --git a/aegisub/libaegisub/include/libaegisub/charset.h b/aegisub/libaegisub/include/libaegisub/charset.h
new file mode 100644
index 000000000..71c9cb169
--- /dev/null
+++ b/aegisub/libaegisub/include/libaegisub/charset.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2010, Amar Takhar
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file charset.h
+/// @brief Character set detection and manipulation utilities.
+/// @ingroup libaegisub
+
+#ifndef LAGI_PRE
+#endif
+#include
+#include
+
+namespace agi {
+	namespace charset {
+
+DEFINE_BASE_EXCEPTION_NOINNER(CharsetError, agi::Exception)
+DEFINE_SIMPLE_EXCEPTION_NOINNER(UnknownCharset, CharsetError, "charset/unknown")
+
+/// List of detected encodings.
+typedef std::map<float, std::string, std::greater<float> > CharsetListDetected;
+
+/// @brief Return a complete list of detected character sets ordered by precedence.
+/// @param file File to check
+/// @param[out] list Map to load detected list into.
+void DetectAll(const std::string file, CharsetListDetected &list);
+
+/// @brief Returns the character set with the highest confidence
+/// @param file File to check
+/// @return Detected character set.
+const std::string Detect(const std::string file);
+
+
+	} // namespace charset
+} // namespace agi
diff --git a/aegisub/libaegisub/lagi_pre.h b/aegisub/libaegisub/lagi_pre.h
index 9e4155a8b..04ae7c4b6 100644
--- a/aegisub/libaegisub/lagi_pre.h
+++ b/aegisub/libaegisub/lagi_pre.h
@@ -30,3 +30,9 @@
 #include "libaegisub/cajun/visitor.h"
 #include "libaegisub/cajun/writer.h"
 
+// Universalchardet
+#include "../universalchardet/nscore.h"
+#include "../universalchardet/nsUniversalDetector.h"
+#include "../universalchardet/nsMBCSGroupProber.h"
+#include "../universalchardet/nsCharSetProber.h"
+