From b6d29443a3f67d4f2d2dc9e4efeb94ef761e4105 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Thu, 3 Jun 2010 20:32:25 +0000 Subject: [PATCH] Move most character set conversion code to libaegisub and make everything use the new conversion functionality. Originally committed to SVN as r4423. --- .../libaegisub_vs2008.vcproj | 21 +- .../build/tests_vs2008/tests_vs2008.vcproj | 4 + aegisub/libaegisub/Makefile.am | 1 + aegisub/libaegisub/common/charset_conv.cpp | 327 ++++++++++++++ .../include/libaegisub/charset_conv.h | 107 +++++ .../include/libaegisub/charset_conv_win.h | 29 ++ .../include/libaegisub/charsets.def | 116 +++++ aegisub/libaegisub/include/libaegisub/io.h | 6 +- aegisub/libaegisub/lagi_pre.h | 1 + aegisub/libaegisub/windows/access.cpp | 8 +- .../libaegisub/windows/charset_conv_win.cpp | 49 +++ aegisub/libaegisub/windows/io.cpp | 17 +- aegisub/libaegisub/windows/util.cpp | 20 +- aegisub/src/charset_conv.cpp | 416 +----------------- aegisub/src/charset_conv.h | 117 +---- aegisub/src/compat.cpp | 4 +- aegisub/src/compat.h | 4 +- aegisub/src/dialog_export.cpp | 8 +- aegisub/src/frame_main.cpp | 6 +- aegisub/src/frame_main_events.cpp | 13 +- aegisub/src/hotkeys.cpp | 2 +- aegisub/src/main.cpp | 4 + aegisub/src/preferences.cpp | 3 +- aegisub/src/spellchecker_hunspell.cpp | 97 ++-- aegisub/src/spellchecker_hunspell.h | 8 +- aegisub/src/text_file_reader.cpp | 29 +- aegisub/src/text_file_reader.h | 19 +- aegisub/src/text_file_writer.cpp | 21 +- aegisub/src/text_file_writer.h | 11 +- aegisub/src/video_provider_manager.cpp | 4 +- aegisub/tests/Makefile.am | 1 + aegisub/tests/libaegisub_iconv.cpp | 138 ++++++ 32 files changed, 967 insertions(+), 644 deletions(-) create mode 100644 aegisub/libaegisub/common/charset_conv.cpp create mode 100644 aegisub/libaegisub/include/libaegisub/charset_conv.h create mode 100644 aegisub/libaegisub/include/libaegisub/charset_conv_win.h create mode 100644 aegisub/libaegisub/include/libaegisub/charsets.def create mode 100644 aegisub/libaegisub/windows/charset_conv_win.cpp create mode 100644 aegisub/tests/libaegisub_iconv.cpp diff --git a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj index ca206a97d..c31455340 100644 --- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj +++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj @@ -20,6 +20,7 @@ + + @@ -303,6 +308,18 @@ RelativePath="..\..\libaegisub\include\libaegisub\access.h" > + + + + + + diff --git a/aegisub/build/tests_vs2008/tests_vs2008.vcproj b/aegisub/build/tests_vs2008/tests_vs2008.vcproj index 0d26ac173..34ad939ed 100644 --- a/aegisub/build/tests_vs2008/tests_vs2008.vcproj +++ b/aegisub/build/tests_vs2008/tests_vs2008.vcproj @@ -228,6 +228,10 @@ RelativePath="..\..\tests\libaegisub_cajun.cpp" > + + diff --git a/aegisub/libaegisub/Makefile.am b/aegisub/libaegisub/Makefile.am index 415962123..796edd872 100644 --- a/aegisub/libaegisub/Makefile.am +++ b/aegisub/libaegisub/Makefile.am @@ -21,6 +21,7 @@ endif libaegisub_2_2_la_SOURCES = \ common/charset.cpp \ + common/charset_conv.cpp \ common/charset_ucd.cpp \ common/mru.cpp \ common/option.cpp \ diff --git a/aegisub/libaegisub/common/charset_conv.cpp b/aegisub/libaegisub/common/charset_conv.cpp new file mode 100644 index 000000000..7fb1718d1 --- /dev/null +++ b/aegisub/libaegisub/common/charset_conv.cpp @@ -0,0 +1,327 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset_conv.cpp +/// @brief Wrapper for libiconv to present a more C++-friendly API +/// @ingroup libaegisub + +#ifndef LAGI_PRE +#endif + +#include +#include + +// Check if we can use advanced fallback capabilities added in GNU's iconv +// implementation +#if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG) +#define ICONV_POSIX +#endif + +static const iconv_t iconv_invalid = (iconv_t)-1; +static const size_t iconv_failed = (size_t)-1; + +namespace { + struct ltstr { + bool operator()(const char* s1, const char* s2) const { + return strcmp(s1, s2) < 0; + } + }; +} + +/// @brief Map a user-friendly encoding name to the real encoding name +static const char* GetRealEncodingName(const char* name) { + static std::map prettyNames; + + if (prettyNames.empty()) { +# define ADD(pretty, real) prettyNames[pretty] = real +# include +# undef ADD + } + + std::map::iterator real = prettyNames.find(name); + if (real != prettyNames.end()) { + return real->second; + } + return name; +} + + +namespace agi { + namespace charset { + +#ifdef ICONV_POSIX +class IconvWrapper::Converter { +public: + Converter(bool, const char*) { } + size_t operator()(iconv_t cd, char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); + } +}; +#else +class IconvWrapper::Converter : public iconv_fallbacks { +private: + bool subst; + char invalidRep[4]; + size_t invalidRepSize; + static void fallback( + unsigned int code, + void (*callback) (const char *buf, size_t buflen, void* callback_arg), + void *callback_arg, + void *convPtr) + { + // At some point in the future, this should probably switch to a real mapping + // For now, there's just three cases: BOM to nothing, '\' to itself + // (for Shift-JIS, which does not have \) and everything else to '?' + if (code == 0xFEFF) return; + if (code == 0x5C) callback("\\", 1, callback_arg); + else { + Converter *self = static_cast(convPtr); + callback(self->invalidRep, self->invalidRepSize, callback_arg); + } + } +public: + Converter(bool subst, const char* targetEnc) + : subst(subst) + { + data = this; + mb_to_uc_fallback = NULL; + mb_to_wc_fallback = NULL; + uc_to_mb_fallback = fallback; + wc_to_mb_fallback = NULL; + + char sbuff[] = "?"; + char* src = sbuff; + char* dst = invalidRep; + size_t dstLen = 4; + size_t srcLen = 1; + + iconv_t cd = iconv_open(GetRealEncodingName(targetEnc), "UTF-8"); + assert(cd != iconv_invalid); + size_t res = iconv(cd, &src, &srcLen, &dst, &dstLen); + assert(res != iconv_failed); + assert(srcLen == 0); + iconv_close(cd); + + invalidRepSize = 4 - dstLen; + } + size_t operator()(iconv_t cd, char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + size_t res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); + + if (!subst) return res; + + // Save original errno so we can return it rather than the result from iconvctl + int err = errno; + + // Some characters in the input string do not exist in the output encoding + if (res == iconv_failed && err == EILSEQ) { + // first try transliteration only + int transliterate = 1; + iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); + res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); + err = errno; + transliterate = 0; + iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); + } + if (res == iconv_failed && err == EILSEQ) { + // Conversion still failed with transliteration enabled, so try our substitution + iconvctl(cd, ICONV_SET_FALLBACKS, this); + res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); + err = errno; + iconvctl(cd, ICONV_SET_FALLBACKS, NULL); + } + if (res == iconv_failed && err == E2BIG && *outbytesleft == 0) { + // Check for E2BIG false positives + char buff[4]; + size_t buffsize = 4; + char* out = buff; + char* in = *inbuf; + size_t insize = *inbytesleft; + + iconvctl(cd, ICONV_SET_FALLBACKS, this); + res = iconv(cd, &in, &insize, &out, &buffsize); + // If no bytes of the output buffer were used, the original + // conversion may have been successful + if (buffsize == 4) { + err = errno; + } + else { + res = iconv_failed; + } + iconvctl(cd, ICONV_SET_FALLBACKS, NULL); + } + + errno = err; + return res; + } +}; +#endif + +// Calculate the size of NUL in the given character set +static size_t NulSize(const char* encoding) { + // We need a character set to convert from with a known encoding of NUL + // UTF-8 seems like the obvious choice + iconv_t cd = iconv_open(GetRealEncodingName(encoding), "UTF-8"); + assert(cd != iconv_invalid); + + char dbuff[4]; + char sbuff[] = ""; + char* dst = dbuff; + char* src = sbuff; + size_t dstLen = sizeof(dbuff); + size_t srcLen = 1; + + size_t ret = iconv(cd, &src, &srcLen, &dst, &dstLen); + assert(ret != iconv_failed); + assert(dst - dbuff > 0); + iconv_close(cd); + + return dst - dbuff; +} + +IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst) +: toNulLen(0) +, fromNulLen(0) +, conv(NULL) +{ + cd = iconv_open(GetRealEncodingName(destEncoding), GetRealEncodingName(sourceEncoding)); + if (cd == iconv_invalid) { + throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding); + } + + // These need to be set only after we verify that the source and des + // charsets are valid + toNulLen = NulSize(destEncoding); + fromNulLen = NulSize(sourceEncoding); + conv.reset(new Converter(enableSubst, destEncoding)); +} +IconvWrapper::~IconvWrapper() { + if (cd != iconv_invalid) iconv_close(cd); +} + +std::string IconvWrapper::Convert(std::string const& source) { + std::string dest; + Convert(source, dest); + return dest; +} +void IconvWrapper::Convert(std::string const& source, std::string &dest) { + /// @todo Investigate if it's worth using ropes to avoid having to convert + /// everything twice. It probably isn't. + size_t len = RequiredBufferSize(source); + dest.resize(len); + + // This is technically invalid as C++03 does not require that strings use + // a single contiguous block of memory. However, no implementation has ever + // not done so and C++0x does require that it be contiguous + Convert(source.data(), source.size(), &dest[0], len); +} + +size_t IconvWrapper::Convert(const char* source, size_t sourceSize, char *dest, size_t destSize) { + if (sourceSize == (size_t)-1) { + sourceSize = SrcStrLen(source); + } + // POSIX requires that inbuf be const char **, but libiconv uses char** + size_t res = (*conv)(cd, const_cast(&source), &sourceSize, &dest, &destSize); + + if (res == iconv_failed) { + switch (errno) { + case E2BIG: + throw BufferTooSmall( + "Destination buffer was not large enough to fit converted " + "string."); + case EINVAL: + throw BadInput( + "One or more characters in the input string were not valid " + "characters in the given input encoding"); + case EILSEQ: + throw BadOutput( + "One or more characters could not be converted to the " + "selected target encoding and the version of iconv " + "Aegisub was built with does not have useful fallbacks. " + "For best results, please build Aegisub using a recent " + "version of GNU iconv."); + default: + throw ConversionFailure("An unknown conversion failure occured"); + } + } + return res; +} + +size_t IconvWrapper::Convert(const char** source, size_t* sourceSize, char** dest, size_t* destSize) { + return (*conv)(cd, const_cast(source), sourceSize, dest, destSize); +} + +size_t IconvWrapper::RequiredBufferSize(std::string const& str) { + return RequiredBufferSize(str.data(), str.size()); +} + +size_t IconvWrapper::RequiredBufferSize(const char* src, size_t srcLen) { + char buff[512]; + size_t charsWritten = 0; + size_t res; + + do { + char* dst = buff; + size_t dstSize = sizeof(buff); + res = (*conv)(cd, const_cast(&src), &srcLen, &dst, &dstSize); + + charsWritten += dst - buff; + } while (res == iconv_failed && errno == E2BIG); + + if (res == iconv_failed) { + switch (errno) { + case EINVAL: + throw BadInput( + "One or more characters in the input string were not valid " + "characters in the given input encoding"); + case EILSEQ: + throw BadOutput( + "One or more characters could not be converted to the " + "selected target encoding and the version of iconv " + "Aegisub was built with does not have useful fallbacks. " + "For best results, please build Aegisub using a recent " + "version of GNU iconv."); + default: + throw ConversionFailure("An unknown conversion failure occured"); + } + } + return charsWritten; +} + +static size_t mbstrlen(const char* str, size_t nulLen) { + const char *ptr; + switch (nulLen) { + case 1: + return strlen(str); + case 2: + for (ptr = str; *reinterpret_cast(ptr) != 0; ptr += 2) ; + return ptr - str; + case 4: + for (ptr = str; *reinterpret_cast(ptr) != 0; ptr += 4) ; + return ptr - str; + default: + return (size_t)-1; + } +} + +size_t IconvWrapper::SrcStrLen(const char* str) { + return mbstrlen(str, fromNulLen); + +} +size_t IconvWrapper::DstStrLen(const char* str) { + return mbstrlen(str, toNulLen); +} + } +} diff --git a/aegisub/libaegisub/include/libaegisub/charset_conv.h b/aegisub/libaegisub/include/libaegisub/charset_conv.h new file mode 100644 index 000000000..8786e0451 --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h @@ -0,0 +1,107 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset_conv.h +/// @brief Wrapper for libiconv to present a more C++-friendly API +/// @ingroup libaegisub + +#ifndef LAGI_PRE +#include +#include +#include +#include +#endif + +#include + +namespace agi { + namespace charset { + +DEFINE_BASE_EXCEPTION_NOINNER(ConvError, Exception) +DEFINE_SIMPLE_EXCEPTION_NOINNER(UnsupportedConversion, ConvError, "iconv/unsupported") +DEFINE_SIMPLE_EXCEPTION_NOINNER(ConversionFailure, ConvError, "iconv/failed") +DEFINE_SIMPLE_EXCEPTION_NOINNER(BufferTooSmall, ConversionFailure, "iconv/failed/E2BIG") +DEFINE_SIMPLE_EXCEPTION_NOINNER(BadInput, ConversionFailure, "iconv/failed/EILSEQ") +DEFINE_SIMPLE_EXCEPTION_NOINNER(BadOutput, ConversionFailure, "iconv/failed/EINVAL") + +/// @brief Get a list of support encodings with user-friendly names +template +T const& GetEncodingsList() { + static T nameList; + if (nameList.empty()) { +# define ADD(pretty, real) nameList.push_back(pretty) +# include +# undef ADD + } + return nameList; +} + +typedef void* iconv_t; + +/// @brief A C++ wrapper for iconv +class IconvWrapper { +private: + // Helper class that abstracts away the differences betwen libiconv and + // POSIX iconv implementations + class Converter; + + iconv_t cd; + size_t toNulLen; + size_t fromNulLen; + std::auto_ptr conv; + +public: + /// @brief Create a converter + /// @param sourceEncoding Source encoding name, may be a pretty name + /// @param destEncoding Destination encoding name, may be a pretty name + /// @param enableSubst If true, when possible characters will be + /// mutilated or dropped rather than a letting a + /// conversion fail + IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst = true); + ~IconvWrapper(); + + /// @brief Convert a string from the source to destination charset + /// @param source String to convert + /// @return Converted string. Note that std::string always uses a single byte + /// terminator, so c_str() may not return a valid string if the dest + /// charset has wider terminators + std::string Convert(std::string const& source); + /// @brief Convert a string from the source to destination charset + /// @param source String to convert + /// @param[out] dest String to place the result in + void Convert(std::string const& source, std::string &dest); + size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize); + /// Bare wrapper around iconv; see iconv documention for details + size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); + + /// @brief Get the required buffer size required to fit the source string in the target charset + /// @param source A string in the source charset + /// @param sourceSize Length of the source in bytes + /// @return Bytes required, including NUL terminator if applicable + size_t RequiredBufferSize(const char* source, size_t sourceSize); + /// @brief Get the required buffer size required to fit the source string in the target charset + /// @param str A string in the source charset + /// @return Bytes required, not including space needed for NUL terminator + size_t RequiredBufferSize(std::string const& str); + + /// Encoding-aware strlen for strings encoding in the source charset + size_t SrcStrLen(const char* str); + /// Encoding-aware strlen for strings encoding in the destination charset + size_t DstStrLen(const char* str); +}; + + } +} diff --git a/aegisub/libaegisub/include/libaegisub/charset_conv_win.h b/aegisub/libaegisub/include/libaegisub/charset_conv_win.h new file mode 100644 index 000000000..00dda5322 --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/charset_conv_win.h @@ -0,0 +1,29 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset_conv_win.h +/// @brief Windows-specific charset conversion stuff +/// @ingroup libaegisub windows + +#include + +namespace agi { + namespace charset { + /// Convert a UTF-8 string to a string suitable for use with Win32 API functions + std::wstring ConvertW(std::string const& src); + std::string ConvertW(std::wstring const& src); + } +} diff --git a/aegisub/libaegisub/include/libaegisub/charsets.def b/aegisub/libaegisub/include/libaegisub/charsets.def new file mode 100644 index 000000000..02e88eb3b --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/charsets.def @@ -0,0 +1,116 @@ +ADD("Local", ""); + +ADD("Unicode (UTF-8)", "utf-8"); +ADD("Unicode (UTF-16)", "utf-16"); +ADD("Unicode (UTF-16BE)", "utf-16be"); +ADD("Unicode (UTF-16LE)", "utf-16le"); +ADD("Unicode (UTF-32)", "utf-32"); +ADD("Unicode (UTF-32BE)", "utf-32be"); +ADD("Unicode (UTF-32LE)", "utf-32le"); +ADD("Unicode (UTF-7)", "utf-7"); + +ADD("Arabic (IBM-864)", "ibm864"); +ADD("Arabic (IBM-864-I)", "ibm864i"); +ADD("Arabic (ISO-8859-6)", "iso-8859-6"); +ADD("Arabic (ISO-8859-6-E)", "iso-8859-6-e"); +ADD("Arabic (ISO-8859-6-I)", "iso-8859-6-i"); +ADD("Arabic (Langbox ISO-8859-6.16)", "x-iso-8859-6-16"); +ADD("Arabic (Langbox ISO-8859-6.8x)", "x-iso-8859-6-8-x"); +ADD("Arabic (MacArabic)", "x-mac-arabic"); +ADD("Arabic (Windows-1256)", "windows-1256"); + +ADD("Armenian (ARMSCII-8)", "armscii-8"); + +ADD("Baltic (ISO-8859-13)", "iso-8859-13"); +ADD("Baltic (ISO-8859-4)", "iso-8859-4"); +ADD("Baltic (Windows-1257)", "windows-1257"); + +ADD("Celtic (ISO-8859-14)", "iso-8859-14"); + +ADD("Central European (IBM-852)", "ibm852"); +ADD("Central European (ISO-8859-2)", "iso-8859-2"); +ADD("Central European (MacCE)", "x-mac-ce"); +ADD("Central European (Windows-1250)", "windows-1250"); + +ADD("Chinese Simplified (GB18030)", "gb18030"); +ADD("Chinese Simplified (GB2312)", "gb2312"); +ADD("Chinese Simplified (GBK)", "x-gbk"); +ADD("Chinese Simplified (HZ)", "hz-gb-2312"); +ADD("Chinese Simplified (ISO-2022-CN)", "iso-2022-cn"); +ADD("Chinese Traditional (Big5)", "big5"); +ADD("Chinese Traditional (Big5-HKSCS)", "big5-hkscs"); +ADD("Chinese Traditional (EUC-TW)", "x-euc-tw"); + +ADD("Croatian (MacCroatian)", "x-mac-croatian"); + +ADD("Cyrillic (IBM-855)", "ibm855"); +ADD("Cyrillic (ISO-8859-5)", "iso-8859-5"); +ADD("Cyrillic (ISO-IR-111)", "iso-ir-111"); +ADD("Cyrillic (KOI8-R)", "koi8-r"); +ADD("Cyrillic (MacCyrillic)", "x-mac-cyrillic"); +ADD("Cyrillic (Windows-1251)", "windows-1251"); +ADD("Cyrillic/Russian (CP-866)", "ibm866"); +ADD("Cyrillic/Ukrainian (KOI8-U)", "koi8-u"); +ADD("Cyrillic/Ukrainian (MacUkrainian)", "x-mac-ukrainian"); + +ADD("English (US-ASCII)", "us-ascii"); + +ADD("Farsi (MacFarsi)", "x-mac-farsi"); + +ADD("Georgian (GEOSTD8)", "geostd8"); + +ADD("Greek (ISO-8859-7)", "iso-8859-7"); +ADD("Greek (MacGreek)", "x-mac-greek"); +ADD("Greek (Windows-1253)", "windows-1253"); + +ADD("Gujarati (MacGujarati)", "x-mac-gujarati"); +ADD("Gurmukhi (MacGurmukhi)", "x-mac-gurmukhi"); + +ADD("Hebrew (IBM-862)", "ibm862"); +ADD("Hebrew (ISO-8859-8-E)", "iso-8859-8-e"); +ADD("Hebrew (ISO-8859-8-I)", "iso-8859-8-i"); +ADD("Hebrew (MacHebrew)", "x-mac-hebrew"); +ADD("Hebrew (Windows-1255)", "windows-1255"); +ADD("Hebrew Visual (ISO-8859-8)", "iso-8859-8"); + +ADD("Hindi (MacDevanagari)", "x-mac-devanagari"); +ADD("Hindi (SunDevanagari)", "x-sun-unicode-india-0"); + +ADD("Icelandic (MacIcelandic)", "x-mac-icelandic"); + +ADD("Japanese (EUC-JP)", "euc-jp"); +ADD("Japanese (ISO-2022-JP)", "iso-2022-jp"); +ADD("Japanese (Shift_JIS)", "shift_jis"); + +ADD("Korean (EUC-KR)", "euc-kr"); +ADD("Korean (ISO-2022-KR)", "iso-2022-kr"); +ADD("Korean (JOHAB)", "x-johab"); +ADD("Korean (UHC)", "x-windows-949"); + +ADD("Nordic (ISO-8859-10)", "iso-8859-10"); + +ADD("Romanian (ISO-8859-16)", "iso-8859-16"); +ADD("Romanian (MacRomanian)", "x-mac-romanian"); + +ADD("South European (ISO-8859-3)", "iso-8859-3"); + +ADD("Thai (IBM-874)", "ibm874"); +ADD("Thai (ISO-8859-11)", "iso-8859-11"); +ADD("Thai (TIS-620)", "tis-620"); +ADD("Thai (Windows-874)", "windows-874"); + +ADD("Turkish (IBM-857)", "ibm857"); +ADD("Turkish (ISO-8859-9)", "iso-8859-9"); +ADD("Turkish (MacTurkish)", "x-mac-turkish"); +ADD("Turkish (Windows-1254)", "windows-1254"); + +ADD("Vietnamese (TCVN)", "x-viet-tcvn5712"); +ADD("Vietnamese (VISCII)", "viscii"); +ADD("Vietnamese (VPS)", "x-viet-vps"); +ADD("Vietnamese (Windows-1258)", "windows-1258"); + +ADD("Western (IBM-850)", "ibm850"); +ADD("Western (ISO-8859-1)", "iso-8859-1"); +ADD("Western (ISO-8859-15)", "iso-8859-15"); +ADD("Western (MacRoman)", "x-mac-roman"); +ADD("Western (Windows-1252)", "windows-1252"); diff --git a/aegisub/libaegisub/include/libaegisub/io.h b/aegisub/libaegisub/include/libaegisub/io.h index 672adb764..f23548e10 100644 --- a/aegisub/libaegisub/include/libaegisub/io.h +++ b/aegisub/libaegisub/include/libaegisub/io.h @@ -45,9 +45,9 @@ class Save { const std::string file_name; public: - Save(const std::string& file); - ~Save(); - std::ofstream& Get(); + Save(const std::string& file); + ~Save(); + std::ofstream& Get(); }; diff --git a/aegisub/libaegisub/lagi_pre.h b/aegisub/libaegisub/lagi_pre.h index 893e20fca..482d986de 100644 --- a/aegisub/libaegisub/lagi_pre.h +++ b/aegisub/libaegisub/lagi_pre.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/aegisub/libaegisub/windows/access.cpp b/aegisub/libaegisub/windows/access.cpp index 0c9efec32..70db2a476 100644 --- a/aegisub/libaegisub/windows/access.cpp +++ b/aegisub/libaegisub/windows/access.cpp @@ -25,8 +25,9 @@ #include #endif -#include "libaegisub/util.h" -#include "libaegisub/util_win.h" +#include +#include +#include namespace agi { namespace acs { @@ -57,8 +58,7 @@ is a short (and incomplete) todo requires detecting the filesystem being used. */ void Check(const std::string &file, acs::Type type) { - std::wstring wfile; - wfile.assign(file.begin(), file.end()); + std::wstring wfile = agi::charset::ConvertW(file); SECURITY_DESCRIPTOR* sd; DWORD len = 0; diff --git a/aegisub/libaegisub/windows/charset_conv_win.cpp b/aegisub/libaegisub/windows/charset_conv_win.cpp new file mode 100644 index 000000000..e71c54ba7 --- /dev/null +++ b/aegisub/libaegisub/windows/charset_conv_win.cpp @@ -0,0 +1,49 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file charset_conv_win.h +/// @brief Windows-specific charset conversion stuff +/// @ingroup libaegisub windows + +#include + +namespace agi { + namespace charset { + +std::wstring ConvertW(std::string const& source) { + static IconvWrapper w32Conv("utf-8", "utf-16le", false); + + std::wstring dest; + size_t len = w32Conv.RequiredBufferSize(source); + dest.resize(len / sizeof(wchar_t)); + w32Conv.Convert(source.data(), source.size(), reinterpret_cast(&dest[0]), len); + return dest; +} + +std::string ConvertW(std::wstring const& source) { + static IconvWrapper w32Conv("utf-16le", "utf-8", false); + + std::string dest; + size_t srcLen = source.size() * sizeof(wchar_t); + const char* src = reinterpret_cast(source.c_str()); + size_t len = w32Conv.RequiredBufferSize(src, srcLen); + dest.resize(len); + w32Conv.Convert(src, srcLen, &dest[0], len); + return dest; +} + + } +} diff --git a/aegisub/libaegisub/windows/io.cpp b/aegisub/libaegisub/windows/io.cpp index 4627134a7..12b692df3 100644 --- a/aegisub/libaegisub/windows/io.cpp +++ b/aegisub/libaegisub/windows/io.cpp @@ -26,6 +26,7 @@ #include #endif +#include #include "libaegisub/io.h" #include "libaegisub/log.h" #include "libaegisub/util.h" @@ -34,11 +35,13 @@ namespace agi { namespace io { +using agi::charset::ConvertW; + std::ifstream* Open(const std::string &file) { LOG_D("agi/io/open/file") << file; acs::CheckFileRead(file); - std::ifstream *stream = new std::ifstream(file.c_str()); + std::ifstream *stream = new std::ifstream(ConvertW(file).c_str()); if (stream->fail()) { delete stream; @@ -53,7 +56,7 @@ Save::Save(const std::string& file): file_name(file) { LOG_D("agi/io/save/file") << file; const std::string pwd = util::DirName(file); - acs::CheckDirWrite(pwd.c_str()); + acs::CheckDirWrite(pwd); try { acs::CheckFileWrite(file); @@ -61,23 +64,19 @@ Save::Save(const std::string& file): file_name(file) { // If the file doesn't exist we create a 0 byte file, this so so // util::Rename will find it, and to let users know something went // wrong by leaving a 0 byte file. - std::ofstream fp_touch(file.c_str()); + std::ofstream fp_touch(ConvertW(file).c_str()); } /// @todo This is a temp hack, proper implementation needs to come after /// Windows support is added. The code in the destructor needs fixing /// as well. - const std::string tmp = file + "_tmp"; - // This will open to file.XXXX. (tempfile) - fp = new std::ofstream(tmp.c_str()); + fp = new std::ofstream(ConvertW(file + "_tmp").c_str()); } Save::~Save() { - - const std::string tmp(file_name + "_tmp"); delete fp; - util::Rename(tmp, file_name); + util::Rename(file_name + "_tmp", file_name); } std::ofstream& Save::Get() { diff --git a/aegisub/libaegisub/windows/util.cpp b/aegisub/libaegisub/windows/util.cpp index 1e422d040..ab36c4727 100644 --- a/aegisub/libaegisub/windows/util.cpp +++ b/aegisub/libaegisub/windows/util.cpp @@ -30,23 +30,22 @@ #endif -//#include #include "libaegisub/types.h" +#include #include "libaegisub/util.h" #include "libaegisub/util_win.h" namespace agi { namespace util { +using agi::charset::ConvertW; const std::string DirName(const std::string& path) { if (path.find('/') == std::string::npos) { - const std::string cwd("."); - return cwd; + return "."; } - const std::string stripped = path.substr(0, path.rfind("/")+1); - return stripped; + return path.substr(0, path.rfind("/")+1); } void Rename(const std::string& from, const std::string& to) { @@ -58,19 +57,18 @@ void Rename(const std::string& from, const std::string& to) { acs::CheckDirWrite(DirName(to)); } - MoveFileExA(from.c_str(), to.c_str(), MOVEFILE_REPLACE_EXISTING); + MoveFileEx(ConvertW(from).c_str(), ConvertW(to).c_str(), MOVEFILE_REPLACE_EXISTING); } std::string ErrorString(DWORD error) { - LPSTR lpstr = NULL; + LPWSTR lpstr = NULL; - if(FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, error, 0, (LPSTR)&lpstr, 0, NULL) == 0) { + if(FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, error, 0, reinterpret_cast(&lpstr), 0, NULL) == 0) { /// @todo Return the actual 'unknown error' string from windows. - std::string str("Unknown Error"); - return str; + return "Unknown Error"; } - std::string str(lpstr); + std::string str = ConvertW(lpstr); LocalFree(lpstr); return str; } diff --git a/aegisub/src/charset_conv.cpp b/aegisub/src/charset_conv.cpp index c98170126..839eb5e20 100644 --- a/aegisub/src/charset_conv.cpp +++ b/aegisub/src/charset_conv.cpp @@ -42,107 +42,21 @@ #include #include -#include #include #endif -WX_DECLARE_STRING_HASH_MAP(wxString, PrettyNamesHash); +class AegisubCSConvImpl : public AegisubCSConv { +public: + AegisubCSConvImpl() { } +}; -#if wxUSE_THREADS -static wxMutex encodingListMutex; -#endif - -static const iconv_t iconv_invalid = (iconv_t)-1; -static const size_t iconv_failed = (size_t)-1; -#define ICONV_CONST_CAST(a) const_cast(a) - -static wxArrayString *supportedEncodings = NULL; -static wxArrayString *prettyEncodingList = NULL; -static PrettyNamesHash *prettyEncodingHash = NULL; - -AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst) -: wcCharsetName(WCHAR_T_ENCODING) -, mbCharsetName(GetRealEncodingName(mbEncName)) -, mbNulLen(0) -, enableSubst(enableSubst) -, m2w(wcCharsetName, mbCharsetName) -, w2m(mbCharsetName, wcCharsetName) +AegisubCSConv::AegisubCSConv() +: conv("wchar_t", "") { - if (m2w == iconv_invalid || w2m == iconv_invalid) { - throw wxString::Format(L"Character set %s is not supported.", mbEncName); - } - - if (enableSubst) { - invalidRepSize = FromWChar(invalidRep, sizeof(invalidRep), L"?") - GetMBNulLen(); - -#ifndef ICONV_POSIX - fallbacks.data = this; - fallbacks.mb_to_uc_fallback = NULL; - fallbacks.mb_to_wc_fallback = NULL; - fallbacks.uc_to_mb_fallback = ucToMbFallback; - fallbacks.wc_to_mb_fallback = NULL; -#endif - } } -wxMBConv * AegisubCSConv::Clone() const { - AegisubCSConv *c = new AegisubCSConv(mbCharsetName); - c->mbNulLen = mbNulLen; - return c; -} - -/// @brief Calculate the size of NUL in the target encoding via iconv -/// @return The size in bytes of NUL -size_t AegisubCSConv::GetMBNulLen() const { - if (mbNulLen == 0) { - const wchar_t nulStr[] = L""; - char outBuff[8]; - size_t inLen = sizeof(wchar_t); - size_t outLen = sizeof(outBuff); - char * inPtr = (char *)nulStr; - char * outPtr = outBuff; - - size_t res = iconv(w2m, &inPtr, &inLen, &outPtr, &outLen); - - if (res != 0) - mbNulLen = (size_t)-1; - else - mbNulLen = sizeof(outBuff) - outLen; - } - return mbNulLen; -} - -size_t AegisubCSConv::MBBuffLen(const char * str) const { - size_t nulLen = GetMBNulLen(); - const char *ptr; - switch (nulLen) { - case 1: - return strlen(str); - case 2: - for (ptr = str; *reinterpret_cast(ptr) != 0; ptr += 2) ; - return ptr - str; - case 4: - for (ptr = str; *reinterpret_cast(ptr) != 0; ptr += 4) ; - return ptr - str; - default: - return (size_t)-1; - } -} - -/// @brief Convert a string from multibyte to wide characters -/// @param dst Destination buffer. -/// @param dstSize Length of destination buffer in wchar_ts -/// @param src Source multibyte string -/// @param srcLen Length of source buffer in bytes, or -1 to autodetect -/// @return The number of wchar_ts needed to store the string in the target charset size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, size_t srcLen) const { - return doConversion( - m2w, - reinterpret_cast(dst), - dstSize * sizeof(wchar_t), - const_cast(src), - srcLen == wxNO_LEN ? MBBuffLen(src) + GetMBNulLen() : srcLen - ) / sizeof(wchar_t); + throw agi::charset::UnsupportedConversion("Cannot convert to local with csConvLocal"); } /// @brief Convert a string from wide characters to multibyte @@ -152,309 +66,19 @@ size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, siz /// @param srcLen Length in wchar_ts of source, or -1 to autodetect /// @return The number of bytes needed to store the string in the target charset size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcLen) const { - return doConversion( - w2m, - dst, - dstSize, - reinterpret_cast(const_cast(src)), - (srcLen == wxNO_LEN ? wcslen(src) + 1 : srcLen) * sizeof(wchar_t) - ); -} - -// Perform a conversion if a buffer is given or calculate the needed buffer size if not -size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const { - if (dstSize > 0) { - return iconvWrapper(cd, &src, &srcSize, &dst, &dstSize); - } - - // No destination given, so calculate the needed buffer size instead - char buff[32]; - size_t buffSize = 32; - size_t charsWritten = 0; - size_t res; - - do { - dst = buff; - dstSize = buffSize; - res = iconvWrapper(cd, &src, &srcSize, &dst, &dstSize); - - charsWritten += dst - buff; - } while (res == iconv_failed && errno == E2BIG); - - if (res == iconv_failed) return wxCONV_FAILED; - return charsWritten; -} - -// Actually perform a conversion via iconv -size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft, - char **outbuf, size_t *outbytesleft) const { - -#if wxUSE_THREADS - wxMutexLocker lock(iconvMutex); -#endif - - char *outbuforig = *outbuf; - size_t res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); - - if (res != iconv_failed) - return *outbuf - outbuforig; - if (!enableSubst) - return iconv_failed; - -#ifdef ICONV_POSIX - if (errno == EILSEQ) { - throw - L"One or more characters do not fit in the selected " - L"encoding and the version of iconv Aegisub was built with" - L" does not have useful fallbacks. For best results, " - L"please rebuild Aegisub using a recent version of GNU iconv."; - } - return wxCONV_FAILED; -#else - // Save original errno so we can return it rather than the result from iconvctl - int err = errno; - - // Some characters in the input string do not exist in the output encoding - if (res == iconv_failed && err == EILSEQ) { - // first try transliteration only - int transliterate = 1; - iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); - res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); - err = errno; - transliterate = 0; - iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate); - } - if (res == iconv_failed && err == EILSEQ) { - // Conversion still failed with transliteration enabled, so try our substitution - iconvctl(cd, ICONV_SET_FALLBACKS, &fallbacks); - res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); - err = errno; - iconvctl(cd, ICONV_SET_FALLBACKS, NULL); - } - if (res == iconv_failed && err == EILSEQ) { - // Conversion still failed, so just drop any invalid characters - int discard = 1; - iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &discard); - res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); - err = errno; - discard = 0; - iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &discard); - } - - errno = err; - if (res == iconv_failed) return wxCONV_FAILED; - return *outbuf - outbuforig; -#endif -} - - -/// @brief GNU iconv character substitution callback -/// @param code Unicode character which could not be converted -/// @param callback Callback to tell iconv what string to use instead -/// @param callback_arg Iconv userdata for callback -/// @param convPtr AegisubCSConv instance to use -void AegisubCSConv::ucToMbFallback( - unsigned int code, - void (*callback) (const char *buf, size_t buflen, void* callback_arg), - void *callback_arg, - void *convPtr) -{ - // At some point in the future, this should probably switch to a real mapping - // For now, there's just three cases: BOM to nothing, '\' to itself - // (for Shift-JIS, which does not have \) and everything else to '?' - if (code == 0xFEFF) return; - if (code == 0x5C) callback("\\", 1, callback_arg); - else { - AegisubCSConv *self = static_cast(convPtr); - callback(self->invalidRep, self->invalidRepSize, callback_arg); - } -} - -#ifndef ICONV_POSIX -/// @brief Callback for iconvlist -/// @param namescount Number of names in names -/// @param names Names to add to the list -/// @param data Unused userdata field -int addEncoding(unsigned int namescount, const char * const * names, void* data) { - for (unsigned int i = 0; i < namescount; i++) { - supportedEncodings->Add(wxString::FromAscii(names[i])); - } - return 0; -} -#endif - -wxArrayString AegisubCSConv::GetAllSupportedEncodings() { -#if wxUSE_THREADS - wxMutexLocker lock(encodingListMutex); -#endif - if (supportedEncodings == NULL) { - supportedEncodings = new wxArrayString(); -#ifndef ICONV_POSIX - iconvlist(addEncoding, NULL); - supportedEncodings->Sort(); -#endif - } - return *supportedEncodings; -} - -wxString AegisubCSConv::GetRealEncodingName(wxString name) { - if (name.Lower() == L"local") return wxLocale::GetSystemEncodingName(); - if (prettyEncodingList == NULL) return name; - - PrettyNamesHash::iterator realName = prettyEncodingHash->find(name); - if (realName != prettyEncodingHash->end()) { - return realName->second; - } - return name; -} - -wxArrayString AegisubCSConv::GetEncodingsList() { -#if wxUSE_THREADS - wxMutexLocker lock(encodingListMutex); -#endif - if (prettyEncodingList == NULL) { - struct { const char *pretty, *real; } encodingNames[] = { - {"Unicode (UTF-8)", "utf-8"}, - {"Unicode (UTF-16)", "utf-16"}, - {"Unicode (UTF-16BE)", "utf-16be"}, - {"Unicode (UTF-16LE)", "utf-16le"}, - {"Unicode (UTF-32)", "utf-32"}, - {"Unicode (UTF-32BE)", "utf-32be"}, - {"Unicode (UTF-32LE)", "utf-32le"}, - {"Unicode (UTF-7)", "utf-7"}, - - {"Arabic (IBM-864)", "ibm864"}, - {"Arabic (IBM-864-I)", "ibm864i"}, - {"Arabic (ISO-8859-6)", "iso-8859-6"}, - {"Arabic (ISO-8859-6-E)", "iso-8859-6-e"}, - {"Arabic (ISO-8859-6-I)", "iso-8859-6-i"}, - {"Arabic (Langbox ISO-8859-6.16)", "x-iso-8859-6-16"}, - {"Arabic (Langbox ISO-8859-6.8x)", "x-iso-8859-6-8-x"}, - {"Arabic (MacArabic)", "x-mac-arabic"}, - {"Arabic (Windows-1256)", "windows-1256"}, - - {"Armenian (ARMSCII-8)", "armscii-8"}, - - {"Baltic (ISO-8859-13)", "iso-8859-13"}, - {"Baltic (ISO-8859-4)", "iso-8859-4"}, - {"Baltic (Windows-1257)", "windows-1257"}, - - {"Celtic (ISO-8859-14)", "iso-8859-14"}, - - {"Central European (IBM-852)", "ibm852"}, - {"Central European (ISO-8859-2)", "iso-8859-2"}, - {"Central European (MacCE)", "x-mac-ce"}, - {"Central European (Windows-1250)", "windows-1250"}, - - {"Chinese Simplified (GB18030)", "gb18030"}, - {"Chinese Simplified (GB2312)", "gb2312"}, - {"Chinese Simplified (GBK)", "x-gbk"}, - {"Chinese Simplified (HZ)", "hz-gb-2312"}, - {"Chinese Simplified (ISO-2022-CN)", "iso-2022-cn"}, - {"Chinese Traditional (Big5)", "big5"}, - {"Chinese Traditional (Big5-HKSCS)", "big5-hkscs"}, - {"Chinese Traditional (EUC-TW)", "x-euc-tw"}, - - {"Croatian (MacCroatian)", "x-mac-croatian"}, - - {"Cyrillic (IBM-855)", "ibm855"}, - {"Cyrillic (ISO-8859-5)", "iso-8859-5"}, - {"Cyrillic (ISO-IR-111)", "iso-ir-111"}, - {"Cyrillic (KOI8-R)", "koi8-r"}, - {"Cyrillic (MacCyrillic)", "x-mac-cyrillic"}, - {"Cyrillic (Windows-1251)", "windows-1251"}, - {"Cyrillic/Russian (CP-866)", "ibm866"}, - {"Cyrillic/Ukrainian (KOI8-U)", "koi8-u"}, - {"Cyrillic/Ukrainian (MacUkrainian)", "x-mac-ukrainian"}, - - {"English (US-ASCII)", "us-ascii"}, - - {"Farsi (MacFarsi)", "x-mac-farsi"}, - - {"Georgian (GEOSTD8)", "geostd8"}, - - {"Greek (ISO-8859-7)", "iso-8859-7"}, - {"Greek (MacGreek)", "x-mac-greek"}, - {"Greek (Windows-1253)", "windows-1253"}, - - {"Gujarati (MacGujarati)", "x-mac-gujarati"}, - {"Gurmukhi (MacGurmukhi)", "x-mac-gurmukhi"}, - - {"Hebrew (IBM-862)", "ibm862"}, - {"Hebrew (ISO-8859-8-E)", "iso-8859-8-e"}, - {"Hebrew (ISO-8859-8-I)", "iso-8859-8-i"}, - {"Hebrew (MacHebrew)", "x-mac-hebrew"}, - {"Hebrew (Windows-1255)", "windows-1255"}, - {"Hebrew Visual (ISO-8859-8)", "iso-8859-8"}, - - {"Hindi (MacDevanagari)", "x-mac-devanagari"}, - {"Hindi (SunDevanagari)", "x-sun-unicode-india-0"}, - - {"Icelandic (MacIcelandic)", "x-mac-icelandic"}, - - {"Japanese (EUC-JP)", "euc-jp"}, - {"Japanese (ISO-2022-JP)", "iso-2022-jp"}, - {"Japanese (Shift_JIS)", "shift_jis"}, - - {"Korean (EUC-KR)", "euc-kr"}, - {"Korean (ISO-2022-KR)", "iso-2022-kr"}, - {"Korean (JOHAB)", "x-johab"}, - {"Korean (UHC)", "x-windows-949"}, - - {"Nordic (ISO-8859-10)", "iso-8859-10"}, - - {"Romanian (ISO-8859-16)", "iso-8859-16"}, - {"Romanian (MacRomanian)", "x-mac-romanian"}, - - {"South European (ISO-8859-3)", "iso-8859-3"}, - - {"Thai (IBM-874)", "ibm874"}, - {"Thai (ISO-8859-11)", "iso-8859-11"}, - {"Thai (TIS-620)", "tis-620"}, - {"Thai (Windows-874)", "windows-874"}, - - {"Turkish (IBM-857)", "ibm857"}, - {"Turkish (ISO-8859-9)", "iso-8859-9"}, - {"Turkish (MacTurkish)", "x-mac-turkish"}, - {"Turkish (Windows-1254)", "windows-1254"}, - - {"Vietnamese (TCVN)", "x-viet-tcvn5712"}, - {"Vietnamese (VISCII)", "viscii"}, - {"Vietnamese (VPS)", "x-viet-vps"}, - {"Vietnamese (Windows-1258)", "windows-1258"}, - - {"Western (IBM-850)", "ibm850"}, - {"Western (ISO-8859-1)", "iso-8859-1"}, - {"Western (ISO-8859-15)", "iso-8859-15"}, - {"Western (MacRoman)", "x-mac-roman"}, - {"Western (Windows-1252)", "windows-1252"}, - - {NULL, NULL} - }; - - PrettyNamesHash *map = new PrettyNamesHash(100); - wxArrayString *arr = new wxArrayString(); - arr->Add(L"Local"); - - for (int i = 0; encodingNames[i].real != NULL; i++) { - // Verify that iconv actually supports converting to and from this encoding - iconv_t cd = iconv_open(encodingNames[i].real, WCHAR_T_ENCODING); - if (cd == iconv_invalid) continue; - iconv_close(cd); - - cd = iconv_open(WCHAR_T_ENCODING, encodingNames[i].real); - if (cd == iconv_invalid) continue; - iconv_close(cd); - - wxString pretty = wxString::FromAscii(encodingNames[i].pretty); - arr->Add(pretty); - (*map)[pretty] = wxString::FromAscii(encodingNames[i].real); + try { + if (srcLen != (size_t)-1) { + if (src[srcLen - 1] == 0) srcLen -= 1; + srcLen *= sizeof(wchar_t); } - - prettyEncodingList = arr; - prettyEncodingHash = map; + if (dstSize == 0) { + return conv.RequiredBufferSize(reinterpret_cast(src), srcLen); + } + return conv.Convert(reinterpret_cast(src), srcLen, dst, dstSize); + } + catch (agi::charset::ConvError const&) { + return (size_t)-1; } - return *prettyEncodingList; } -static AegisubCSConv localConv(L"Local", false); -AegisubCSConv& csConvLocal(localConv); +static AegisubCSConvImpl localConv; +AegisubCSConv& csConvLocal = localConv; diff --git a/aegisub/src/charset_conv.h b/aegisub/src/charset_conv.h index 28700819b..7baa85f27 100644 --- a/aegisub/src/charset_conv.h +++ b/aegisub/src/charset_conv.h @@ -35,135 +35,38 @@ /// #ifndef AGI_PRE -#include -#include - -#include #include #include #include #endif #include "aegisub_endian.h" - -#if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG) -#define ICONV_POSIX -#endif - -/// @class iconv_wrapper -/// @brief RAII wrapper for iconv -class iconv_wrapper { -private: - iconv_t conv; -public: - iconv_wrapper(const char *to, const char *from) - : conv(iconv_open(to, from)) - { } - iconv_wrapper(wxString const& to, wxString const& from) - : conv(iconv_open(to.ToAscii(), from.ToAscii())) - { } - iconv_wrapper(const char *to, wxString const& from) - : conv(iconv_open(to, from.ToAscii())) - { } - iconv_wrapper(wxString const& to, const char *from) - : conv(iconv_open(to.ToAscii(), from)) - { } - ~iconv_wrapper() { - if (conv != (iconv_t)-1) iconv_close(conv); - } - operator iconv_t() { - return conv; - } - operator const iconv_t() const { - return conv; - } -}; +#include /// @class AegisubCSConv /// @brief wxMBConv implementation for converting to and from unicode class AegisubCSConv : public wxMBConv { public: - /// @param mbEncName Multibyte encoding to convert to/from - /// @param enableSubst Whether to substitute characters when needed. - /// By default, any conversion that would be lossy will fail - /// When enableSubst is true, conversions to multibyte with a sufficiently - /// large buffer are guaranteed to succeed, with characters dropped or - /// changed as needed to fit the string into the target encoding. - AegisubCSConv(const wxChar *mbEncName, bool enableSubst = false); // wxMBConv implementation; see strconv.h for usage details size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; - size_t GetMBNulLen() const; - wxMBConv *Clone() const; - - /// @brief Multibyte-aware strlen - /// @return Length in bytes of str (excluding terminator) - size_t MBBuffLen(const char *str) const; - - /// @brief Get a list of support encodings with user-friendly names - static wxArrayString GetEncodingsList(); - /// @brief Get a list of all encodings supported by iconv - /// Requires GNU iconv for useful results - static wxArrayString GetAllSupportedEncodings(); - /// @brief Map a user-friendly encoding name to the real encoding name - static wxString GetRealEncodingName(wxString name); + wxMBConv *Clone() const { return NULL; }; +protected: + AegisubCSConv(); private: - // The smattering of mutable variables here are due to that ToWChar and - // FromWChar are const in wxMBConv, but we require minor mutation for - // things like locks (as iconv is not thread-safe) - wxString wcCharsetName; - wxString mbCharsetName; - mutable size_t mbNulLen; - bool enableSubst; - - size_t doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const; - size_t iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) const; - - static void ucToMbFallback( - unsigned int code, - void (*callback) (const char *buf, size_t buflen, void* callback_arg), - void *callback_arg, - void *convPtr); - - /// Replacement character for characters which do not fit in the target - /// encoding and iconv does not have an appropriate substitute for - char invalidRep[8]; - size_t invalidRepSize; - -#ifndef ICONV_POSIX - mutable iconv_fallbacks fallbacks; -#endif + AegisubCSConv(const AegisubCSConv&); + AegisubCSConv& operator=(const AegisubCSConv&); + wxString localCharset; #if wxUSE_THREADS mutable wxMutex iconvMutex; #endif -protected: - iconv_wrapper m2w, w2m; + // ToWChar and FromWChar are const in wxMBConv, but iconv can't be used + // immutably + mutable agi::charset::IconvWrapper conv; }; -// Predefined conversion for the current locale, intended to be a drop-in -// replacement for wxConvLocal extern AegisubCSConv& csConvLocal; - -#ifdef HAVE_BIG_ENDIAN -# if SIZEOF_WCHAR_T == 4 -# define WCHAR_T_ENCODING "UTF-32BE" -# elif SIZEOF_WCHAR_T == 2 -# define WCHAR_T_ENCODING "UTF-16BE" -# endif -#elif defined(HAVE_LITTLE_ENDIAN) -# if SIZEOF_WCHAR_T == 4 -# define WCHAR_T_ENCODING "UTF-32LE" -# elif SIZEOF_WCHAR_T == 2 -# define WCHAR_T_ENCODING "UTF-16LE" -# endif -#else -# if SIZEOF_WCHAR_T == 4 -# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-32BE" : "UTF-32LE") -# elif SIZEOF_WCHAR_T == 2 -# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-16BE" : "UTF-16LE") -# endif -#endif diff --git a/aegisub/src/compat.cpp b/aegisub/src/compat.cpp index f9781ee1b..e12251513 100644 --- a/aegisub/src/compat.cpp +++ b/aegisub/src/compat.cpp @@ -7,8 +7,8 @@ wxArrayString lagi_MRU_wxAS(const wxString &list) { const agi::MRUManager::MRUListMap *map_list = AegisubApp::Get()->mru->Get(STD_STR(list)); for (agi::MRUManager::MRUListMap::const_iterator i_lst = map_list->begin(); i_lst != map_list->end(); ++i_lst) { - work.Add(wxString(i_lst->second)); + work.Add(wxString(i_lst->second.c_str(), wxConvUTF8)); } - return work; + return work; } diff --git a/aegisub/src/compat.h b/aegisub/src/compat.h index fe794f772..af1609e01 100644 --- a/aegisub/src/compat.h +++ b/aegisub/src/compat.h @@ -8,8 +8,8 @@ #include -#define STD_STR(x) std::string(x.mb_str()) +#define STD_STR(x) std::string(x.utf8_str()) inline wxColour lagi_wxColour(const agi::Colour &colour) { return wxColour(colour); } -inline wxString lagi_wxString(const std::string &str) { return wxString(str); } +inline wxString lagi_wxString(const std::string &str) { return wxString(str.c_str(), wxConvUTF8); } wxArrayString lagi_MRU_wxAS(const wxString &list); diff --git a/aegisub/src/dialog_export.cpp b/aegisub/src/dialog_export.cpp index c6ee51a11..88c4f4be6 100644 --- a/aegisub/src/dialog_export.cpp +++ b/aegisub/src/dialog_export.cpp @@ -34,9 +34,6 @@ /// @ingroup export /// - -/////////// -// Headers #include "config.h" #ifndef AGI_PRE @@ -102,7 +99,7 @@ DialogExport::DialogExport (wxWindow *parent) // Charset dropdown list wxStaticText *charset_list_label = new wxStaticText(this, -1, _("Text encoding:")); - CharsetList = new wxChoice(this, Charset_List_Box, wxDefaultPosition, wxDefaultSize, AegisubCSConv::GetEncodingsList()); + CharsetList = new wxChoice(this, Charset_List_Box, wxDefaultPosition, wxDefaultSize, agi::charset::GetEncodingsList()); wxSizer *charset_list_sizer = new wxBoxSizer(wxHORIZONTAL); charset_list_sizer->Add(charset_list_label, 0, wxALIGN_CENTER | wxRIGHT, 5); charset_list_sizer->Add(CharsetList, 1, wxEXPAND); @@ -219,6 +216,9 @@ void DialogExport::OnProcess(wxCommandEvent &event) { wxString err(error); wxMessageBox(err, _T("Error exporting subtitles"), wxOK | wxICON_ERROR, this); } + catch (const agi::charset::ConvError& err) { + wxMessageBox(err.GetMessage(), _T("Error exporting subtitles"), wxOK | wxICON_ERROR, this); + } catch (...) { wxMessageBox(_T("Unknown error"), _T("Error exporting subtitles"), wxOK | wxICON_ERROR, this); } diff --git a/aegisub/src/frame_main.cpp b/aegisub/src/frame_main.cpp index 78ad44fa5..194f7546e 100644 --- a/aegisub/src/frame_main.cpp +++ b/aegisub/src/frame_main.cpp @@ -713,8 +713,7 @@ void FrameMain::LoadSubtitles (wxString filename,wxString charset) { // Make sure that file isn't actually a timecode file try { TextFileReader testSubs(filename,charset); - charset = testSubs.GetCurrentEncoding(); - isBinary = charset == _T("binary"); + isBinary = testSubs.IsBinary(); if (!isBinary && testSubs.HasMoreLines()) { wxString cur = testSubs.ReadLineFromFile(); if (cur.Left(10) == _T("# timecode")) { @@ -817,8 +816,7 @@ bool FrameMain::SaveSubtitles(bool saveas,bool withCharset) { // Get charset wxString charset = _T(""); if (withCharset) { - wxArrayString choices = AegisubCSConv::GetEncodingsList(); - charset = wxGetSingleChoice(_("Choose charset code:"), _T("Charset"),choices,this,-1, -1,true,250,200); + charset = wxGetSingleChoice(_("Choose charset code:"), _T("Charset"),agi::charset::GetEncodingsList(),this,-1, -1,true,250,200); if (charset.IsEmpty()) return false; } diff --git a/aegisub/src/frame_main_events.cpp b/aegisub/src/frame_main_events.cpp index c38bd2978..cd9bcdb6e 100644 --- a/aegisub/src/frame_main_events.cpp +++ b/aegisub/src/frame_main_events.cpp @@ -538,7 +538,7 @@ int FrameMain::AddMacroMenuItems(wxMenu *menu, const std::vectormru->GetEntry("Subtitle", number)); + LoadSubtitles(lagi_wxString(AegisubApp::Get()->mru->GetEntry("Subtitle", number))); } @@ -548,7 +548,7 @@ void FrameMain::OnOpenRecentSubs(wxCommandEvent &event) { /// void FrameMain::OnOpenRecentVideo(wxCommandEvent &event) { int number = event.GetId()-Menu_Video_Recent; - LoadVideo(AegisubApp::Get()->mru->GetEntry("Video", number)); + LoadVideo(lagi_wxString(AegisubApp::Get()->mru->GetEntry("Video", number))); } @@ -558,7 +558,7 @@ void FrameMain::OnOpenRecentVideo(wxCommandEvent &event) { /// void FrameMain::OnOpenRecentTimecodes(wxCommandEvent &event) { int number = event.GetId()-Menu_Timecodes_Recent; - LoadVFR(AegisubApp::Get()->mru->GetEntry("Timecodes", number)); + LoadVFR(lagi_wxString(AegisubApp::Get()->mru->GetEntry("Timecodes", number))); } @@ -568,7 +568,7 @@ void FrameMain::OnOpenRecentTimecodes(wxCommandEvent &event) { /// void FrameMain::OnOpenRecentKeyframes(wxCommandEvent &event) { int number = event.GetId()-Menu_Keyframes_Recent; - KeyFrameFile::Load(AegisubApp::Get()->mru->GetEntry("Keyframes", number)); + KeyFrameFile::Load(lagi_wxString(AegisubApp::Get()->mru->GetEntry("Keyframes", number))); videoBox->videoSlider->Refresh(); audioBox->audioDisplay->Update(); Refresh(); @@ -581,7 +581,7 @@ void FrameMain::OnOpenRecentKeyframes(wxCommandEvent &event) { /// void FrameMain::OnOpenRecentAudio(wxCommandEvent &event) { int number = event.GetId()-Menu_Audio_Recent; - LoadSubtitles(AegisubApp::Get()->mru->GetEntry("Audio", number)); + LoadAudio(lagi_wxString(AegisubApp::Get()->mru->GetEntry("Audio", number))); } @@ -805,13 +805,12 @@ void FrameMain::OnOpenSubtitles(wxCommandEvent& WXUNUSED(event)) { /// void FrameMain::OnOpenSubtitlesCharset(wxCommandEvent& WXUNUSED(event)) { // Initialize charsets - wxArrayString choices = AegisubCSConv::GetEncodingsList(); wxString path = lagi_wxString(OPT_GET("Path/Last/Subtitles")->GetString()); // Get options and load wxString filename = wxFileSelector(_("Open subtitles file"),path,_T(""),_T(""),AssFile::GetWildcardList(0),wxFD_OPEN | wxFD_FILE_MUST_EXIST); if (!filename.empty()) { - wxString charset = wxGetSingleChoice(_("Choose charset code:"), _("Charset"),choices,this,-1, -1,true,250,200); + wxString charset = wxGetSingleChoice(_("Choose charset code:"), _("Charset"),agi::charset::GetEncodingsList(),this,-1, -1,true,250,200); if (!charset.empty()) { LoadSubtitles(filename,charset); } diff --git a/aegisub/src/hotkeys.cpp b/aegisub/src/hotkeys.cpp index 1d25938f3..9833fce95 100644 --- a/aegisub/src/hotkeys.cpp +++ b/aegisub/src/hotkeys.cpp @@ -300,7 +300,7 @@ void HotkeyManager::Load() { TextFileReader file(filename); wxString header; try { - if (file.GetCurrentEncoding() != _T("binary")) + if (!file.IsBinary()) header = file.ReadLineFromFile(); } catch (wxString e) { diff --git a/aegisub/src/main.cpp b/aegisub/src/main.cpp index d758b29b7..75ab1cf24 100644 --- a/aegisub/src/main.cpp +++ b/aegisub/src/main.cpp @@ -263,6 +263,10 @@ emit_stdout->Enable(); wxMessageBox(err,_T("Fatal error while initializing")); return false; } + catch (agi::Exception const& e) { + wxMessageBox(e.GetMessage(),_T("Fatal error while initializing")); + return false; + } catch (...) { wxMessageBox(_T("Unhandled exception"),_T("Fatal error while initializing")); diff --git a/aegisub/src/preferences.cpp b/aegisub/src/preferences.cpp index 9f6bc0d8f..ab6866157 100644 --- a/aegisub/src/preferences.cpp +++ b/aegisub/src/preferences.cpp @@ -29,6 +29,7 @@ #include #include "colour_button.h" +#include "compat.h" #include "libresrc/libresrc.h" #include "preferences.h" #include "main.h" @@ -172,7 +173,7 @@ void Preferences::OptionAdd(wxPanel *parent, wxFlexGridSizer *flex, const wxStri case agi::OptionValue::Type_String: { flex->Add(new wxStaticText(parent, wxID_ANY, name), 1, wxALIGN_CENTRE_VERTICAL); - wxTextCtrl *text = new wxTextCtrl(parent, wxID_ANY , opt->GetString(), wxDefaultPosition, wxDefaultSize); + wxTextCtrl *text = new wxTextCtrl(parent, wxID_ANY , lagi_wxString(opt->GetString()), wxDefaultPosition, wxDefaultSize); flex->Add(text, 1, wxEXPAND); break; } diff --git a/aegisub/src/spellchecker_hunspell.cpp b/aegisub/src/spellchecker_hunspell.cpp index aa80eda00..91cedc1a9 100644 --- a/aegisub/src/spellchecker_hunspell.cpp +++ b/aegisub/src/spellchecker_hunspell.cpp @@ -59,6 +59,8 @@ #include "options.h" #include "spellchecker_hunspell.h" #include "standard_paths.h" +#include "text_file_reader.h" +#include "text_file_writer.h" #include "utils.h" @@ -66,6 +68,7 @@ HunspellSpellChecker::HunspellSpellChecker() { hunspell = NULL; conv = NULL; + rconv = NULL; SetLanguage(lagi_wxString(OPT_GET("Tool/Spell Checker/Language")->GetString())); } @@ -84,6 +87,8 @@ void HunspellSpellChecker::Reset() { hunspell = NULL; delete conv; conv = NULL; + delete rconv; + rconv = NULL; affpath.Clear(); dicpath.Clear(); } @@ -96,8 +101,13 @@ void HunspellSpellChecker::Reset() { /// bool HunspellSpellChecker::CanAddWord(wxString word) { if (!hunspell) return false; - wxCharBuffer buffer = word.mb_str(*conv); - return (buffer.data() != NULL); + try { + conv->Convert(word); + return true; + } + catch (agi::charset::ConvError const&) { + return false; + } } @@ -111,9 +121,9 @@ void HunspellSpellChecker::AddWord(wxString word) { // Add to currently loaded file #ifdef WITH_OLD_HUNSPELL - hunspell->put_word(word.mb_str(*conv)); + hunspell->put_word(conv->Convert(word).c_str()); #else - hunspell->add(word.mb_str(*conv)); + hunspell->add(conv->Convert(word).c_str()); #endif // Ensure that the path exists @@ -124,22 +134,14 @@ void HunspellSpellChecker::AddWord(wxString word) { // Load dictionary wxArrayString dic; - wxString curLine; bool added = false; if (fn.FileExists()) { // Even if you ever want to remove this "if", keep the braces, so the stream closes at the end bool first = true; - wxFileInputStream in(usrdicpath); - if (!in.IsOk()) return; - wxTextInputStream textIn(in,_T(" \t"),*conv); - - // Read it - while (in.CanRead() && !in.Eof()) { - // Read line - curLine = textIn.ReadLine(); - curLine.Trim(); + TextFileReader reader(usrdicpath, L"UTF-8"); + while (reader.HasMoreLines()) { + wxString curLine = reader.ReadLineFromFile(); if (curLine.IsEmpty()) continue; - // First if (first) { first = false; if (curLine.IsNumber()) continue; @@ -160,11 +162,14 @@ void HunspellSpellChecker::AddWord(wxString word) { if (!added) dic.Add(word); // Write back to disk - wxFileOutputStream out(usrdicpath); - if (!out.IsOk()) return; - wxTextOutputStream textOut(out,wxEOL_UNIX,*conv); - textOut.WriteString(wxString::Format(_T("%i"),dic.Count())+_T("\n")); - for (unsigned int i=0;ispell(buf) == 1); - return false; + try { + return hunspell->spell(conv->Convert(word).c_str()) == 1; + } + catch (agi::charset::ConvError const&) { + return false; + } } @@ -187,31 +195,26 @@ bool HunspellSpellChecker::CheckWord(wxString word) { /// @return List of suggestions /// wxArrayString HunspellSpellChecker::GetSuggestions(wxString word) { - // Array wxArrayString suggestions; + if (!hunspell) return suggestions; - // Get suggestions - if (hunspell) { - // Word - wxCharBuffer buf = word.mb_str(*conv); - if (!buf) return suggestions; - + try { // Grab raw from Hunspell char **results; - int n = hunspell->suggest(&results,buf); + int n = hunspell->suggest(&results,conv->Convert(word).c_str()); // Convert each for (int i=0;iConvert(results[i])); delete results[i]; } - // Delete delete results; } + catch (agi::charset::ConvError const&) { + return suggestions; + } - // Return them return suggestions; } @@ -279,25 +282,23 @@ void HunspellSpellChecker::SetLanguage(wxString language) { hunspell = new Hunspell(affpath.mb_str(csConvLocal),dicpath.mb_str(csConvLocal)); conv = NULL; if (hunspell) { - conv = new AegisubCSConv(wxString(hunspell->get_dic_encoding(),wxConvUTF8)); - - // Load user dictionary - if (wxFileExists(usrdicpath)) { - wxFileInputStream in(usrdicpath); - if (!in.IsOk()) return; - wxTextInputStream textIn(in,_T(" \t"),*conv); - while (in.CanRead() && !in.Eof()) { - // Read line - wxString curLine = textIn.ReadLine(); - curLine.Trim(); + conv = new agi::charset::IconvWrapper("wchar_t", hunspell->get_dic_encoding()); + rconv = new agi::charset::IconvWrapper(hunspell->get_dic_encoding(), "wchar_t"); + try { + TextFileReader reader(usrdicpath, L"UTF-8"); + while (reader.HasMoreLines()) { + wxString curLine = reader.ReadLineFromFile(); if (curLine.IsEmpty() || curLine.IsNumber()) continue; #ifdef WITH_OLD_HUNSPELL - hunspell->put_word(curLine.mb_str(*conv)); + hunspell->put_word(conv->Convert(curLine).c_str()); #else - hunspell->add(curLine.mb_str(*conv)); + hunspell->add(conv->Convert(curLine).c_str()); #endif } } + catch (const wchar_t *) { + // file not found + } } } diff --git a/aegisub/src/spellchecker_hunspell.h b/aegisub/src/spellchecker_hunspell.h index c8c7341fb..dfa407292 100644 --- a/aegisub/src/spellchecker_hunspell.h +++ b/aegisub/src/spellchecker_hunspell.h @@ -43,6 +43,11 @@ #include #include "include/aegisub/spellchecker.h" +namespace agi { + namespace charset { + class IconvWrapper; + } +} /// @class HunspellSpellChecker @@ -55,7 +60,8 @@ private: Hunspell *hunspell; /// Conversion buffer - wxMBConv *conv; + agi::charset::IconvWrapper *conv; + agi::charset::IconvWrapper *rconv; /// Path to .aff file wxString affpath; diff --git a/aegisub/src/text_file_reader.cpp b/aegisub/src/text_file_reader.cpp index aae4d3e49..039cd3993 100644 --- a/aegisub/src/text_file_reader.cpp +++ b/aegisub/src/text_file_reader.cpp @@ -51,8 +51,15 @@ #include "charset_detect.h" #include "text_file_reader.h" -TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim) -: encoding(enc), conv((iconv_t)-1), trim(trim), readComplete(false), currout(0), outptr(0), currentLine(0) { +TextFileReader::TextFileReader(wxString const& filename, wxString encoding, bool trim) +: isBinary(false) +, conv() +, trim(trim) +, readComplete(false) +, currout(0) +, outptr(0) +, currentLine(0) +{ #ifdef __WINDOWS__ file.open(filename.wc_str(),std::ios::in | std::ios::binary); #else @@ -61,16 +68,14 @@ TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim) if (!file.is_open()) throw L"Failed opening file for reading."; if (encoding.IsEmpty()) encoding = CharSetDetect::GetEncoding(filename); - if (encoding == L"binary") return; - encoding = AegisubCSConv::GetRealEncodingName(encoding); - conv = iconv_open(WCHAR_T_ENCODING, encoding.ToAscii()); - if (conv == (iconv_t)-1) { - throw wxString::Format(L"Character set '%s' is not supported.", enc.c_str()); + if (encoding == L"binary") { + isBinary = true; + return; } + conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "wchar_t")); } TextFileReader::~TextFileReader() { - if (conv != (iconv_t)-1) iconv_close(conv); } wchar_t TextFileReader::GetWChar() { @@ -98,7 +103,8 @@ wchar_t TextFileReader::GetWChar() { return 0; do { - size_t ret = iconv(conv, &inptr, &inbytesleft, reinterpret_cast(&outptr), &outbytesleft); + // Without this const_cast the wrong overload is chosen + size_t ret = conv->Convert(const_cast(&inptr), &inbytesleft, reinterpret_cast(&outptr), &outbytesleft); if (ret != (size_t)-1) break; int err = errno; @@ -144,7 +150,6 @@ wxString TextFileReader::ReadLineFromFile() { if (ch == 0) readComplete = true; - // Trim if (trim) { buffer.Trim(true); buffer.Trim(false); @@ -155,7 +160,3 @@ wxString TextFileReader::ReadLineFromFile() { bool TextFileReader::HasMoreLines() { return !readComplete; } - -wxString TextFileReader::GetCurrentEncoding() { - return encoding; -} diff --git a/aegisub/src/text_file_reader.h b/aegisub/src/text_file_reader.h index 66ebb5f4e..1f6c9bf6a 100644 --- a/aegisub/src/text_file_reader.h +++ b/aegisub/src/text_file_reader.h @@ -38,21 +38,23 @@ #ifndef AGI_PRE #include - -#include +#include #include #include #endif +namespace agi { namespace charset { + class IconvWrapper; +} } + /// @class TextFileReader /// @brief A line-based text file reader class TextFileReader { private: - /// Encoding of the file being read - wxString encoding; + bool isBinary; std::ifstream file; - iconv_t conv; + std::auto_ptr conv; bool trim; bool readComplete; @@ -76,7 +78,7 @@ public: /// @param filename File to open /// @param enc Encoding to use, or empty to autodetect /// @param trim Whether to trim whitespace from lines read - TextFileReader(wxString filename,wxString encoding=L"", bool trim=true); + TextFileReader(wxString const& filename,wxString encoding=L"", bool trim=true); /// @brief Destructor ~TextFileReader(); @@ -85,8 +87,5 @@ public: wxString ReadLineFromFile(); /// @brief Check if there are any more lines to read bool HasMoreLines(); - - /// @brief Get the file encoding used by this reader - /// @return "unknown", "binary", or a character encoding name - wxString GetCurrentEncoding(); + bool IsBinary() { return isBinary; } }; diff --git a/aegisub/src/text_file_writer.cpp b/aegisub/src/text_file_writer.cpp index bae68f22d..60d990b3d 100644 --- a/aegisub/src/text_file_writer.cpp +++ b/aegisub/src/text_file_writer.cpp @@ -51,7 +51,7 @@ /// @param filename /// @param encoding /// -TextFileWriter::TextFileWriter(wxString filename, wxString encoding) +TextFileWriter::TextFileWriter(wxString const& filename, wxString encoding) : conv() { #ifdef WIN32 file.open(filename.wc_str(),std::ios::out | std::ios::binary | std::ios::trunc); @@ -59,17 +59,17 @@ TextFileWriter::TextFileWriter(wxString filename, wxString encoding) file.open(wxFNCONV(filename),std::ios::out | std::ios::binary | std::ios::trunc); #endif if (!file.is_open()) { - throw _T("Failed opening file for writing."); + throw L"Failed opening file for writing."; } - if (encoding.IsEmpty()) encoding = lagi_wxString(OPT_GET("App/Save Charset")->GetString()); - conv.reset(new AegisubCSConv(encoding, true)); + if (encoding.empty()) encoding = lagi_wxString(OPT_GET("App/Save Charset")->GetString()); + conv.reset(new agi::charset::IconvWrapper("utf-8", encoding.c_str(), true)); // Write the BOM try { - WriteLineToFile(_T("\uFEFF"), false); + WriteLineToFile(L"\uFEFF", false); } - catch (wxString ignore) { + catch (agi::charset::ConversionFailure&) { // If the BOM could not be converted to the target encoding it isn't needed } } @@ -85,14 +85,11 @@ TextFileWriter::~TextFileWriter() { /// @brief DOCME /// @param line /// @param addLineBreak -/// void TextFileWriter::WriteLineToFile(wxString line, bool addLineBreak) { - wxString temp = line; - if (addLineBreak) temp += _T("\r\n"); + if (addLineBreak) line += L"\n"; - wxCharBuffer buf = temp.mb_str(*conv); - if (buf.data()) - file.write(buf.data(), conv->MBBuffLen(buf.data())); + std::string buf = conv->Convert(line.utf8_str().data()); + file.write(buf.data(), buf.size()); } diff --git a/aegisub/src/text_file_writer.h b/aegisub/src/text_file_writer.h index 79ff5b26f..3b6de7c50 100644 --- a/aegisub/src/text_file_writer.h +++ b/aegisub/src/text_file_writer.h @@ -43,8 +43,11 @@ #include #endif - -class AegisubCSConv; +namespace agi { + namespace charset { + class IconvWrapper; + } +} /// DOCME @@ -59,13 +62,13 @@ private: std::ofstream file; /// DOCME - std::auto_ptr conv; + std::auto_ptr conv; TextFileWriter(const TextFileWriter&); TextFileWriter& operator=(const TextFileWriter&); public: - TextFileWriter(wxString filename, wxString encoding=_T("")); + TextFileWriter(wxString const& filename, wxString encoding=""); ~TextFileWriter(); void WriteLineToFile(wxString line, bool addLineBreak=true); diff --git a/aegisub/src/video_provider_manager.cpp b/aegisub/src/video_provider_manager.cpp index 95c9842fc..1ebc28ee1 100644 --- a/aegisub/src/video_provider_manager.cpp +++ b/aegisub/src/video_provider_manager.cpp @@ -69,7 +69,7 @@ VideoProvider *VideoProviderFactoryManager::GetProvider(wxString video) { } try { - VideoProvider *y4m_provider = new YUV4MPEGVideoProvider(video.wc_str()); + VideoProvider *y4m_provider = new YUV4MPEGVideoProvider(video); if (y4m_provider) y4m_provider = new VideoProviderCache(y4m_provider); return y4m_provider; @@ -92,7 +92,7 @@ VideoProvider *VideoProviderFactoryManager::GetProvider(wxString video) { for (unsigned int i=0;iCreateProvider(video.wc_str()); + VideoProvider *provider = GetFactory(list[i])->CreateProvider(video); if (provider) { // Cache if necessary if (provider->WantsCaching()) { diff --git a/aegisub/tests/Makefile.am b/aegisub/tests/Makefile.am index f5412fb18..3930dbd24 100644 --- a/aegisub/tests/Makefile.am +++ b/aegisub/tests/Makefile.am @@ -12,6 +12,7 @@ run_SOURCES = \ util_unix.cpp \ libaegisub_access.cpp \ libaegisub_cajun.cpp \ + libaegisub_iconv.cpp \ libaegisub_util.cpp \ libaegisub_mru.cpp diff --git a/aegisub/tests/libaegisub_iconv.cpp b/aegisub/tests/libaegisub_iconv.cpp new file mode 100644 index 000000000..14af4b6ae --- /dev/null +++ b/aegisub/tests/libaegisub_iconv.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file libaegisub_iconv.cpp +/// @brief agi::charset +/// @ingroup iconv + +#include +#include + +#include "main.h" +#include "util.h" + +using namespace agi::charset; + +TEST(lagi_iconv, BasicSetup) { + EXPECT_NO_THROW(IconvWrapper("UTF-8", "UTF-16LE")); +} + +TEST(lagi_iconv, InvalidConversions) { + EXPECT_THROW(IconvWrapper("nonexistent charset", "UTF-16LE"), UnsupportedConversion); + EXPECT_THROW(IconvWrapper("UTF-16LE", "nonexistent charset"), UnsupportedConversion); + EXPECT_THROW(IconvWrapper("nonexistent charset", "nonexistent charset"), UnsupportedConversion); +} + +TEST(lagi_iconv, StrLen1) { + IconvWrapper conv("UTF-8", "UTF-8", false); + for (int i = 0; i < 10; i++) { + std::string str(i, ' '); + ASSERT_EQ(i, conv.SrcStrLen(str.c_str())); + ASSERT_EQ(i, conv.DstStrLen(str.c_str())); + } +} +TEST(lagi_iconv, StrLen2) { + IconvWrapper conv("UTF-16LE", "UTF-16LE", false); + for (int i = 0; i < 10; i++) { + std::basic_string str(i, ' '); + ASSERT_EQ(2*i, conv.SrcStrLen((const char *)str.c_str())); + ASSERT_EQ(2*i, conv.DstStrLen((const char *)str.c_str())); + } +} +TEST(lagi_iconv, StrLen4) { + IconvWrapper conv("UTF-32LE", "UTF-32LE", false); + for (int i = 0; i < 10; i++) { + std::basic_string str(i, ' '); + ASSERT_EQ(4*i, conv.SrcStrLen((const char *)str.c_str())); + ASSERT_EQ(4*i, conv.DstStrLen((const char *)str.c_str())); + } +} + +TEST(lagi_iconv, Fallbacks) { + IconvWrapper nofallback("UTF-8", "Shift-JIS", false); + IconvWrapper fallback("UTF-8", "Shift-JIS", true); + IconvWrapper noneneeded("UTF-8", "UTF-16LE", false); + + // Shift-JIS does not have a backslash + EXPECT_THROW(nofallback.Convert("\\"), BadOutput); + ASSERT_NO_THROW(fallback.Convert("\\")); + EXPECT_EQ("\\", fallback.Convert("\\")); + EXPECT_NO_THROW(noneneeded.Convert("\\")); + + // BOM into non-unicode + char bom[] = "\xEF\xBB\xBF"; + EXPECT_THROW(nofallback.Convert(bom), BadOutput); + ASSERT_NO_THROW(fallback.Convert(bom)); + EXPECT_EQ("", fallback.Convert(bom)); + EXPECT_NO_THROW(noneneeded.Convert(bom)); + + // A snowman (U+2603) + char snowman[] = "\xE2\x98\x83"; + EXPECT_THROW(nofallback.Convert(snowman), BadOutput); + EXPECT_NO_THROW(noneneeded.Convert(snowman)); + ASSERT_NO_THROW(fallback.Convert(snowman)); + EXPECT_EQ("?", fallback.Convert(snowman)); +} + +TEST(lagi_iconv, BadInput) { + IconvWrapper utf16("UTF-16LE", "UTF-8"); + EXPECT_THROW(utf16.Convert(" "), BadInput); + IconvWrapper utf8("UTF-8", "UTF-16LE"); + EXPECT_THROW(utf8.Convert("\xE2\xFF"), BadInput); +} + +TEST(lagi_iconv, Conversions) { + IconvWrapper utf16le("UTF-16LE", "UTF-8", false); + IconvWrapper utf16be("UTF-16BE", "UTF-8", false); + IconvWrapper utf8("UTF-8", "UTF-16LE", false); + + char space_utf8_[] = " "; + char space_utf16be_[] = {0, 32, 0, 0}; + char space_utf16le_[] = {32, 0, 0, 0}; + std::string space_utf8(space_utf8_); + std::string space_utf16be(space_utf16be_, 2); + std::string space_utf16le(space_utf16le_, 2); + + EXPECT_EQ(space_utf8, utf16le.Convert(space_utf16le)); + EXPECT_EQ(space_utf8, utf16be.Convert(space_utf16be)); + EXPECT_EQ(space_utf16le, utf8.Convert(space_utf8)); +} + +// Basic overflow tests +TEST(lagi_iconv, Buffer) { + IconvWrapper conv("UTF-8", "UTF-16LE", false); + char buff[32]; + memset(buff, 0xFF, sizeof(buff)); + + EXPECT_THROW(conv.Convert("", 1, buff, 0), BufferTooSmall); + EXPECT_EQ('\xFF', buff[0]); + EXPECT_THROW(conv.Convert("", 1, buff, 1), BufferTooSmall); + EXPECT_EQ('\xFF', buff[0]); + EXPECT_NO_THROW(conv.Convert("", 1, buff, 2)); + EXPECT_EQ('\0', buff[0]); + EXPECT_EQ('\0', buff[1]); + EXPECT_EQ('\xFF', buff[2]); +} + +TEST(lagi_iconv, LocalSupport) { + ASSERT_NO_THROW(IconvWrapper("UTF-8", "")); + IconvWrapper conv("UTF-8", ""); + ASSERT_NO_THROW(conv.Convert(" ")); + EXPECT_EQ(" ", conv.Convert(" ")); +} +TEST(lagi_iconv, wchar_tSupport) { + EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t")); +}