From 66232396820f0767b0a36d750e1ae4d940de6be6 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Tue, 6 Jul 2010 19:23:10 +0000 Subject: [PATCH] Work around BOM-related issues with UTF-32 and UTF-16 with unspecified byte order Originally committed to SVN as r4656. --- aegisub/libaegisub/common/charset_conv.cpp | 81 ++++++++++++++----- .../include/libaegisub/charset_conv.h | 9 +-- aegisub/tests/libaegisub_iconv.cpp | 9 +++ 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/aegisub/libaegisub/common/charset_conv.cpp b/aegisub/libaegisub/common/charset_conv.cpp index da53d9f1c..3f5a8cec3 100644 --- a/aegisub/libaegisub/common/charset_conv.cpp +++ b/aegisub/libaegisub/common/charset_conv.cpp @@ -76,18 +76,14 @@ namespace agi { namespace charset { #ifdef ICONV_POSIX -class IconvWrapper::Converter { -public: - Converter(bool, const char*) { } - size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { - return iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); - } +struct iconv_fallbacks { }; -#else -class IconvWrapper::Converter : public iconv_fallbacks { -private: +#endif + +class Converter : public iconv_fallbacks { bool subst; - char invalidRep[4]; + int bomSize; + char invalidRep[8]; size_t invalidRepSize; static void fallback( unsigned int code, @@ -107,32 +103,72 @@ private: } public: Converter(bool subst, const char* targetEnc) - : subst(subst) + : subst(subst) { + +#ifndef ICONV_POSIX data = this; mb_to_uc_fallback = NULL; mb_to_wc_fallback = NULL; uc_to_mb_fallback = fallback; wc_to_mb_fallback = NULL; +#endif - char sbuff[] = "?"; - const char* src = sbuff; - char* dst = invalidRep; - size_t dstLen = 4; - size_t srcLen = 1; + char buff[8]; iconv_t cd = iconv_open(GetRealEncodingName(targetEnc), "UTF-8"); assert(cd != iconv_invalid); + + // Get BOM size (if any) + const char* src = ""; + char *dst = buff; + size_t srcLen = 1; + size_t dstLen = 8; + size_t res = iconv(cd, 
ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen); assert(res != iconv_failed); assert(srcLen == 0); - iconv_close(cd); + src = buff; + bomSize = 0; + for (src = buff; src < dst; ++src) { + if (*src) { + bomSize = (8 - dstLen) / 2; + break; + } + } + + // Get fallback character + char sbuff[] = "?"; + src = sbuff; + dst = invalidRep; + dstLen = 4; + srcLen = 1; + + res = operator()(cd, &src, &srcLen, &dst, &dstLen); + assert(res != iconv_failed); + assert(srcLen == 0); invalidRepSize = 4 - dstLen; + + iconv_close(cd); } size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) { + // If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without + // a specified byte order), skip over it + if (bomSize > 0 && inbytesleft && *inbytesleft) { + // libiconv marks the bom as written after writing the first + // character after the bom rather than when it writes the bom, so + // convert at least one extra character + char bom[8]; + char *dst = bom; + size_t dstSize = min(8, bomSize + *outbytesleft); + const char *src = *inbuf; + size_t srcSize = *inbytesleft; + iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize); + } size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft); +#ifndef ICONV_POSIX if (!subst) return res; // Save original errno so we can return it rather than the result from iconvctl @@ -157,8 +193,8 @@ public: } if (res == iconv_failed && err == E2BIG && *outbytesleft == 0) { // Check for E2BIG false positives - char buff[4]; - size_t buffsize = 4; + char buff[8]; + size_t buffsize = 8; char* out = buff; const char* in = *inbuf; size_t insize = *inbytesleft; @@ -167,7 +203,7 @@ public: res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize); // If no bytes of the output buffer were used, the original // conversion may have been successful - if (buffsize == 4) { + if (buffsize == 8) { err = errno; } else { @@ -177,10 +213,10 @@ public: } errno = err; +#endif 
return res; } }; -#endif // Calculate the size of NUL in the given character set static size_t NulSize(const char* encoding) { @@ -188,6 +224,7 @@ static size_t NulSize(const char* encoding) { // UTF-8 seems like the obvious choice iconv_t cd = iconv_open(GetRealEncodingName(encoding), "UTF-8"); assert(cd != iconv_invalid); + Converter conv(false, GetRealEncodingName(encoding)); char dbuff[4]; char sbuff[] = ""; @@ -196,7 +233,7 @@ static size_t NulSize(const char* encoding) { size_t dstLen = sizeof(dbuff); size_t srcLen = 1; - size_t ret = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen); + size_t ret = conv(cd, &src, &srcLen, &dst, &dstLen); assert(ret != iconv_failed); assert(dst - dbuff > 0); iconv_close(cd); diff --git a/aegisub/libaegisub/include/libaegisub/charset_conv.h b/aegisub/libaegisub/include/libaegisub/charset_conv.h index 8786e0451..2c1c17e9e 100644 --- a/aegisub/libaegisub/include/libaegisub/charset_conv.h +++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h @@ -51,13 +51,12 @@ T const& GetEncodingsList() { typedef void* iconv_t; +// Helper class that abstracts away the differences between libiconv and +// POSIX iconv implementations +class Converter; + /// @brief A C++ wrapper for iconv class IconvWrapper { -private: - // Helper class that abstracts away the differences betwen libiconv and - // POSIX iconv implementations - class Converter; - iconv_t cd; size_t toNulLen; size_t fromNulLen; diff --git a/aegisub/tests/libaegisub_iconv.cpp b/aegisub/tests/libaegisub_iconv.cpp index 14af4b6ae..74b0b896e 100644 --- a/aegisub/tests/libaegisub_iconv.cpp +++ b/aegisub/tests/libaegisub_iconv.cpp @@ -21,6 +21,7 @@ #include #include + #include "main.h" #include "util.h" @@ -136,3 +137,11 @@ TEST(lagi_iconv, LocalSupport) { TEST(lagi_iconv, wchar_tSupport) { EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t")); } + +TEST(lagi_iconv, pretty_names) { + std::vector names = GetEncodingsList >(); + for (std::vector::iterator cur = names.begin(); cur != 
names.end(); ++cur) { + EXPECT_NO_THROW(IconvWrapper("utf-8", cur->c_str())); + EXPECT_NO_THROW(IconvWrapper(cur->c_str(), "utf-8")); + } +}