Work around BOM-related issues with UTF-32 and UTF-16 with unspecified byte order

Originally committed to SVN as r4656.
2010-07-06 19:23:10 +00:00 · 2010-07-06 19:23:10 +00:00 · 6623239682
commit 6623239682
parent ee4c5dee0b
3 changed files with 72 additions and 27 deletions
--- a/aegisub/libaegisub/common/charset_conv.cpp
+++ b/aegisub/libaegisub/common/charset_conv.cpp
@ -76,18 +76,14 @@ namespace agi {
 	namespace charset {

 #ifdef ICONV_POSIX
-class IconvWrapper::Converter {
-public:
-	Converter(bool, const char*) { }
-	size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
-		return iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
-	}
+struct iconv_fallbacks {
 };
-#else
-class IconvWrapper::Converter : public iconv_fallbacks {
-private:
+#endif
+
+class Converter : public iconv_fallbacks {
 	bool subst;
-	char invalidRep[4];
+	int bomSize;
+	char invalidRep[8];
 	size_t invalidRepSize;
 	static void fallback(
 		unsigned int code,
@ -109,30 +105,70 @@ public:
 	Converter(bool subst, const char* targetEnc)
 	: subst(subst)
 	{
+
+#ifndef ICONV_POSIX
 		data = this;
 		mb_to_uc_fallback = NULL;
 		mb_to_wc_fallback = NULL;
 		uc_to_mb_fallback = fallback;
 		wc_to_mb_fallback = NULL;
+#endif

-		char sbuff[] = "?";
-		const char* src = sbuff;
-		char* dst = invalidRep;
-		size_t dstLen = 4;
-		size_t srcLen = 1;
+		char buff[8];

 		iconv_t cd = iconv_open(GetRealEncodingName(targetEnc), "UTF-8");
 		assert(cd != iconv_invalid);
+
+		// Get BOM size (if any)
+		const char* src = "";
+		char *dst = buff;
+		size_t srcLen = 1;
+		size_t dstLen = 8;
+
 		size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
 		assert(res != iconv_failed);
 		assert(srcLen == 0);
-		iconv_close(cd);
+		src = buff;
+		bomSize = 0;
+		for (src = buff; src < dst; ++src) {
+			if (*src) {
+				bomSize = (8 - dstLen) / 2;
+				break;
+			}
+		}
+
+		// Get fallback character
+		char sbuff[] = "?";
+		src = sbuff;
+		dst = invalidRep;
+		dstLen = 4;
+		srcLen = 1;
+
+		res = operator()(cd, &src, &srcLen, &dst, &dstLen);
+		assert(res != iconv_failed);
+		assert(srcLen == 0);

 		invalidRepSize = 4 - dstLen;
+
+		iconv_close(cd);
 	}
 	size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
+		// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
+		// a specified byte order), skip over it
+		if (bomSize > 0 && inbytesleft && *inbytesleft) {
+			// libiconv only marks the BOM as written once it has written the
+			// first character following the BOM, not when it writes the BOM
+			// itself, so convert at least one extra character
+			char bom[8];
+			char *dst = bom;
+			size_t dstSize = min(8, bomSize + *outbytesleft);
+			const char *src = *inbuf;
+			size_t srcSize = *inbytesleft;
+			iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
+		}
 		size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);

+#ifndef ICONV_POSIX
 		if (!subst) return res;

 		// Save original errno so we can return it rather than the result from iconvctl
@ -157,8 +193,8 @@ public:
 		}
 		if (res == iconv_failed && err == E2BIG && *outbytesleft == 0) {
 			// Check for E2BIG false positives
-			char buff[4];
-			size_t buffsize = 4;
+			char buff[8];
+			size_t buffsize = 8;
 			char* out = buff;
 			const char* in = *inbuf;
 			size_t insize = *inbytesleft;
@ -167,7 +203,7 @@ public:
 			res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
 			// If no bytes of the output buffer were used, the original
 			// conversion may have been successful
-			if (buffsize == 4) {
+			if (buffsize == 8) {
 				err = errno;
 			}
 			else {
@ -177,10 +213,10 @@ public:
 		}

 		errno = err;
+#endif
 		return res;
 	}
 };
-#endif

 // Calculate the size of NUL in the given character set
 static size_t NulSize(const char* encoding) {
@ -188,6 +224,7 @@ static size_t NulSize(const char* encoding) {
 	// UTF-8 seems like the obvious choice
 	iconv_t cd = iconv_open(GetRealEncodingName(encoding), "UTF-8");
 	assert(cd != iconv_invalid);
+	Converter conv(false, GetRealEncodingName(encoding));

 	char dbuff[4];
 	char sbuff[] = "";
@ -196,7 +233,7 @@ static size_t NulSize(const char* encoding) {
 	size_t dstLen = sizeof(dbuff);
 	size_t srcLen = 1;

-	size_t ret = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
+	size_t ret = conv(cd, &src, &srcLen, &dst, &dstLen);
 	assert(ret != iconv_failed);
 	assert(dst - dbuff > 0);
 	iconv_close(cd);
--- a/aegisub/libaegisub/include/libaegisub/charset_conv.h
+++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h
@ -51,13 +51,12 @@ T const& GetEncodingsList() {

 typedef void* iconv_t;

+// Helper class that abstracts away the differences between libiconv and
+// POSIX iconv implementations
+class Converter;
+
 /// @brief A C++ wrapper for iconv
 class IconvWrapper {
-private:
-	// Helper class that abstracts away the differences betwen libiconv and
-	// POSIX iconv implementations
-	class Converter;
-
 	iconv_t cd;
 	size_t toNulLen;
 	size_t fromNulLen;
--- a/aegisub/tests/libaegisub_iconv.cpp
+++ b/aegisub/tests/libaegisub_iconv.cpp
@ -21,6 +21,7 @@
 #include <stdint.h>
 #include <libaegisub/charset_conv.h>

+
 #include "main.h"
 #include "util.h"

@ -136,3 +137,11 @@ TEST(lagi_iconv, LocalSupport) {
 TEST(lagi_iconv, wchar_tSupport) {
 	EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t"));
 }
+
+TEST(lagi_iconv, pretty_names) {
+	std::vector<std::string> names = GetEncodingsList<std::vector<std::string> >();
+	for (std::vector<std::string>::iterator cur = names.begin(); cur != names.end(); ++cur) {
+		EXPECT_NO_THROW(IconvWrapper("utf-8", cur->c_str()));
+		EXPECT_NO_THROW(IconvWrapper(cur->c_str(), "utf-8"));
+	}
+}