Add converter to ISO-6937-2

glibc's iconv implementation supports ISO-6937-2, but libiconv does not, because these days the encoding is only used by a few old subtitle formats. As a result, we need our own converter on every platform except Linux. Conversion from ISO-6937-2 is currently not supported. Originally committed to SVN as r6632.
2012-03-29 19:04:49 +00:00 · 2012-03-29 19:04:49 +00:00 · f31d9a5a8b
commit f31d9a5a8b
parent 71776940f6
9 changed files with 566 additions and 208 deletions
--- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
+++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
@ -267,6 +267,14 @@
 				RelativePath="..\..\libaegisub\common\charset.cpp"
 				>
 			</File>
 			<File
 				RelativePath="..\..\libaegisub\common\charset_6937.cpp"
 				>
 			</File>
 			<File
 				RelativePath="..\..\libaegisub\common\charset_6937.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\libaegisub\common\charset_conv.cpp"
 				>
--- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj
+++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj
@ -45,6 +45,7 @@
  <ItemGroup>
    <ClInclude Include="$(SrcDir)lagi_pre.h" />
    <ClInclude Include="$(SrcDir)config.h" />
    <ClInclude Include="$(SrcDir)common\charset_6937.h" />
    <ClInclude Include="$(SrcDir)common\charset_ucd.h" />
    <ClInclude Include="$(SrcDir)common\option_visit.h" />
    <ClInclude Include="$(SrcDir)include\libaegisub\access.h" />
@ -86,6 +87,7 @@
    <ClCompile Include="$(SrcDir)common\cajun\reader.cpp" />
    <ClCompile Include="$(SrcDir)common\cajun\writer.cpp" />
    <ClCompile Include="$(SrcDir)common\charset.cpp" />
    <ClCompile Include="$(SrcDir)common\charset_6937.cpp" />
    <ClCompile Include="$(SrcDir)common\charset_conv.cpp" />
    <ClCompile Include="$(SrcDir)common\charset_ucd.cpp" />
    <ClCompile Include="$(SrcDir)common\hotkey.cpp" />
--- a/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters
+++ b/aegisub/build/msbuild/libaegisub/libaegisub.vcxproj.filters
@ -20,6 +20,9 @@
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="$(SrcDir)common\charset_6937.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="$(SrcDir)common\charset_ucd.h">
      <Filter>Header Files</Filter>
    </ClInclude>
@ -130,6 +133,9 @@
    <ClCompile Include="$(SrcDir)common\charset.cpp">
      <Filter>Source Files\Common</Filter>
    </ClCompile>
    <ClCompile Include="$(SrcDir)common\charset_6937.cpp">
      <Filter>Source Files\Common</Filter>
    </ClCompile>
    <ClCompile Include="$(SrcDir)common\charset_conv.cpp">
      <Filter>Source Files\Common</Filter>
    </ClCompile>
--- a/aegisub/libaegisub/Makefile
+++ b/aegisub/libaegisub/Makefile
@ -24,6 +24,7 @@ SRC += \
 	common/cajun/reader.cpp \
 	common/cajun/writer.cpp \
 	common/charset.cpp \
 	common/charset_6937.cpp \
 	common/charset_conv.cpp \
 	common/charset_ucd.cpp \
 	common/hotkey.cpp \
--- a/aegisub/libaegisub/common/charset_6937.cpp
+++ b/aegisub/libaegisub/common/charset_6937.cpp
@ -0,0 +1,250 @@
 // Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
 //
 // Permission to use, copy, modify, and distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
 //
 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 //
 // $Id$
 /// @file charset_6937.cpp
 /// @brief A charset converter for ISO-6937-2
 /// @ingroup libaegisub
 #include "../config.h"
 #include "charset_6937.h"
 #ifndef LAGI_PRE
 #include <algorithm>
 #include <cerrno>
 #endif
 #include <iconv.h>
 namespace {
 // ISO-6937-2 values for the first 383 unicode codepoints (U+0000..U+017E),
 // indexed directly by codepoint.
 //
 // Each entry is one of:
 //  - 0x00 (for any codepoint other than U+0000): the codepoint has no
 //    mapping and is reported as unmappable by get_iso6937()
 //  - a value <= 0xFF: the single byte to output
 //  - a value > 0xFF: a two-byte sequence which is output high byte first
 //    (0xC141 is written as the bytes 0xC1 0x41)
 const int iso6937_codepoints[] = {
 	0x00,   0x01,   0x02,   0x03,   0x04,
 	0x05,   0x06,   0x07,   0x08,   0x09,
 	0x0A,   0x0B,   0x0C,   0x0D,   0x0E,
 	0x0F,   0x10,   0x11,   0x12,   0x13,
 	0x14,   0x15,   0x16,   0x17,   0x18,
 	0x19,   0x1A,   0x1B,   0x1C,   0x1D,
 	0x1E,   0x1F,   0x20,   0x21,   0x22,
 	0x23,   0x24,   0x25,   0x26,   0x27,
 	0x28,   0x29,   0x2A,   0x2B,   0x2C,
 	0x2D,   0x2E,   0x2F,   0x30,   0x31,
 	0x32,   0x33,   0x34,   0x35,   0x36,
 	0x37,   0x38,   0x39,   0x3A,   0x3B,
 	0x3C,   0x3D,   0x3E,   0x3F,   0x40,
 	0x41,   0x42,   0x43,   0x44,   0x45,
 	0x46,   0x47,   0x48,   0x49,   0x4A,
 	0x4B,   0x4C,   0x4D,   0x4E,   0x4F,
 	0x50,   0x51,   0x52,   0x53,   0x54,
 	0x55,   0x56,   0x57,   0x58,   0x59,
 	0x5A,   0x5B,   0x5C,   0x5D,   0x5E,
 	0x5F,   0x60,   0x61,   0x62,   0x63,
 	0x64,   0x65,   0x66,   0x67,   0x68,
 	0x69,   0x6A,   0x6B,   0x6C,   0x6D,
 	0x6E,   0x6F,   0x70,   0x71,   0x72,
 	0x73,   0x74,   0x75,   0x76,   0x77,
 	0x78,   0x79,   0x7A,   0x7B,   0x7C,
 	0x7D,   0x7E,   0x7F,   0x80,   0x81,
 	0x82,   0x83,   0x84,   0x85,   0x86,
 	0x87,   0x88,   0x89,   0x8A,   0x8B,
 	0x8C,   0x8D,   0x8E,   0x8F,   0x90,
 	0x91,   0x92,   0x93,   0x94,   0x95,
 	0x96,   0x97,   0x98,   0x99,   0x9A,
 	0x9B,   0x9C,   0x9D,   0x9E,   0x9F,
 	0xA0,   0xA1,   0xA2,   0xA3,   0xA8,
 	0xA5,   0x00,   0xA7,   0xC820, 0xD3,
 	0xE3,   0xAB,   0x00,   0x00,   0xD2,
 	0xC520, 0xB0,   0xB1,   0xB2,   0xB3,
 	0xC220, 0xB5,   0xB6,   0xB7,   0xCB20,
 	0xD1,   0xEB,   0xBB,   0xBC,   0xBD,
 	0xBE,   0xBF,   0xC141, 0xC241, 0xC341,
 	0xC441, 0xC841, 0xCA41, 0xE1,   0xCB43,
 	0xC145, 0xC245, 0xC345, 0xC845, 0xC149,
 	0xC249, 0xC349, 0xC849, 0xE2,   0xC44E,
 	0xC14F, 0xC24F, 0xC34F, 0xC44F, 0xC84F,
 	0xB4,   0xE9,   0xC155, 0xC255, 0xC355,
 	0xC855, 0xC259, 0xEC,   0xFB,   0xC161,
 	0xC261, 0xC361, 0xC461, 0xC861, 0xCA61,
 	0xF1,   0xCB63, 0xC165, 0xC265, 0xC365,
 	0xC865, 0xC169, 0xC269, 0xC369, 0xC869,
 	0xF3,   0xC46E, 0xC16F, 0xC26F, 0xC36F,
 	0xC46F, 0xC86F, 0xB8,   0xF9,   0xC175,
 	0xC275, 0xC375, 0xC875, 0xC279, 0xFC,
 	0xC879, 0xC541, 0xC561, 0xC641, 0xC661,
 	0xCE41, 0xCE61, 0xC243, 0xC263, 0xC343,
 	0xC363, 0xC743, 0xC763, 0xCF43, 0xCF63,
 	0xCF44, 0xCF64, 0x00,   0xF2,   0xC545,
 	0xC565, 0x00,   0x00,   0xC745, 0xC765,
 	0xCE45, 0xCE65, 0xCF45, 0xCF65, 0xC347,
 	0xC367, 0xC647, 0xC667, 0xC747, 0xC767,
 	0xCB47, 0xCB67, 0xC348, 0xC368, 0xE4,
 	0xF4,   0xC449, 0xC469, 0xC549, 0xC569,
 	0x00,   0x00,   0xCE49, 0xCE69, 0xC749,
 	0xF5,   0xE6,   0xF6,   0xC34A, 0xC36A,
 	0xCB4B, 0xCB6B, 0xF0,   0xC24C, 0xC26C,
 	0xCB4C, 0xCB6C, 0xCF4C, 0xCF6C, 0xE7,
 	0xF7,   0xE8,   0xF8,   0xC24E, 0xC26E,
 	0xCB4E, 0xCB6E, 0xCF4E, 0xCF6E, 0xEF,
 	0xEE,   0xFE,   0xC54F, 0xC56F, 0x00,
 	0x00,   0xCD4F, 0xCD6F, 0xEA,   0xFA,
 	0xC252, 0xC272, 0xCB52, 0xCB72, 0xCF52,
 	0xCF72, 0xC253, 0xC273, 0xC353, 0xC373,
 	0xCB53, 0xCB73, 0xCF53, 0xCF73, 0xCB54,
 	0xCB74, 0xCF54, 0xCF74, 0xED,   0xFD,
 	0xC455, 0xC475, 0xC555, 0xC575, 0xC655,
 	0xC675, 0xCA55, 0xCA75, 0xCD55, 0xCD75,
 	0xCE55, 0xCE75, 0xC357, 0xC377, 0xC359,
 	0xC379, 0xC859, 0xC25A, 0xC27A, 0xC75A,
 	0xC77A, 0xCF5A, 0xCF7A
 };
 /// A single codepoint -> ISO-6937-2 mapping for characters which lie outside
 /// of the directly indexed table
 struct extended_range {
 	/// Unicode codepoint being mapped
 	const int codepoint;
 	/// ISO-6937-2 value for the codepoint
 	const int value;
 };
 // Entries are ordered purely by their codepoint. The mixed int overloads let
 // a bare codepoint be compared against an entry, which is what searching a
 // table of entries for a codepoint needs.
 bool operator<(extended_range const& a, extended_range const& b) {
 	return b.codepoint > a.codepoint;
 }
 bool operator<(int cp, extended_range const& entry) {
 	return entry.codepoint > cp;
 }
 bool operator<(extended_range const& entry, int cp) {
 	return cp > entry.codepoint;
 }
 // ISO-6937-2 values for codepoints that don't come in a nice contiguous block
 //
 // Values above 0xFF are two-byte sequences which are output high byte first;
 // all other values are a single byte.
 //
 // get_iso6937() searches this table with std::lower_bound, so the entries
 // must be kept sorted by ascending codepoint.
 const extended_range iso6937_extended_codepoints[] = {
 	{ 0x02C7, 0xCF20 },
 	{ 0x02D8, 0xC620 },
 	{ 0x02D9, 0xC720 },
 	{ 0x02DA, 0xCA20 },
 	{ 0x02DB, 0xCE20 },
 	{ 0x02DD, 0xCD20 },
 	{ 0x2014, 0xD0 },
 	{ 0x2018, 0xA9 },
 	{ 0x2019, 0xB9 },
 	{ 0x201C, 0xAA },
 	{ 0x201D, 0xBA },
 	{ 0x2022, 0xD4 },
 	{ 0x20AC, 0xA4 }, // ETSI EN 300 468 extension: euro sign at A4
 	{ 0x2126, 0xE0 },
 	{ 0x215B, 0xDC },
 	{ 0x215C, 0xDD },
 	{ 0x215D, 0xDE },
 	{ 0x2190, 0xAC },
 	{ 0x2191, 0xAD },
 	{ 0x2192, 0xAE },
 	{ 0x2193, 0xAF },
 	{ 0x266A, 0xD5 }
 };
 #define countof(array) (sizeof(array) / sizeof((array)[0]))
 /// Get the ISO-6937-2 value for the given unicode codepoint or 0 if it cannot be mapped
 int get_iso6937(int codepoint) {
 	if (static_cast<size_t>(codepoint) < countof(iso6937_codepoints))
 		return iso6937_codepoints[codepoint];
 	const extended_range *end = iso6937_extended_codepoints + countof(iso6937_extended_codepoints);
 	const extended_range *ext = std::lower_bound(iso6937_extended_codepoints, end, codepoint);
 	if (ext == end || ext->codepoint != codepoint)
 		return 0;
 	return ext->value;
 }
 } // namespace {
 namespace agi { namespace charset {
 // Name of the iconv charset which the source text is decoded to before it is
 // mapped to ISO-6937-2: "UCS-4-INTERNAL" when _LIBICONV_VERSION is defined
 // and "WCHAR_T" otherwise.
 // NOTE(review): Convert() reads the decoded character into an int, so the
 // WCHAR_T branch presumably relies on wchar_t being a 4-byte UCS-4 value on
 // the platforms which take it -- confirm.
 #ifdef _LIBICONV_VERSION
 #define INTERNAL_CHARSET "UCS-4-INTERNAL"
 #else
 #define INTERNAL_CHARSET "WCHAR_T"
 #endif
 /// Create a converter from the given encoding to ISO-6937-2
 /// @param subst Replace characters which have no ISO-6937-2 mapping with '?'
 ///              rather than failing the conversion
 /// @param src   Name of the source encoding; handed to IconvWrapper, which
 ///              decodes it to INTERNAL_CHARSET
 Converter6937::Converter6937(bool subst, const char *src)
 : to_ucs4(new IconvWrapper(src, INTERNAL_CHARSET))
 , subst(subst)
 {
 }
 /// @brief Convert a buffer to ISO-6937-2, one character at a time
 /// @param inbuf        In/out pointer to the next unread source byte
 /// @param inbytesleft  In/out number of unread source bytes
 /// @param outbuf       In/out pointer to the next free destination byte
 /// @param outbytesleft In/out number of free destination bytes
 /// @return The number of bytes written to outbuf, or (size_t)-1 with errno
 ///         set on failure: E2BIG if the output buffer is full, EILSEQ if a
 ///         character has no ISO-6937-2 mapping and substitution is disabled,
 ///         EINVAL if the source decoder made no progress, or whatever the
 ///         source decoder reported
 ///
 /// The four in/out arguments are only advanced past characters which were
 /// converted completely, so on failure they describe where to resume from.
 /// Passing a null inbuf or inbytesleft is a no-op, as there is no conversion
 /// state to reset.
 size_t Converter6937::Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) {
 	const size_t failed = static_cast<size_t>(-1);
 	// No state to reset
 	if (!inbuf || !inbytesleft)
 		return 0;
 	size_t bytes_written = 0;
 	while (*inbytesleft > 0) {
 		int in_val = 0;
 		// Copy inbuf/inbytesleft so that we don't update them if the
 		// conversion fails (due to not enough space or a bad sequence)
 		const char *inbuftmp = *inbuf;
 		size_t inbyteslefttmp = *inbytesleft;
 		char *val_buf = reinterpret_cast<char *>(&in_val);
 		size_t val_buf_size = sizeof(in_val);
 		// Get the next unicode character from the input. The buffer only has
 		// room for a single character, so E2BIG merely means that there is
 		// more input left after it and is not an error here.
 		size_t ret = to_ucs4->Convert(&inbuftmp, &inbyteslefttmp, &val_buf, &val_buf_size);
 		if (ret == failed && errno != E2BIG)
 			return ret;
 		// The decoder may consume input without producing a character
 		// (presumably e.g. when it skips a byte order mark). in_val is
 		// still zero in that case, and encoding it would insert a NUL byte
 		// which does not exist in the source text.
 		if (val_buf_size == sizeof(in_val)) {
 			if (inbyteslefttmp == *inbytesleft) {
 				// Nothing consumed and nothing produced: fail rather than
 				// retrying the exact same call forever
 				if (ret != failed)
 					errno = EINVAL;
 				return failed;
 			}
 			*inbuf = inbuftmp;
 			*inbytesleft = inbyteslefttmp;
 			continue;
 		}
 		// And convert that to ISO-6937-2
 		int val = get_iso6937(in_val);
 		if (!val && in_val) {
 			// A zero result for a non-NUL character means "no mapping"
 			if (!subst) {
 				errno = EILSEQ;
 				return failed;
 			}
 			val = '?';
 		}
 		// Values above 0xFF are two-byte sequences stored high byte first
 		const size_t len = val > 255 ? 2 : 1;
 		if (*outbytesleft < len) {
 			errno = E2BIG;
 			return failed;
 		}
 		if (len == 2)
 			*(*outbuf)++ = static_cast<char>((val >> 8) & 0xFF);
 		*(*outbuf)++ = static_cast<char>(val & 0xFF);
 		*outbytesleft -= len;
 		bytes_written += len;
 		// Update the input pointers now that the conversion has succeeded
 		*inbuf = inbuftmp;
 		*inbytesleft = inbyteslefttmp;
 	}
 	return bytes_written;
 }
 } } // namespace agi::charset
--- a/aegisub/libaegisub/common/charset_6937.h
+++ b/aegisub/libaegisub/common/charset_6937.h
@ -0,0 +1,46 @@
 // Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
 //
 // Permission to use, copy, modify, and distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
 //
 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 //
 // $Id$
 /// @file charset_6937.h
 /// @brief A charset converter for ISO-6937-2
 /// @ingroup libaegisub
 #include <libaegisub/charset_conv.h>
 namespace agi { namespace charset {
 /// @brief A charset converter for ISO-6937-2
 ///
 /// While glibc's iconv supports ISO-6937-2, GNU libiconv does not, as the
 /// encoding is not used by anything but old subtitle formats. Only
 /// conversion to ISO-6937-2 is implemented; the source encoding is chosen
 /// when the converter is constructed.
 class Converter6937 : public Converter {
 	/// Converter to UCS-4 so that we only have to deal with unicode codepoints
 	agi::scoped_ptr<IconvWrapper> to_ucs4;
 	/// Should unsupported characters be replaced with '?'
 	const bool subst;
 public:
 	/// Constructor
 	/// @param subst Enable substitution for unsupported characters
 	/// @param src Source encoding
 	Converter6937(bool subst, const char *src);
 	/// Convert a string. Interface is the same as iconv.
 	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft);
 };
 } }
--- a/aegisub/libaegisub/common/charset_conv.cpp
+++ b/aegisub/libaegisub/common/charset_conv.cpp
@ -31,6 +31,8 @@
 #include <libaegisub/charset_conv.h>
 #include <iconv.h>
 #include "charset_6937.h"
 // Check if we can use advanced fallback capabilities added in GNU's iconv
 // implementation
 #if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG)
@ -52,244 +54,251 @@ namespace {
 			return strcmp(s1, s2) < 0;
 		}
 	};
-}
+
 	agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst);
 /// @brief Map a user-friendly encoding name to the real encoding name
-static const char* GetRealEncodingName(const char* name) {
+	const char* get_real_encoding_name(const char* name) {
-	static std::map<const char*, const char*, ltstr> prettyNames;
+		static std::map<const char*, const char*, ltstr> pretty_names;
-	if (prettyNames.empty()) {
+		if (pretty_names.empty()) {
-#		define ADD(pretty, real) prettyNames[pretty] = real
+#			define ADD(pretty, real) pretty_names[pretty] = real
-#		include <libaegisub/charsets.def>
+#			include <libaegisub/charsets.def>
-#		undef ADD
+#			undef ADD
 	}
 	std::map<const char*, const char*, ltstr>::iterator real = prettyNames.find(name);
 	if (real != prettyNames.end()) {
 		return real->second;
 	}
 	return name;
 }
 namespace agi {
 	namespace charset {
 static size_t get_bom_size(iconv_t cd) {
 	// Most (but not all) iconv implementations automatically insert a BOM
 	// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
 	// we usually don't want this, as some of the wxString using code
 	// assumes there is no BOM (as the exact encoding is known externally)
 	// As such, when doing conversions we will strip the BOM if it exists,
 	// then manually add it when writing files
 	char buff[8];
 	const char* src = "";
 	char *dst = buff;
 	size_t srcLen = 1;
 	size_t dstLen = 8;
 	size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
 	assert(res != iconv_failed);
 	assert(srcLen == 0);
 	size_t size = 0;
 	for (src = buff; src < dst; ++src) {
 		if (*src) ++size;
 	}
 	if (size) {
 		// If there is a BOM, it will always be at least as big as the NUL
 		size = std::max(size, (8 - dstLen) / 2);
 	}
 	return size;
 }
 static void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
 	// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
 	// a specified byte order), skip over it
 	if (bomSize > 0 && inbytesleft && *inbytesleft) {
 		// libiconv marks the bom as written after writing the first
 		// character after the bom rather than when it writes the bom, so
 		// convert at least one extra character
 		char bom[8];
 		char *dst = bom;
 		size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
 		const char *src = *inbuf;
 		size_t srcSize = *inbytesleft;
 		iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
 	}
 }
 #ifdef ICONV_POSIX
 class Converter {
 	size_t bomSize;
 	iconv_t cd;
 public:
 	// subst is not used here because POSIX doesn't let you disable substitution
 	Converter(bool, const char* sourceEncoding, const char* destEncoding)
 	{
 		const char *dstEnc = GetRealEncodingName(destEncoding);
 		cd = iconv_open(dstEnc, "UTF-8");
 		if (cd == iconv_invalid) {
 			throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
 		}
-		bomSize = get_bom_size(cd);
+		std::map<const char*, const char*, ltstr>::iterator real = pretty_names.find(name);
-		iconv_close(cd);
+		if (real != pretty_names.end())
-		cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding));
+			return real->second;
-		if (cd == iconv_invalid) {
+		return name;
 			throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
 		}
 	}
 	~Converter() {
 		if (cd != iconv_invalid) iconv_close(cd);
 	}
 	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
 		eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
-		size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
+	size_t get_bom_size(iconv_t cd) {
 		// Most (but not all) iconv implementations automatically insert a BOM
 		// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
 		// we usually don't want this, as some of the wxString using code
 		// assumes there is no BOM (as the exact encoding is known externally)
 		// As such, when doing conversions we will strip the BOM if it exists,
 		// then manually add it when writing files
-		// This loop never does anything useful with a POSIX-compliant iconv
+		char buff[8];
-		// implementation, but those don't seem to actually exist
+		const char* src = "";
-		while (res == iconv_failed && errno != E2BIG) {
+		char *dst = buff;
 			++*inbuf;
 			--*inbytesleft;
 			res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
 		}
 		return res;
 	}
 };
 #else
 class Converter : public iconv_fallbacks {
 	size_t bomSize;
 	char invalidRep[8];
 	size_t invalidRepSize;
 	iconv_t cd;
 	static void fallback(
 		unsigned int code,
 		void (*callback) (const char *buf, size_t buflen, void* callback_arg),
 		void *callback_arg,
 		void *convPtr)
 	{
 		// At some point in the future, this should probably switch to a real mapping
 		// For now, there's just three cases: BOM to nothing, '\' to itself
 		// (for Shift-JIS, which does not have \) and everything else to '?'
 		if (code == 0xFEFF) return;
 		if (code == 0x5C) callback("\\", 1, callback_arg);
 		else {
 			Converter *self = static_cast<Converter *>(convPtr);
 			callback(self->invalidRep, self->invalidRepSize, callback_arg);
 		}
 	}
 	Converter(Converter const&);
 	Converter& operator=(Converter const&);
 public:
 	Converter(bool subst, const char* sourceEncoding, const char* destEncoding)
 	{
 		const char *dstEnc = GetRealEncodingName(destEncoding);
 		cd = iconv_open(dstEnc, "UTF-8");
 		if (cd == iconv_invalid) {
 			throw UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
 		}
 		bomSize = get_bom_size(cd);
 		// Get fallback character
 		const char sbuff[] = "?";
 		const char *src = sbuff;
 		char *dst = invalidRep;
 		size_t dstLen = 4;
 		size_t srcLen = 1;
 		size_t dstLen = 8;
-		size_t res = Convert(&src, &srcLen, &dst, &dstLen);
+		size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
 		assert(res != iconv_failed);
 		assert(srcLen == 0);
-		invalidRepSize = 4 - dstLen;
+		size_t size = 0;
-
+		for (src = buff; src < dst; ++src) {
-		iconv_close(cd);
+			if (*src) ++size;
 		cd = iconv_open(dstEnc, GetRealEncodingName(sourceEncoding));
 		if (cd == iconv_invalid) {
 			throw UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
 		}
 		if (size) {
 			// If there is a BOM, it will always be at least as big as the NUL
 			size = std::max(size, (8 - dstLen) / 2);
 		}
 		return size;
 	}
-		if (subst) {
+	void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
-			data = this;
+		// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
-			mb_to_uc_fallback = NULL;
+		// a specified byte order), skip over it
-			mb_to_wc_fallback = NULL;
+		if (bomSize > 0 && inbytesleft && *inbytesleft) {
-			uc_to_mb_fallback = fallback;
+			// libiconv marks the bom as written after writing the first
-			wc_to_mb_fallback = NULL;
+			// character after the bom rather than when it writes the bom, so
-
+			// convert at least one extra character
-			int transliterate = 1;
+			char bom[8];
-			iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate);
+			char *dst = bom;
-			iconvctl(cd, ICONV_SET_FALLBACKS, this);
+			size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
 			const char *src = *inbuf;
 			size_t srcSize = *inbytesleft;
 			iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
 		}
 	}
-	~Converter() {
+
-		if (cd != iconv_invalid) iconv_close(cd);
+	// Calculate the size of NUL in the given character set
 	size_t nul_size(const char* encoding) {
 		// We need a character set to convert from with a known encoding of NUL
 		// UTF-8 seems like the obvious choice
 		agi::scoped_ptr<agi::charset::Converter> cd(get_converter(false, "UTF-8", encoding));
 		char dbuff[4];
 		char sbuff[] = "";
 		char* dst = dbuff;
 		const char* src = sbuff;
 		size_t dstLen = sizeof(dbuff);
 		size_t srcLen = 1;
 		size_t ret = cd->Convert(&src, &srcLen, &dst, &dstLen);
 		assert(ret != iconv_failed);
 		assert(dst - dbuff > 0);
 		return dst - dbuff;
 	}
 	size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
 		eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
 		size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
-		if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
+#ifdef ICONV_POSIX
-			// libiconv checks if there are any bytes left in the output buffer
+	class ConverterImpl : public agi::charset::Converter {
-			// before checking if the conversion would actually write any
+		size_t bomSize;
-			// characters to the output buffer, resulting in occasional invalid
+		iconv_t cd;
-			// E2BIG false positives
+	public:
-			char buff[8];
+		// subst is not used here because POSIX doesn't let you disable substitution
-			size_t buffsize = 8;
+		ConverterImpl(bool, const char* sourceEncoding, const char* destEncoding)
-			char* out = buff;
+		{
-			const char* in = *inbuf;
+			const char *dstEnc = get_real_encoding_name(destEncoding);
-			size_t insize = *inbytesleft;
+			cd = iconv_open(dstEnc, "UTF-8");
 			if (cd == iconv_invalid) {
 				throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
 			}
-			res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
+			bomSize = get_bom_size(cd);
-			// If no bytes of the output buffer were used, the original
+			iconv_close(cd);
-			// conversion may have been successful
+			cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
-			if (buffsize != 8) {
+			if (cd == iconv_invalid) {
-				errno = E2BIG;
+				throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
 				res = iconv_failed;
 			}
 		}
 		~ConverterImpl() {
 			if (cd != iconv_invalid) iconv_close(cd);
 		}
 		size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
 			eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
-		return res;
+			size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
-	}
+
-};
+			// This loop never does anything useful with a POSIX-compliant iconv
 			// implementation, but those don't seem to actually exist
 			while (res == iconv_failed && errno != E2BIG) {
 				++*inbuf;
 				--*inbytesleft;
 				res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
 			}
 			return res;
 		}
 	};
 #else
 	class ConverterImpl : public iconv_fallbacks, public agi::charset::Converter {
 		size_t bomSize;
 		char invalidRep[8];
 		size_t invalidRepSize;
 		iconv_t cd;
 		static void fallback(
 			unsigned int code,
 			void (*callback) (const char *buf, size_t buflen, void* callback_arg),
 			void *callback_arg,
 			void *convPtr)
 		{
 			// At some point in the future, this should probably switch to a real mapping
 			// For now, there's just three cases: BOM to nothing, '\' to itself
 			// (for Shift-JIS, which does not have \) and everything else to '?'
 			if (code == 0xFEFF) return;
 			if (code == 0x5C) callback("\\", 1, callback_arg);
 			else {
 				ConverterImpl *self = static_cast<ConverterImpl *>(convPtr);
 				callback(self->invalidRep, self->invalidRepSize, callback_arg);
 			}
 		}
 		ConverterImpl(ConverterImpl const&);
 		ConverterImpl& operator=(ConverterImpl const&);
 	public:
 		ConverterImpl(bool subst, const char* sourceEncoding, const char* destEncoding)
 		{
 			const char *dstEnc = get_real_encoding_name(destEncoding);
 			cd = iconv_open(dstEnc, "UTF-8");
 			if (cd == iconv_invalid)
 				throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
 			bomSize = get_bom_size(cd);
 			// Get fallback character
 			const char sbuff[] = "?";
 			const char *src = sbuff;
 			char *dst = invalidRep;
 			size_t dstLen = 4;
 			size_t srcLen = 1;
 			size_t res = Convert(&src, &srcLen, &dst, &dstLen);
 			assert(res != iconv_failed);
 			assert(srcLen == 0);
 			invalidRepSize = 4 - dstLen;
 			iconv_close(cd);
 			cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
 			if (cd == iconv_invalid)
 				throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
 			if (subst) {
 				data = this;
 				mb_to_uc_fallback = NULL;
 				mb_to_wc_fallback = NULL;
 				uc_to_mb_fallback = fallback;
 				wc_to_mb_fallback = NULL;
 				int transliterate = 1;
 				iconvctl(cd, ICONV_SET_TRANSLITERATE, &transliterate);
 				iconvctl(cd, ICONV_SET_FALLBACKS, this);
 			}
 		}
 		~ConverterImpl() {
 			if (cd != iconv_invalid) iconv_close(cd);
 		}
 		size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
 			eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
 			size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
 			if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
 				// libiconv checks if there are any bytes left in the output buffer
 				// before checking if the conversion would actually write any
 				// characters to the output buffer, resulting in occasional invalid
 				// E2BIG false positives
 				char buff[8];
 				size_t buffsize = 8;
 				char* out = buff;
 				const char* in = *inbuf;
 				size_t insize = *inbytesleft;
 				res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
 				// If no bytes of the output buffer were used, the original
 				// conversion may have been successful
 				if (buffsize != 8) {
 					errno = E2BIG;
 					res = iconv_failed;
 				}
 			}
 			return res;
 		}
 	};
 #endif
-// Calculate the size of NUL in the given character set
+	agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst) {
-static size_t NulSize(const char* encoding) {
+		try {
-	// We need a character set to convert from with a known encoding of NUL
+			return new ConverterImpl(subst, src, dst);
-	// UTF-8 seems like the obvious choice
+		}
-	Converter cd(false, "UTF-8", encoding);
+		catch (agi::charset::UnsupportedConversion const&) {
 			if (strcmp(dst, "ISO-6937-2"))
 				throw;
 			return new agi::charset::Converter6937(subst, src);
 		}
 	}
 } // namespace {
-	char dbuff[4];
+namespace agi { namespace charset {
 	char sbuff[] = "";
 	char* dst = dbuff;
 	const char* src = sbuff;
 	size_t dstLen = sizeof(dbuff);
 	size_t srcLen = 1;
 	size_t ret = cd.Convert(&src, &srcLen, &dst, &dstLen);
 	assert(ret != iconv_failed);
 	assert(dst - dbuff > 0);
 	return dst - dbuff;
 }
 IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst)
 : toNulLen(0)
 , fromNulLen(0)
-, conv(new Converter(enableSubst, sourceEncoding, destEncoding))
+, conv(get_converter(enableSubst, sourceEncoding, destEncoding))
 {
 	// These need to be set only after we verify that the source and dest
 	// charsets are valid
-	toNulLen = NulSize(destEncoding);
+	toNulLen = nul_size(destEncoding);
-	fromNulLen = NulSize(sourceEncoding);
+	fromNulLen = nul_size(sourceEncoding);
 }
 IconvWrapper::~IconvWrapper() {
 }
--- a/aegisub/libaegisub/include/libaegisub/charset_conv.h
+++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h
@ -41,9 +41,12 @@ DEFINE_SIMPLE_EXCEPTION_NOINNER(BadOutput, ConversionFailure, "iconv/failed/EINV
 typedef void* iconv_t;
-// Helper class that abstracts away the differences between libiconv and
+/// Helper class that abstracts away the differences between libiconv and
-// POSIX iconv implementations
+/// POSIX iconv implementations
-class Converter;
+struct Converter {
 	virtual ~Converter() { }
 	virtual size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) = 0;
 };
 /// @brief A C++ wrapper for iconv
 class IconvWrapper {
--- a/aegisub/tests/libaegisub_iconv.cpp
+++ b/aegisub/tests/libaegisub_iconv.cpp
@ -150,3 +150,36 @@ TEST(lagi_iconv, Roundtrip) {
 					"Jackdaws love my big sphinx of quartz")));
 	}
 }
 // Conversion from UTF-8 to ISO-6937-2, which has to work whether or not the
 // platform's iconv supports the charset itself
 TEST(lagi_iconv, Iso6937) {
 	ASSERT_NO_THROW(IconvWrapper("UTF-8", "ISO-6937-2"));
 	IconvWrapper subst("UTF-8", "ISO-6937-2");
 	IconvWrapper no_subst("UTF-8", "ISO-6937-2", false);
 	// 7-bit is same as ISO-8859
 	for (int i = 0; i < 128; ++i) {
 		// The cast is required: an int -> char conversion inside a braced
 		// initializer is a narrowing conversion, which C++11 rejects
 		const char buf[] = { static_cast<char>(i), 0 };
 		std::string ret;
 		EXPECT_NO_THROW(ret = subst.Convert(buf));
 		EXPECT_STREQ(buf, ret.c_str());
 	}
 	std::string ret;
 	// LATIN CAPITAL LETTER D WITH CARON (U+010E) - multibyte char in main block
 	EXPECT_NO_THROW(ret = subst.Convert("\xC4\x8E"));
 	EXPECT_STREQ("\xCF\x44", ret.c_str());
 	// BREVE - multibyte char in extended ranges
 	EXPECT_NO_THROW(ret = subst.Convert("\xCB\x98"));
 	EXPECT_STREQ("\xC6\x20", ret.c_str());
 	// EM DASH - single byte char in extended ranges
 	EXPECT_NO_THROW(ret = subst.Convert("\xE2\x80\x94"));
 	EXPECT_STREQ("\xD0", ret.c_str());
 	// codepoint not in ISO-6937-2: replaced with '?' when substitution is
 	// enabled and an error when it is not
 	EXPECT_NO_THROW(ret = subst.Convert("\xCB\x97"));
 	EXPECT_STREQ("?", ret.c_str());
 	EXPECT_THROW(no_subst.Convert("\xCB\x97"), agi::charset::BadOutput);
 }