forked from mia/Aegisub
Work around BOM-related issues with UTF-32 and UTF-16 with unspecified byte order
Originally committed to SVN as r4656.
This commit is contained in:
parent
ee4c5dee0b
commit
6623239682
3 changed files with 72 additions and 27 deletions
|
@ -76,18 +76,14 @@ namespace agi {
|
||||||
namespace charset {
|
namespace charset {
|
||||||
|
|
||||||
#ifdef ICONV_POSIX
|
#ifdef ICONV_POSIX
|
||||||
class IconvWrapper::Converter {
|
struct iconv_fallbacks {
|
||||||
public:
|
|
||||||
Converter(bool, const char*) { }
|
|
||||||
size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
|
|
||||||
return iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
#else
|
#endif
|
||||||
class IconvWrapper::Converter : public iconv_fallbacks {
|
|
||||||
private:
|
class Converter : public iconv_fallbacks {
|
||||||
bool subst;
|
bool subst;
|
||||||
char invalidRep[4];
|
int bomSize;
|
||||||
|
char invalidRep[8];
|
||||||
size_t invalidRepSize;
|
size_t invalidRepSize;
|
||||||
static void fallback(
|
static void fallback(
|
||||||
unsigned int code,
|
unsigned int code,
|
||||||
|
@ -109,30 +105,70 @@ public:
|
||||||
Converter(bool subst, const char* targetEnc)
|
Converter(bool subst, const char* targetEnc)
|
||||||
: subst(subst)
|
: subst(subst)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
#ifndef ICONV_POSIX
|
||||||
data = this;
|
data = this;
|
||||||
mb_to_uc_fallback = NULL;
|
mb_to_uc_fallback = NULL;
|
||||||
mb_to_wc_fallback = NULL;
|
mb_to_wc_fallback = NULL;
|
||||||
uc_to_mb_fallback = fallback;
|
uc_to_mb_fallback = fallback;
|
||||||
wc_to_mb_fallback = NULL;
|
wc_to_mb_fallback = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
char sbuff[] = "?";
|
char buff[8];
|
||||||
const char* src = sbuff;
|
|
||||||
char* dst = invalidRep;
|
|
||||||
size_t dstLen = 4;
|
|
||||||
size_t srcLen = 1;
|
|
||||||
|
|
||||||
iconv_t cd = iconv_open(GetRealEncodingName(targetEnc), "UTF-8");
|
iconv_t cd = iconv_open(GetRealEncodingName(targetEnc), "UTF-8");
|
||||||
assert(cd != iconv_invalid);
|
assert(cd != iconv_invalid);
|
||||||
|
|
||||||
|
// Get BOM size (if any)
|
||||||
|
const char* src = "";
|
||||||
|
char *dst = buff;
|
||||||
|
size_t srcLen = 1;
|
||||||
|
size_t dstLen = 8;
|
||||||
|
|
||||||
size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
|
size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
|
||||||
assert(res != iconv_failed);
|
assert(res != iconv_failed);
|
||||||
assert(srcLen == 0);
|
assert(srcLen == 0);
|
||||||
iconv_close(cd);
|
src = buff;
|
||||||
|
bomSize = 0;
|
||||||
|
for (src = buff; src < dst; ++src) {
|
||||||
|
if (*src) {
|
||||||
|
bomSize = (8 - dstLen) / 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get fallback character
|
||||||
|
char sbuff[] = "?";
|
||||||
|
src = sbuff;
|
||||||
|
dst = invalidRep;
|
||||||
|
dstLen = 4;
|
||||||
|
srcLen = 1;
|
||||||
|
|
||||||
|
res = operator()(cd, &src, &srcLen, &dst, &dstLen);
|
||||||
|
assert(res != iconv_failed);
|
||||||
|
assert(srcLen == 0);
|
||||||
|
|
||||||
invalidRepSize = 4 - dstLen;
|
invalidRepSize = 4 - dstLen;
|
||||||
|
|
||||||
|
iconv_close(cd);
|
||||||
}
|
}
|
||||||
size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
|
size_t operator()(iconv_t cd, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
|
||||||
|
// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
|
||||||
|
// a specified byte order), skip over it
|
||||||
|
if (bomSize > 0 && inbytesleft && *inbytesleft) {
|
||||||
|
// libiconv marks the bom as written after writing the first
|
||||||
|
// character after the bom rather than when it writes the bom, so
|
||||||
|
// convert at least one extra character
|
||||||
|
char bom[8];
|
||||||
|
char *dst = bom;
|
||||||
|
size_t dstSize = min(8, bomSize + *outbytesleft);
|
||||||
|
const char *src = *inbuf;
|
||||||
|
size_t srcSize = *inbytesleft;
|
||||||
|
iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
|
||||||
|
}
|
||||||
size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
|
size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
|
||||||
|
|
||||||
|
#ifndef ICONV_POSIX
|
||||||
if (!subst) return res;
|
if (!subst) return res;
|
||||||
|
|
||||||
// Save original errno so we can return it rather than the result from iconvctl
|
// Save original errno so we can return it rather than the result from iconvctl
|
||||||
|
@ -157,8 +193,8 @@ public:
|
||||||
}
|
}
|
||||||
if (res == iconv_failed && err == E2BIG && *outbytesleft == 0) {
|
if (res == iconv_failed && err == E2BIG && *outbytesleft == 0) {
|
||||||
// Check for E2BIG false positives
|
// Check for E2BIG false positives
|
||||||
char buff[4];
|
char buff[8];
|
||||||
size_t buffsize = 4;
|
size_t buffsize = 8;
|
||||||
char* out = buff;
|
char* out = buff;
|
||||||
const char* in = *inbuf;
|
const char* in = *inbuf;
|
||||||
size_t insize = *inbytesleft;
|
size_t insize = *inbytesleft;
|
||||||
|
@ -167,7 +203,7 @@ public:
|
||||||
res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
|
res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
|
||||||
// If no bytes of the output buffer were used, the original
|
// If no bytes of the output buffer were used, the original
|
||||||
// conversion may have been successful
|
// conversion may have been successful
|
||||||
if (buffsize == 4) {
|
if (buffsize == 8) {
|
||||||
err = errno;
|
err = errno;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -177,10 +213,10 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
errno = err;
|
errno = err;
|
||||||
|
#endif
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
|
|
||||||
// Calculate the size of NUL in the given character set
|
// Calculate the size of NUL in the given character set
|
||||||
static size_t NulSize(const char* encoding) {
|
static size_t NulSize(const char* encoding) {
|
||||||
|
@ -188,6 +224,7 @@ static size_t NulSize(const char* encoding) {
|
||||||
// UTF-8 seems like the obvious choice
|
// UTF-8 seems like the obvious choice
|
||||||
iconv_t cd = iconv_open(GetRealEncodingName(encoding), "UTF-8");
|
iconv_t cd = iconv_open(GetRealEncodingName(encoding), "UTF-8");
|
||||||
assert(cd != iconv_invalid);
|
assert(cd != iconv_invalid);
|
||||||
|
Converter conv(false, GetRealEncodingName(encoding));
|
||||||
|
|
||||||
char dbuff[4];
|
char dbuff[4];
|
||||||
char sbuff[] = "";
|
char sbuff[] = "";
|
||||||
|
@ -196,7 +233,7 @@ static size_t NulSize(const char* encoding) {
|
||||||
size_t dstLen = sizeof(dbuff);
|
size_t dstLen = sizeof(dbuff);
|
||||||
size_t srcLen = 1;
|
size_t srcLen = 1;
|
||||||
|
|
||||||
size_t ret = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
|
size_t ret = conv(cd, &src, &srcLen, &dst, &dstLen);
|
||||||
assert(ret != iconv_failed);
|
assert(ret != iconv_failed);
|
||||||
assert(dst - dbuff > 0);
|
assert(dst - dbuff > 0);
|
||||||
iconv_close(cd);
|
iconv_close(cd);
|
||||||
|
|
|
@ -51,13 +51,12 @@ T const& GetEncodingsList() {
|
||||||
|
|
||||||
typedef void* iconv_t;
|
typedef void* iconv_t;
|
||||||
|
|
||||||
|
// Helper class that abstracts away the differences between libiconv and
|
||||||
|
// POSIX iconv implementations
|
||||||
|
class Converter;
|
||||||
|
|
||||||
/// @brief A C++ wrapper for iconv
|
/// @brief A C++ wrapper for iconv
|
||||||
class IconvWrapper {
|
class IconvWrapper {
|
||||||
private:
|
|
||||||
// Helper class that abstracts away the differences between libiconv and
|
|
||||||
// POSIX iconv implementations
|
|
||||||
class Converter;
|
|
||||||
|
|
||||||
iconv_t cd;
|
iconv_t cd;
|
||||||
size_t toNulLen;
|
size_t toNulLen;
|
||||||
size_t fromNulLen;
|
size_t fromNulLen;
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <libaegisub/charset_conv.h>
|
#include <libaegisub/charset_conv.h>
|
||||||
|
|
||||||
|
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
|
@ -136,3 +137,11 @@ TEST(lagi_iconv, LocalSupport) {
|
||||||
TEST(lagi_iconv, wchar_tSupport) {
|
TEST(lagi_iconv, wchar_tSupport) {
|
||||||
EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t"));
|
EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(lagi_iconv, pretty_names) {
|
||||||
|
std::vector<std::string> names = GetEncodingsList<std::vector<std::string> >();
|
||||||
|
for (std::vector<std::string>::iterator cur = names.begin(); cur != names.end(); ++cur) {
|
||||||
|
EXPECT_NO_THROW(IconvWrapper("utf-8", cur->c_str()));
|
||||||
|
EXPECT_NO_THROW(IconvWrapper(cur->c_str(), "utf-8"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue