Add some documentation for AegisubCSConv and TextFileReader.
Originally committed to SVN as r4036.
This commit is contained in:
parent
c7d95e5590
commit
ba088237d7
5 changed files with 155 additions and 298 deletions
|
@ -103,7 +103,6 @@ void AssFile::Load (const wxString _filename,const wxString charset,bool addToRe
|
|||
wxString enc;
|
||||
if (charset.IsEmpty()) enc = TextFileReader::GetEncoding(_filename);
|
||||
else enc = charset;
|
||||
TextFileReader::EnsureValid(enc);
|
||||
|
||||
// Generic preparation
|
||||
Clear();
|
||||
|
|
|
@ -49,52 +49,27 @@
|
|||
WX_DECLARE_STRING_HASH_MAP(wxString, PrettyNamesHash);
|
||||
|
||||
#if wxUSE_THREADS
|
||||
|
||||
/// DOCME
|
||||
static wxMutex encodingListMutex;
|
||||
#endif
|
||||
|
||||
|
||||
/// DOCME
|
||||
static const iconv_t iconv_invalid = (iconv_t)-1;
|
||||
|
||||
/// DOCME
|
||||
static const size_t iconv_failed = (size_t)-1;
|
||||
|
||||
/// DOCME
|
||||
#define ICONV_CONST_CAST(a) const_cast<ICONV_CONST char *>(a)
|
||||
|
||||
#ifndef ICONV_POSIX
|
||||
static int addEncoding(unsigned int namescount, const char * const * names, void* data);
|
||||
#endif
|
||||
|
||||
/// DOCME
|
||||
static wxArrayString *supportedEncodings = NULL;
|
||||
|
||||
/// DOCME
|
||||
static wxArrayString *prettyEncodingList = NULL;
|
||||
|
||||
/// DOCME
|
||||
static PrettyNamesHash *prettyEncodingHash = NULL;
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param mbEncName
|
||||
/// @param enableSubst
|
||||
///
|
||||
AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst)
|
||||
: mbCharsetName(GetRealEncodingName(mbEncName)), mbNulLen(0), enableSubst(enableSubst)
|
||||
: wcCharsetName(WCHAR_T_ENCODING)
|
||||
, mbCharsetName(GetRealEncodingName(mbEncName))
|
||||
, mbNulLen(0)
|
||||
, enableSubst(enableSubst)
|
||||
, m2w(wcCharsetName, mbCharsetName)
|
||||
, w2m(mbCharsetName, wcCharsetName)
|
||||
{
|
||||
wcCharsetName = wxString::FromAscii(WCHAR_T_ENCODING);
|
||||
|
||||
m2w = iconv_open(wcCharsetName.ToAscii(), mbCharsetName.ToAscii());
|
||||
w2m = iconv_open(mbCharsetName.ToAscii(), wcCharsetName.ToAscii());
|
||||
|
||||
if (m2w == iconv_invalid || w2m == iconv_invalid) {
|
||||
if (m2w != iconv_invalid) iconv_close(m2w);
|
||||
if (w2m != iconv_invalid) iconv_close(w2m);
|
||||
|
||||
throw wxString::Format(_T("Character set %s is not supported."), mbEncName);
|
||||
throw wxString::Format(L"Character set %s is not supported.", mbEncName);
|
||||
}
|
||||
|
||||
if (enableSubst) {
|
||||
|
@ -110,26 +85,14 @@ AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst)
|
|||
}
|
||||
}
|
||||
|
||||
/// @brief DOCME
|
||||
///
|
||||
AegisubCSConv::~AegisubCSConv() {
|
||||
if (m2w != iconv_invalid) iconv_close(m2w);
|
||||
if (w2m != iconv_invalid) iconv_close(w2m);
|
||||
}
|
||||
|
||||
/// @brief DOCME
|
||||
/// @return
|
||||
///
|
||||
wxMBConv * AegisubCSConv::Clone() const {
|
||||
AegisubCSConv *c = new AegisubCSConv(mbCharsetName);
|
||||
c->mbNulLen = mbNulLen;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
/// @brief Calculate the size of NUL in the target encoding via iconv
|
||||
/// @return
|
||||
///
|
||||
/// @return The size in bytes of NUL
|
||||
size_t AegisubCSConv::GetMBNulLen() const {
|
||||
if (mbNulLen == 0) {
|
||||
const wchar_t nulStr[] = L"";
|
||||
|
@ -142,18 +105,13 @@ size_t AegisubCSConv::GetMBNulLen() const {
|
|||
size_t res = iconv(w2m, &inPtr, &inLen, &outPtr, &outLen);
|
||||
|
||||
if (res != 0)
|
||||
const_cast<AegisubCSConv *>(this)->mbNulLen = (size_t)-1;
|
||||
mbNulLen = (size_t)-1;
|
||||
else
|
||||
const_cast<AegisubCSConv *>(this)->mbNulLen = sizeof(outBuff) - outLen;
|
||||
mbNulLen = sizeof(outBuff) - outLen;
|
||||
}
|
||||
return mbNulLen;
|
||||
}
|
||||
|
||||
|
||||
/// @brief Calculate the length (in bytes) of a MB string, not including the terminator
|
||||
/// @param str
|
||||
/// @return
|
||||
///
|
||||
size_t AegisubCSConv::MBBuffLen(const char * str) const {
|
||||
size_t nulLen = GetMBNulLen();
|
||||
const char *ptr;
|
||||
|
@ -171,14 +129,12 @@ size_t AegisubCSConv::MBBuffLen(const char * str) const {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param dst
|
||||
/// @param dstSize
|
||||
/// @param src
|
||||
/// @param srcLen
|
||||
/// @return
|
||||
///
|
||||
/// @brief Convert a string from multibyte to wide characters
|
||||
/// @param dst Destination buffer.
|
||||
/// @param dstSize Length of destination buffer in wchar_ts
|
||||
/// @param src Source multibyte string
|
||||
/// @param srcLen Length of source buffer in bytes, or -1 to autodetect
|
||||
/// @return The number of wchar_ts needed to store the string in the target charset
|
||||
size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, size_t srcLen) const {
|
||||
return doConversion(
|
||||
m2w,
|
||||
|
@ -189,14 +145,12 @@ size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, siz
|
|||
) / sizeof(wchar_t);
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param dst
|
||||
/// @param dstSize
|
||||
/// @param src
|
||||
/// @param srcLen
|
||||
/// @return
|
||||
///
|
||||
/// @brief Convert a string from wide characters to multibyte
|
||||
/// @param dst Destination buffer
|
||||
/// @param dstSize Length of destination buffer in bytes
|
||||
/// @param src Source wide character string
|
||||
/// @param srcLen Length in wchar_ts of source, or -1 to autodetect
|
||||
/// @return The number of bytes needed to store the string in the target charset
|
||||
size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcLen) const {
|
||||
return doConversion(
|
||||
w2m,
|
||||
|
@ -207,15 +161,7 @@ size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, s
|
|||
);
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param cd
|
||||
/// @param dst
|
||||
/// @param dstSize
|
||||
/// @param src
|
||||
/// @param srcSize
|
||||
/// @return
|
||||
///
|
||||
// Perform a conversion if a buffer is given or calculate the needed buffer size if not
|
||||
size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const {
|
||||
if (dstSize > 0) {
|
||||
return iconvWrapper(cd, &src, &srcSize, &dst, &dstSize);
|
||||
|
@ -239,20 +185,12 @@ size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *
|
|||
return charsWritten;
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param cd
|
||||
/// @param inbuf
|
||||
/// @param inbytesleft
|
||||
/// @param outbuf
|
||||
/// @param outbytesleft
|
||||
/// @return
|
||||
///
|
||||
// Actually perform a conversion via iconv
|
||||
size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft,
|
||||
char **outbuf, size_t *outbytesleft) const {
|
||||
char **outbuf, size_t *outbytesleft) const {
|
||||
|
||||
#if wxUSE_THREADS
|
||||
wxMutexLocker lock(const_cast<AegisubCSConv *>(this)->iconvMutex);
|
||||
wxMutexLocker lock(iconvMutex);
|
||||
#endif
|
||||
|
||||
char *outbuforig = *outbuf;
|
||||
|
@ -265,10 +203,11 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
|
|||
|
||||
#ifdef ICONV_POSIX
|
||||
if (errno == EILSEQ) {
|
||||
throw _T("One or more characters do not fit in the selected ")
|
||||
_T("encoding and the version of iconv Aegisub was built with")
|
||||
_T(" does not have useful fallbacks. For best results, ")
|
||||
_T("please rebuild Aegisub using a recent version of GNU iconv.");
|
||||
throw
|
||||
L"One or more characters do not fit in the selected "
|
||||
L"encoding and the version of iconv Aegisub was built with"
|
||||
L" does not have useful fallbacks. For best results, "
|
||||
L"please rebuild Aegisub using a recent version of GNU iconv.";
|
||||
}
|
||||
return wxCONV_FAILED;
|
||||
#else
|
||||
|
@ -287,7 +226,7 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
|
|||
}
|
||||
if (res == iconv_failed && err == EILSEQ) {
|
||||
// Conversion still failed with transliteration enabled, so try our substitution
|
||||
iconvctl(cd, ICONV_SET_FALLBACKS, const_cast<iconv_fallbacks *>(&fallbacks));
|
||||
iconvctl(cd, ICONV_SET_FALLBACKS, &fallbacks);
|
||||
res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
|
||||
err = errno;
|
||||
iconvctl(cd, ICONV_SET_FALLBACKS, NULL);
|
||||
|
@ -309,13 +248,11 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
|
|||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param code
|
||||
/// @param callback
|
||||
/// @param callback_arg
|
||||
/// @param convPtr
|
||||
/// @return
|
||||
///
|
||||
/// @brief GNU iconv character substitution callback
|
||||
/// @param code Unicode character which could not be converted
|
||||
/// @param callback Callback to tell iconv what string to use instead
|
||||
/// @param callback_arg Iconv userdata for callback
|
||||
/// @param convPtr AegisubCSConv instance to use
|
||||
void AegisubCSConv::ucToMbFallback(
|
||||
unsigned int code,
|
||||
void (*callback) (const char *buf, size_t buflen, void* callback_arg),
|
||||
|
@ -323,7 +260,8 @@ void AegisubCSConv::ucToMbFallback(
|
|||
void *convPtr)
|
||||
{
|
||||
// At some point in the future, this should probably switch to a real mapping
|
||||
// For now, there's just three cases: BOM to nothing, \ to itself (lol Shift-JIS) and everything else to ?
|
||||
// For now, there's just three cases: BOM to nothing, '\' to itself
|
||||
// (for Shift-JIS, which does not have \) and everything else to '?'
|
||||
if (code == 0xFEFF) return;
|
||||
if (code == 0x5C) callback("\\", 1, callback_arg);
|
||||
else {
|
||||
|
@ -333,13 +271,10 @@ void AegisubCSConv::ucToMbFallback(
|
|||
}
|
||||
|
||||
#ifndef ICONV_POSIX
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param namescount
|
||||
/// @param names
|
||||
/// @param data
|
||||
/// @return
|
||||
///
|
||||
/// @brief Callback for iconvlist
|
||||
/// @param namescount Number of names in names
|
||||
/// @param names Names to add to the list
|
||||
/// @param data Unused userdata field
|
||||
int addEncoding(unsigned int namescount, const char * const * names, void* data) {
|
||||
for (unsigned int i = 0; i < namescount; i++) {
|
||||
supportedEncodings->Add(wxString::FromAscii(names[i]));
|
||||
|
@ -348,10 +283,6 @@ int addEncoding(unsigned int namescount, const char * const * names, void* data)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @return
|
||||
///
|
||||
wxArrayString AegisubCSConv::GetAllSupportedEncodings() {
|
||||
#if wxUSE_THREADS
|
||||
wxMutexLocker lock(encodingListMutex);
|
||||
|
@ -366,13 +297,8 @@ wxArrayString AegisubCSConv::GetAllSupportedEncodings() {
|
|||
return *supportedEncodings;
|
||||
}
|
||||
|
||||
|
||||
/// @brief Map pretty names to the real encoding names
|
||||
/// @param name
|
||||
/// @return
|
||||
///
|
||||
wxString AegisubCSConv::GetRealEncodingName(wxString name) {
|
||||
if (name.Lower() == _T("local")) return wxLocale::GetSystemEncodingName();
|
||||
if (name.Lower() == L"local") return wxLocale::GetSystemEncodingName();
|
||||
if (prettyEncodingList == NULL) return name;
|
||||
|
||||
PrettyNamesHash::iterator realName = prettyEncodingHash->find(name);
|
||||
|
@ -382,9 +308,6 @@ wxString AegisubCSConv::GetRealEncodingName(wxString name) {
|
|||
return name;
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
///
|
||||
wxArrayString AegisubCSConv::GetEncodingsList() {
|
||||
#if wxUSE_THREADS
|
||||
wxMutexLocker lock(encodingListMutex);
|
||||
|
@ -511,10 +434,10 @@ wxArrayString AegisubCSConv::GetEncodingsList() {
|
|||
|
||||
PrettyNamesHash *map = new PrettyNamesHash(100);
|
||||
wxArrayString *arr = new wxArrayString();
|
||||
arr->Add(_T("Local"));
|
||||
arr->Add(L"Local");
|
||||
|
||||
for (int i = 0; encodingNames[i].real != NULL; i++) {
|
||||
// Verify that iconv actually supports this encoding
|
||||
// Verify that iconv actually supports converting to and from this encoding
|
||||
iconv_t cd = iconv_open(encodingNames[i].real, WCHAR_T_ENCODING);
|
||||
if (cd == iconv_invalid) continue;
|
||||
iconv_close(cd);
|
||||
|
@ -533,7 +456,5 @@ wxArrayString AegisubCSConv::GetEncodingsList() {
|
|||
}
|
||||
return *prettyEncodingList;
|
||||
}
|
||||
static AegisubCSConv localConv(_T("Local"), false);
|
||||
static AegisubCSConv localConv(L"Local", false);
|
||||
AegisubCSConv& csConvLocal(localConv);
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2009, Thomas Goyne
|
||||
// Copyright (c) 2010, Thomas Goyne
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
|
@ -34,9 +34,6 @@
|
|||
/// @ingroup utility
|
||||
///
|
||||
|
||||
|
||||
|
||||
|
||||
#ifndef AGI_PRE
|
||||
#include <iconv.h>
|
||||
#include <wchar.h>
|
||||
|
@ -49,64 +46,77 @@
|
|||
|
||||
#include "aegisub_endian.h"
|
||||
|
||||
|
||||
#if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG)
|
||||
|
||||
/// DOCME
|
||||
#define ICONV_POSIX
|
||||
#endif
|
||||
|
||||
/// @class iconv_wrapper
|
||||
/// @brief RAII wrapper for iconv
|
||||
class iconv_wrapper {
|
||||
private:
|
||||
iconv_t conv;
|
||||
public:
|
||||
iconv_wrapper(const char *to, const char *from)
|
||||
: conv(iconv_open(to, from))
|
||||
{ }
|
||||
iconv_wrapper(wxString const& to, wxString const& from)
|
||||
: conv(iconv_open(to.ToAscii(), from.ToAscii()))
|
||||
{ }
|
||||
iconv_wrapper(const char *to, wxString const& from)
|
||||
: conv(iconv_open(to, from.ToAscii()))
|
||||
{ }
|
||||
iconv_wrapper(wxString const& to, const char *from)
|
||||
: conv(iconv_open(to.ToAscii(), from))
|
||||
{ }
|
||||
~iconv_wrapper() {
|
||||
if (conv != (iconv_t)-1) iconv_close(conv);
|
||||
}
|
||||
operator iconv_t() {
|
||||
return conv;
|
||||
}
|
||||
operator const iconv_t() const {
|
||||
return conv;
|
||||
}
|
||||
};
|
||||
|
||||
/// DOCME
|
||||
/// @class AegisubCSConv
|
||||
/// @brief DOCME
|
||||
///
|
||||
/// DOCME
|
||||
/// @brief wxMBConv implementation for converting to and from unicode
|
||||
class AegisubCSConv : public wxMBConv {
|
||||
public:
|
||||
// By default, any conversion that would be lossy will fail
|
||||
// When enableSubst is true, conversions to multibyte with a sufficiently large buffer
|
||||
// are guaranteed to succeed, with characters dropped or changed as needed to fit the
|
||||
// string into the target encoding.
|
||||
/// @param mbEncName Multibyte encoding to convert to/from
|
||||
/// @param enableSubst Whether to substitute characters when needed.
|
||||
/// By default, any conversion that would be lossy will fail
|
||||
/// When enableSubst is true, conversions to multibyte with a sufficiently
|
||||
/// large buffer are guaranteed to succeed, with characters dropped or
|
||||
/// changed as needed to fit the string into the target encoding.
|
||||
AegisubCSConv(const wxChar *mbEncName, bool enableSubst = false);
|
||||
virtual ~AegisubCSConv();
|
||||
|
||||
// wxMBConv implementation; see strconv.h for usage details
|
||||
virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const;
|
||||
virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const;
|
||||
virtual size_t GetMBNulLen() const;
|
||||
virtual wxMBConv *Clone() const;
|
||||
size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const;
|
||||
size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const;
|
||||
size_t GetMBNulLen() const;
|
||||
wxMBConv *Clone() const;
|
||||
|
||||
// Get the length (in bytes) of a null-terminated string whose encoding is mbEncName
|
||||
/// @brief Multibyte-aware strlen
|
||||
/// @return Length in bytes of str (excluding terminator)
|
||||
size_t MBBuffLen(const char *str) const;
|
||||
|
||||
// Get a list of supported encodings with somewhat user-friendly names
|
||||
/// @brief Get a list of supported encodings with user-friendly names
|
||||
static wxArrayString GetEncodingsList();
|
||||
// Get a list of all encodings supported by iconv
|
||||
/// @brief Get a list of all encodings supported by iconv
|
||||
/// Requires GNU iconv for useful results
|
||||
static wxArrayString GetAllSupportedEncodings();
|
||||
// Map a user-friendly encoding name to iconv's name
|
||||
/// @brief Map a user-friendly encoding name to the real encoding name
|
||||
static wxString GetRealEncodingName(wxString name);
|
||||
|
||||
protected:
|
||||
|
||||
/// DOCME
|
||||
|
||||
/// DOCME
|
||||
iconv_t m2w, w2m;
|
||||
|
||||
private:
|
||||
|
||||
/// DOCME
|
||||
// The smattering of mutable variables here exists because ToWChar and
|
||||
// FromWChar are const in wxMBConv, but we require minor mutation for
|
||||
// things like locks (as iconv is not thread-safe)
|
||||
wxString wcCharsetName;
|
||||
|
||||
/// DOCME
|
||||
wxString mbCharsetName;
|
||||
|
||||
/// DOCME
|
||||
size_t mbNulLen;
|
||||
|
||||
/// DOCME
|
||||
bool enableSubst;
|
||||
mutable size_t mbNulLen;
|
||||
bool enableSubst;
|
||||
|
||||
size_t doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const;
|
||||
size_t iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) const;
|
||||
|
@ -117,56 +127,43 @@ private:
|
|||
void *callback_arg,
|
||||
void *convPtr);
|
||||
|
||||
/// DOCME
|
||||
/// Replacement character for characters which do not fit in the target
|
||||
/// encoding and iconv does not have an appropriate substitute for
|
||||
char invalidRep[8];
|
||||
|
||||
/// DOCME
|
||||
size_t invalidRepSize;
|
||||
|
||||
#ifndef ICONV_POSIX
|
||||
|
||||
/// DOCME
|
||||
iconv_fallbacks fallbacks;
|
||||
mutable iconv_fallbacks fallbacks;
|
||||
#endif
|
||||
|
||||
#if wxUSE_THREADS
|
||||
|
||||
/// DOCME
|
||||
wxMutex iconvMutex;
|
||||
mutable wxMutex iconvMutex;
|
||||
#endif
|
||||
|
||||
protected:
|
||||
iconv_wrapper m2w, w2m;
|
||||
};
|
||||
|
||||
// Predefined conversion for the current locale. Should be a drop-in replacement for wxConvLocal
|
||||
// Predefined conversion for the current locale, intended to be a drop-in
|
||||
// replacement for wxConvLocal
|
||||
extern AegisubCSConv& csConvLocal;
|
||||
|
||||
#ifdef HAVE_BIG_ENDIAN
|
||||
# if SIZEOF_WCHAR_T == 4
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING "UTF-32BE"
|
||||
# elif SIZEOF_WCHAR_T == 2
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING "UTF-16BE"
|
||||
# endif
|
||||
#elif defined(HAVE_LITTLE_ENDIAN)
|
||||
# if SIZEOF_WCHAR_T == 4
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING "UTF-32LE"
|
||||
# elif SIZEOF_WCHAR_T == 2
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING "UTF-16LE"
|
||||
# endif
|
||||
#else
|
||||
# if SIZEOF_WCHAR_T == 4
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-32BE" : "UTF-32LE")
|
||||
# elif SIZEOF_WCHAR_T == 2
|
||||
|
||||
/// DOCME
|
||||
# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-16BE" : "UTF-16LE")
|
||||
# endif
|
||||
#endif
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2005, Rodrigo Braz Monteiro
|
||||
// Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
|
@ -51,13 +51,6 @@
|
|||
#endif
|
||||
#include "text_file_reader.h"
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param filename
|
||||
/// @param enc
|
||||
/// @param trim
|
||||
/// @return
|
||||
///
|
||||
TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim)
|
||||
: encoding(enc), conv((iconv_t)-1), trim(trim), readComplete(false), currout(0), outptr(0), currentLine(0) {
|
||||
#ifdef __WINDOWS__
|
||||
|
@ -65,29 +58,22 @@ TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim)
|
|||
#else
|
||||
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
|
||||
#endif
|
||||
if (!file.is_open()) {
|
||||
throw _T("Failed opening file for reading.");
|
||||
}
|
||||
if (!file.is_open()) throw L"Failed opening file for reading.";
|
||||
|
||||
if (encoding.IsEmpty()) encoding = GetEncoding(filename);
|
||||
if (encoding == _T("binary")) return;
|
||||
if (encoding == L"binary") return;
|
||||
encoding = AegisubCSConv::GetRealEncodingName(encoding);
|
||||
conv = iconv_open(WCHAR_T_ENCODING, encoding.ToAscii());
|
||||
if (conv == (iconv_t)-1) {
|
||||
throw wxString::Format(L"Character set '%s' is not supported.", enc.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
///
|
||||
TextFileReader::~TextFileReader() {
|
||||
if (conv != (iconv_t)-1) iconv_close(conv);
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param filename
|
||||
/// @return
|
||||
///
|
||||
wxString TextFileReader::GetEncoding(const wxString filename) {
|
||||
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
||||
// Prepare
|
||||
unsigned char b[4];
|
||||
memset(b, 0, sizeof(b));
|
||||
|
@ -100,27 +86,27 @@ wxString TextFileReader::GetEncoding(const wxString filename) {
|
|||
ifile.open(wxFNCONV(filename));
|
||||
#endif
|
||||
if (!ifile.is_open()) {
|
||||
return _T("unknown");
|
||||
return L"unknown";
|
||||
}
|
||||
ifile.read(reinterpret_cast<char *>(b),4);
|
||||
ifile.close();
|
||||
|
||||
// Try to get the byte order mark from them
|
||||
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return _T("UTF-8");
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return _T("UTF-32LE");
|
||||
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return _T("UTF-32BE");
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE) return _T("UTF-16LE");
|
||||
else if (b[0] == 0xFE && b[1] == 0xFF) return _T("UTF-16BE");
|
||||
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return _T("UTF-7");
|
||||
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
|
||||
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
|
||||
else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
|
||||
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
|
||||
|
||||
// Try to guess UTF-16
|
||||
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return _T("UTF-16BE");
|
||||
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return _T("UTF-16LE");
|
||||
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
|
||||
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
|
||||
|
||||
// If any of the first four bytes are under 0x20 (the first printable character),
|
||||
// except for 9-13 range, assume binary
|
||||
for (int i=0;i<4;i++) {
|
||||
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return _T("binary");
|
||||
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
|
||||
}
|
||||
|
||||
#ifdef WITH_UNIVCHARDET
|
||||
|
@ -129,14 +115,10 @@ wxString TextFileReader::GetEncoding(const wxString filename) {
|
|||
return det.GetEncoding(filename);
|
||||
#else
|
||||
// Fall back to local
|
||||
return _T("Local");
|
||||
return L"local";
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @return
|
||||
///
|
||||
wchar_t TextFileReader::GetWChar() {
|
||||
// If there's already some converted characters waiting, return the next one
|
||||
if (++currout < outptr) {
|
||||
|
@ -174,7 +156,7 @@ wchar_t TextFileReader::GetWChar() {
|
|||
// adding one byte to the input buffer until either it succeeds or we add enough bytes to
|
||||
// complete any character
|
||||
if (++bytesAdded > 3)
|
||||
throw wxString::Format(_T("Invalid input character found near line %u"), currentLine);
|
||||
throw wxString::Format(L"Invalid input character found near line %u", currentLine);
|
||||
|
||||
file.read(inptr + inbytesleft, 1);
|
||||
inbytesleft++;
|
||||
|
@ -183,34 +165,27 @@ wchar_t TextFileReader::GetWChar() {
|
|||
if (outptr > outbuf)
|
||||
return *currout;
|
||||
|
||||
throw wxString::Format(_T("Invalid input character found near line %u"), currentLine);
|
||||
throw wxString::Format(L"Invalid input character found near line %u", currentLine);
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @return
|
||||
///
|
||||
wxString TextFileReader::ReadLineFromFile() {
|
||||
wxString buffer;
|
||||
size_t bufAlloc = 1024;
|
||||
buffer.Alloc(bufAlloc);
|
||||
buffer.Alloc(1024);
|
||||
|
||||
currentLine++;
|
||||
// Read a line
|
||||
wchar_t ch;
|
||||
size_t len = 0;
|
||||
bool first = true;
|
||||
// This doesn't work for \r delimited files, but it's very unlikely
|
||||
// that we'll run into one of those
|
||||
for (ch = GetWChar(); ch != L'\n' && ch != 0; ch = GetWChar()) {
|
||||
if (ch == L'\r') continue;
|
||||
// Skip the BOM -- we don't need it as the encoding is already known
|
||||
// and it sometimes causes conversion problems
|
||||
if (ch == 0xFEFF && len == 0) continue;
|
||||
if (ch == 0xFEFF && first) continue;
|
||||
|
||||
if (len >= bufAlloc - 1) {
|
||||
bufAlloc *= 2;
|
||||
buffer.Alloc(bufAlloc);
|
||||
}
|
||||
buffer += ch;
|
||||
len++;
|
||||
first = false;
|
||||
}
|
||||
if (ch == 0)
|
||||
readComplete = true;
|
||||
|
@ -223,36 +198,10 @@ wxString TextFileReader::ReadLineFromFile() {
|
|||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @return
|
||||
///
|
||||
bool TextFileReader::HasMoreLines() {
|
||||
return !readComplete;
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param encoding
|
||||
/// @return
|
||||
///
|
||||
void TextFileReader::EnsureValid(wxString enc) {
|
||||
if (enc == _T("binary")) return;
|
||||
|
||||
enc = AegisubCSConv::GetRealEncodingName(enc);
|
||||
iconv_t cd = iconv_open(WCHAR_T_ENCODING, enc.ToAscii());
|
||||
bool canOpen = cd != (iconv_t)-1;
|
||||
iconv_close(cd);
|
||||
if (!canOpen) {
|
||||
throw wxString::Format(_T("Character set %s is not supported."), enc.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
///
|
||||
wxString TextFileReader::GetCurrentEncoding() {
|
||||
return encoding;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2005, Rodrigo Braz Monteiro
|
||||
// Copyright (c) 2010, Rodrigo Braz Monteiro
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
|
@ -45,62 +45,53 @@
|
|||
#include <wx/string.h>
|
||||
#endif
|
||||
|
||||
|
||||
/// DOCME
|
||||
/// @class TextFileReader
|
||||
/// @brief DOCME
|
||||
///
|
||||
/// DOCME
|
||||
/// @brief A line-based text file reader
|
||||
class TextFileReader {
|
||||
private:
|
||||
|
||||
/// DOCME
|
||||
/// Encoding of the file being read
|
||||
wxString encoding;
|
||||
|
||||
/// DOCME
|
||||
std::ifstream file;
|
||||
|
||||
/// DOCME
|
||||
iconv_t conv;
|
||||
|
||||
/// DOCME
|
||||
bool trim;
|
||||
|
||||
/// DOCME
|
||||
bool readComplete;
|
||||
|
||||
|
||||
/// DOCME
|
||||
// Iconv buffers and state
|
||||
wchar_t outbuf[256];
|
||||
|
||||
/// DOCME
|
||||
wchar_t *currout;
|
||||
|
||||
/// DOCME
|
||||
wchar_t *outptr;
|
||||
|
||||
/// DOCME
|
||||
size_t outbytesleft;
|
||||
|
||||
|
||||
/// DOCME
|
||||
/// Current line number
|
||||
unsigned int currentLine;
|
||||
|
||||
/// @brief Read a single wchar_t from the file
|
||||
wchar_t GetWChar();
|
||||
|
||||
TextFileReader(const TextFileReader&);
|
||||
TextFileReader& operator=(const TextFileReader&);
|
||||
|
||||
public:
|
||||
TextFileReader(wxString filename,wxString encoding=_T(""), bool trim=true);
|
||||
/// @brief Constructor
|
||||
/// @param filename File to open
|
||||
/// @param enc Encoding to use, or empty to autodetect
|
||||
/// @param trim Whether to trim whitespace from lines read
|
||||
TextFileReader(wxString filename,wxString encoding=L"", bool trim=true);
|
||||
/// @brief Destructor
|
||||
~TextFileReader();
|
||||
|
||||
/// @brief Read a line from the file
|
||||
/// @return The line, possibly trimmed
|
||||
wxString ReadLineFromFile();
|
||||
/// @brief Check if there are any more lines to read
|
||||
bool HasMoreLines();
|
||||
|
||||
static void EnsureValid(const wxString encoding);
|
||||
/// @brief Get the file encoding used by this reader
|
||||
/// @return "unknown", "binary", or a character encoding name
|
||||
wxString GetCurrentEncoding();
|
||||
static wxString GetEncoding(const wxString filename);
|
||||
|
||||
/// @brief Attempt to detect a file's encoding
|
||||
/// @param filename The file to check
|
||||
/// @return "unknown", "binary", or a character encoding name
|
||||
static wxString GetEncoding(wxString const& filename);
|
||||
};
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue