Add some documentation for AegisubCSConv and TextFileReader.

Originally committed to SVN as r4036.
This commit is contained in:
Thomas Goyne 2010-01-24 18:56:51 +00:00
parent c7d95e5590
commit ba088237d7
5 changed files with 155 additions and 298 deletions

View file

@ -103,7 +103,6 @@ void AssFile::Load (const wxString _filename,const wxString charset,bool addToRe
wxString enc; wxString enc;
if (charset.IsEmpty()) enc = TextFileReader::GetEncoding(_filename); if (charset.IsEmpty()) enc = TextFileReader::GetEncoding(_filename);
else enc = charset; else enc = charset;
TextFileReader::EnsureValid(enc);
// Generic preparation // Generic preparation
Clear(); Clear();

View file

@ -49,52 +49,27 @@
WX_DECLARE_STRING_HASH_MAP(wxString, PrettyNamesHash); WX_DECLARE_STRING_HASH_MAP(wxString, PrettyNamesHash);
#if wxUSE_THREADS #if wxUSE_THREADS
/// DOCME
static wxMutex encodingListMutex; static wxMutex encodingListMutex;
#endif #endif
/// DOCME
static const iconv_t iconv_invalid = (iconv_t)-1; static const iconv_t iconv_invalid = (iconv_t)-1;
/// DOCME
static const size_t iconv_failed = (size_t)-1; static const size_t iconv_failed = (size_t)-1;
/// DOCME
#define ICONV_CONST_CAST(a) const_cast<ICONV_CONST char *>(a) #define ICONV_CONST_CAST(a) const_cast<ICONV_CONST char *>(a)
#ifndef ICONV_POSIX
static int addEncoding(unsigned int namescount, const char * const * names, void* data);
#endif
/// DOCME
static wxArrayString *supportedEncodings = NULL; static wxArrayString *supportedEncodings = NULL;
/// DOCME
static wxArrayString *prettyEncodingList = NULL; static wxArrayString *prettyEncodingList = NULL;
/// DOCME
static PrettyNamesHash *prettyEncodingHash = NULL; static PrettyNamesHash *prettyEncodingHash = NULL;
/// @brief DOCME
/// @param mbEncName
/// @param enableSubst
///
AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst) AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst)
: mbCharsetName(GetRealEncodingName(mbEncName)), mbNulLen(0), enableSubst(enableSubst) : wcCharsetName(WCHAR_T_ENCODING)
, mbCharsetName(GetRealEncodingName(mbEncName))
, mbNulLen(0)
, enableSubst(enableSubst)
, m2w(wcCharsetName, mbCharsetName)
, w2m(mbCharsetName, wcCharsetName)
{ {
wcCharsetName = wxString::FromAscii(WCHAR_T_ENCODING);
m2w = iconv_open(wcCharsetName.ToAscii(), mbCharsetName.ToAscii());
w2m = iconv_open(mbCharsetName.ToAscii(), wcCharsetName.ToAscii());
if (m2w == iconv_invalid || w2m == iconv_invalid) { if (m2w == iconv_invalid || w2m == iconv_invalid) {
if (m2w != iconv_invalid) iconv_close(m2w); throw wxString::Format(L"Character set %s is not supported.", mbEncName);
if (w2m != iconv_invalid) iconv_close(w2m);
throw wxString::Format(_T("Character set %s is not supported."), mbEncName);
} }
if (enableSubst) { if (enableSubst) {
@ -110,26 +85,14 @@ AegisubCSConv::AegisubCSConv(const wxChar *mbEncName, bool enableSubst)
} }
} }
/// @brief DOCME
///
AegisubCSConv::~AegisubCSConv() {
if (m2w != iconv_invalid) iconv_close(m2w);
if (w2m != iconv_invalid) iconv_close(w2m);
}
/// @brief DOCME
/// @return
///
wxMBConv * AegisubCSConv::Clone() const { wxMBConv * AegisubCSConv::Clone() const {
AegisubCSConv *c = new AegisubCSConv(mbCharsetName); AegisubCSConv *c = new AegisubCSConv(mbCharsetName);
c->mbNulLen = mbNulLen; c->mbNulLen = mbNulLen;
return c; return c;
} }
/// @brief Calculate the size of NUL in the target encoding via iconv /// @brief Calculate the size of NUL in the target encoding via iconv
/// @return /// @return The size in bytes of NUL
///
size_t AegisubCSConv::GetMBNulLen() const { size_t AegisubCSConv::GetMBNulLen() const {
if (mbNulLen == 0) { if (mbNulLen == 0) {
const wchar_t nulStr[] = L""; const wchar_t nulStr[] = L"";
@ -142,18 +105,13 @@ size_t AegisubCSConv::GetMBNulLen() const {
size_t res = iconv(w2m, &inPtr, &inLen, &outPtr, &outLen); size_t res = iconv(w2m, &inPtr, &inLen, &outPtr, &outLen);
if (res != 0) if (res != 0)
const_cast<AegisubCSConv *>(this)->mbNulLen = (size_t)-1; mbNulLen = (size_t)-1;
else else
const_cast<AegisubCSConv *>(this)->mbNulLen = sizeof(outBuff) - outLen; mbNulLen = sizeof(outBuff) - outLen;
} }
return mbNulLen; return mbNulLen;
} }
/// @brief Calculate the length (in bytes) of a MB string, not including the terminator
/// @param str
/// @return
///
size_t AegisubCSConv::MBBuffLen(const char * str) const { size_t AegisubCSConv::MBBuffLen(const char * str) const {
size_t nulLen = GetMBNulLen(); size_t nulLen = GetMBNulLen();
const char *ptr; const char *ptr;
@ -171,14 +129,12 @@ size_t AegisubCSConv::MBBuffLen(const char * str) const {
} }
} }
/// @brief Convert a string from multibyte to wide characters
/// @brief DOCME /// @param dst Destination buffer.
/// @param dst /// @param dstSize Length of destination buffer in wchar_ts
/// @param dstSize /// @param src Source multibyte string
/// @param src /// @param srcLen Length of source buffer in bytes, or -1 to autodetect
/// @param srcLen /// @return The number of wchar_ts needed to store the string in the target charset
/// @return
///
size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, size_t srcLen) const { size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, size_t srcLen) const {
return doConversion( return doConversion(
m2w, m2w,
@ -189,14 +145,12 @@ size_t AegisubCSConv::ToWChar(wchar_t *dst, size_t dstSize, const char *src, siz
) / sizeof(wchar_t); ) / sizeof(wchar_t);
} }
/// @brief Convert a string from wide characters to multibyte
/// @brief DOCME /// @param dst Destination buffer
/// @param dst /// @param dstSize Length of destination buffer in bytes
/// @param dstSize /// @param src Source wide character string
/// @param src /// @param srcLen Length in wchar_ts of source, or -1 to autodetect
/// @param srcLen /// @return The number of bytes needed to store the string in the target charset
/// @return
///
size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcLen) const { size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcLen) const {
return doConversion( return doConversion(
w2m, w2m,
@ -207,15 +161,7 @@ size_t AegisubCSConv::FromWChar(char *dst, size_t dstSize, const wchar_t *src, s
); );
} }
// Perform a conversion if a buffer is given or calculate the needed buffer size if not
/// @brief DOCME
/// @param cd
/// @param dst
/// @param dstSize
/// @param src
/// @param srcSize
/// @return
///
size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const { size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const {
if (dstSize > 0) { if (dstSize > 0) {
return iconvWrapper(cd, &src, &srcSize, &dst, &dstSize); return iconvWrapper(cd, &src, &srcSize, &dst, &dstSize);
@ -239,20 +185,12 @@ size_t AegisubCSConv::doConversion(iconv_t cd, char *dst, size_t dstSize, char *
return charsWritten; return charsWritten;
} }
// Actually perform a conversion via iconv
/// @brief DOCME
/// @param cd
/// @param inbuf
/// @param inbytesleft
/// @param outbuf
/// @param outbytesleft
/// @return
///
size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft, size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft) const { char **outbuf, size_t *outbytesleft) const {
#if wxUSE_THREADS #if wxUSE_THREADS
wxMutexLocker lock(const_cast<AegisubCSConv *>(this)->iconvMutex); wxMutexLocker lock(iconvMutex);
#endif #endif
char *outbuforig = *outbuf; char *outbuforig = *outbuf;
@ -265,10 +203,11 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
#ifdef ICONV_POSIX #ifdef ICONV_POSIX
if (errno == EILSEQ) { if (errno == EILSEQ) {
throw _T("One or more characters do not fit in the selected ") throw
_T("encoding and the version of iconv Aegisub was built with") L"One or more characters do not fit in the selected "
_T(" does not have useful fallbacks. For best results, ") L"encoding and the version of iconv Aegisub was built with"
_T("please rebuild Aegisub using a recent version of GNU iconv."); L" does not have useful fallbacks. For best results, "
L"please rebuild Aegisub using a recent version of GNU iconv.";
} }
return wxCONV_FAILED; return wxCONV_FAILED;
#else #else
@ -287,7 +226,7 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
} }
if (res == iconv_failed && err == EILSEQ) { if (res == iconv_failed && err == EILSEQ) {
// Conversion still failed with transliteration enabled, so try our substitution // Conversion still failed with transliteration enabled, so try our substitution
iconvctl(cd, ICONV_SET_FALLBACKS, const_cast<iconv_fallbacks *>(&fallbacks)); iconvctl(cd, ICONV_SET_FALLBACKS, &fallbacks);
res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); res = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
err = errno; err = errno;
iconvctl(cd, ICONV_SET_FALLBACKS, NULL); iconvctl(cd, ICONV_SET_FALLBACKS, NULL);
@ -309,13 +248,11 @@ size_t AegisubCSConv::iconvWrapper(iconv_t cd, char **inbuf, size_t *inbytesleft
} }
/// @brief DOCME /// @brief GNU iconv character substitution callback
/// @param code /// @param code Unicode character which could not be converted
/// @param callback /// @param callback Callback to tell iconv what string to use instead
/// @param callback_arg /// @param callback_arg Iconv userdata for callback
/// @param convPtr /// @param convPtr AegisubCSConv instance to use
/// @return
///
void AegisubCSConv::ucToMbFallback( void AegisubCSConv::ucToMbFallback(
unsigned int code, unsigned int code,
void (*callback) (const char *buf, size_t buflen, void* callback_arg), void (*callback) (const char *buf, size_t buflen, void* callback_arg),
@ -323,7 +260,8 @@ void AegisubCSConv::ucToMbFallback(
void *convPtr) void *convPtr)
{ {
// At some point in the future, this should probably switch to a real mapping // At some point in the future, this should probably switch to a real mapping
// For now, there's just three cases: BOM to nothing, \ to itself (lol Shift-JIS) and everything else to ? // For now, there's just three cases: BOM to nothing, '\' to itself
// (for Shift-JIS, which does not have \) and everything else to '?'
if (code == 0xFEFF) return; if (code == 0xFEFF) return;
if (code == 0x5C) callback("\\", 1, callback_arg); if (code == 0x5C) callback("\\", 1, callback_arg);
else { else {
@ -333,13 +271,10 @@ void AegisubCSConv::ucToMbFallback(
} }
#ifndef ICONV_POSIX #ifndef ICONV_POSIX
/// @brief Callback for iconvlist
/// @brief DOCME /// @param namescount Number of names in names
/// @param namescount /// @param names Names to add to the list
/// @param names /// @param data Unused userdata field
/// @param data
/// @return
///
int addEncoding(unsigned int namescount, const char * const * names, void* data) { int addEncoding(unsigned int namescount, const char * const * names, void* data) {
for (unsigned int i = 0; i < namescount; i++) { for (unsigned int i = 0; i < namescount; i++) {
supportedEncodings->Add(wxString::FromAscii(names[i])); supportedEncodings->Add(wxString::FromAscii(names[i]));
@ -348,10 +283,6 @@ int addEncoding(unsigned int namescount, const char * const * names, void* data)
} }
#endif #endif
/// @brief DOCME
/// @return
///
wxArrayString AegisubCSConv::GetAllSupportedEncodings() { wxArrayString AegisubCSConv::GetAllSupportedEncodings() {
#if wxUSE_THREADS #if wxUSE_THREADS
wxMutexLocker lock(encodingListMutex); wxMutexLocker lock(encodingListMutex);
@ -366,13 +297,8 @@ wxArrayString AegisubCSConv::GetAllSupportedEncodings() {
return *supportedEncodings; return *supportedEncodings;
} }
/// @brief Map pretty names to the real encoding names
/// @param name
/// @return
///
wxString AegisubCSConv::GetRealEncodingName(wxString name) { wxString AegisubCSConv::GetRealEncodingName(wxString name) {
if (name.Lower() == _T("local")) return wxLocale::GetSystemEncodingName(); if (name.Lower() == L"local") return wxLocale::GetSystemEncodingName();
if (prettyEncodingList == NULL) return name; if (prettyEncodingList == NULL) return name;
PrettyNamesHash::iterator realName = prettyEncodingHash->find(name); PrettyNamesHash::iterator realName = prettyEncodingHash->find(name);
@ -382,9 +308,6 @@ wxString AegisubCSConv::GetRealEncodingName(wxString name) {
return name; return name;
} }
/// @brief DOCME
///
wxArrayString AegisubCSConv::GetEncodingsList() { wxArrayString AegisubCSConv::GetEncodingsList() {
#if wxUSE_THREADS #if wxUSE_THREADS
wxMutexLocker lock(encodingListMutex); wxMutexLocker lock(encodingListMutex);
@ -511,10 +434,10 @@ wxArrayString AegisubCSConv::GetEncodingsList() {
PrettyNamesHash *map = new PrettyNamesHash(100); PrettyNamesHash *map = new PrettyNamesHash(100);
wxArrayString *arr = new wxArrayString(); wxArrayString *arr = new wxArrayString();
arr->Add(_T("Local")); arr->Add(L"Local");
for (int i = 0; encodingNames[i].real != NULL; i++) { for (int i = 0; encodingNames[i].real != NULL; i++) {
// Verify that iconv actually supports this encoding // Verify that iconv actually supports converting to and from this encoding
iconv_t cd = iconv_open(encodingNames[i].real, WCHAR_T_ENCODING); iconv_t cd = iconv_open(encodingNames[i].real, WCHAR_T_ENCODING);
if (cd == iconv_invalid) continue; if (cd == iconv_invalid) continue;
iconv_close(cd); iconv_close(cd);
@ -533,7 +456,5 @@ wxArrayString AegisubCSConv::GetEncodingsList() {
} }
return *prettyEncodingList; return *prettyEncodingList;
} }
static AegisubCSConv localConv(_T("Local"), false); static AegisubCSConv localConv(L"Local", false);
AegisubCSConv& csConvLocal(localConv); AegisubCSConv& csConvLocal(localConv);

View file

@ -1,4 +1,4 @@
// Copyright (c) 2009, Thomas Goyne // Copyright (c) 2010, Thomas Goyne
// All rights reserved. // All rights reserved.
// //
// Redistribution and use in source and binary forms, with or without // Redistribution and use in source and binary forms, with or without
@ -34,9 +34,6 @@
/// @ingroup utility /// @ingroup utility
/// ///
#ifndef AGI_PRE #ifndef AGI_PRE
#include <iconv.h> #include <iconv.h>
#include <wchar.h> #include <wchar.h>
@ -49,63 +46,76 @@
#include "aegisub_endian.h" #include "aegisub_endian.h"
#if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG) #if !defined(_LIBICONV_VERSION) || _LIBICONV_VERSION < 0x010A || defined(LIBICONV_PLUG)
/// DOCME
#define ICONV_POSIX #define ICONV_POSIX
#endif #endif
/// @class iconv_wrapper
/// @brief RAII wrapper for iconv
///
/// Opens a conversion descriptor with iconv_open() on construction and
/// releases it with iconv_close() on destruction. The constructors do not
/// report failure themselves: if iconv_open() fails the wrapped descriptor
/// is (iconv_t)-1, which callers are expected to check for after converting
/// the wrapper to iconv_t.
class iconv_wrapper {
private:
	/// The wrapped descriptor, or (iconv_t)-1 if iconv_open() failed
	iconv_t conv;

	// Not copyable: a copy would hold the same descriptor and both
	// destructors would call iconv_close() on it. Declared but
	// intentionally left undefined.
	iconv_wrapper(const iconv_wrapper&);
	iconv_wrapper& operator=(const iconv_wrapper&);
public:
	/// @param to   Name of the encoding to convert to
	/// @param from Name of the encoding to convert from
	iconv_wrapper(const char *to, const char *from)
	: conv(iconv_open(to, from))
	{ }
	/// @param to   Name of the encoding to convert to
	/// @param from Name of the encoding to convert from
	iconv_wrapper(wxString const& to, wxString const& from)
	: conv(iconv_open(to.ToAscii(), from.ToAscii()))
	{ }
	/// @param to   Name of the encoding to convert to
	/// @param from Name of the encoding to convert from
	iconv_wrapper(const char *to, wxString const& from)
	: conv(iconv_open(to, from.ToAscii()))
	{ }
	/// @param to   Name of the encoding to convert to
	/// @param from Name of the encoding to convert from
	iconv_wrapper(wxString const& to, const char *from)
	: conv(iconv_open(to.ToAscii(), from))
	{ }
	/// Closes the descriptor if it was opened successfully
	~iconv_wrapper() {
		if (conv != (iconv_t)-1) iconv_close(conv);
	}
	/// @brief Get the underlying descriptor for use with the iconv API
	/// @return The wrapped iconv_t, or (iconv_t)-1 if opening failed
	///
	/// A single const conversion serves both const and non-const wrappers;
	/// the descriptor is returned by value, so constness of the wrapper does
	/// not affect the result.
	operator iconv_t() const {
		return conv;
	}
};
/// DOCME
/// @class AegisubCSConv /// @class AegisubCSConv
/// @brief DOCME /// @brief wxMBConv implementation for converting to and from unicode
///
/// DOCME
class AegisubCSConv : public wxMBConv { class AegisubCSConv : public wxMBConv {
public: public:
// By default, any conversion that would be lossy will fail /// @param mbEncName Multibyte encoding to convert to/from
// When enableSubst is true, conversions to multibyte with a sufficiently large buffer /// @param enableSubst Whether to substitute characters when needed.
// are guaranteed to succeed, with characters dropped or changed as needed to fit the /// By default, any conversion that would be lossy will fail
// string into the target encoding. /// When enableSubst is true, conversions to multibyte with a sufficiently
/// large buffer are guaranteed to succeed, with characters dropped or
/// changed as needed to fit the string into the target encoding.
AegisubCSConv(const wxChar *mbEncName, bool enableSubst = false); AegisubCSConv(const wxChar *mbEncName, bool enableSubst = false);
virtual ~AegisubCSConv();
// wxMBConv implementation; see strconv.h for usage details // wxMBConv implementation; see strconv.h for usage details
virtual size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const; size_t ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen = wxNO_LEN) const;
virtual size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const; size_t FromWChar(char *dst, size_t dstLen, const wchar_t *src, size_t srcLen = wxNO_LEN) const;
virtual size_t GetMBNulLen() const; size_t GetMBNulLen() const;
virtual wxMBConv *Clone() const; wxMBConv *Clone() const;
// Get the length (in bytes) of a null-terminated string whose encoding is mbEncName /// @brief Multibyte-aware strlen
/// @return Length in bytes of str (excluding terminator)
size_t MBBuffLen(const char *str) const; size_t MBBuffLen(const char *str) const;
// Get a list of supported encodings with somewhat user-friendly names /// @brief Get a list of supported encodings with user-friendly names
static wxArrayString GetEncodingsList(); static wxArrayString GetEncodingsList();
// Get a list of all encodings supported by iconv /// @brief Get a list of all encodings supported by iconv
/// Requires GNU iconv for useful results
static wxArrayString GetAllSupportedEncodings(); static wxArrayString GetAllSupportedEncodings();
// Map a user-friendly encoding name to iconv's name /// @brief Map a user-friendly encoding name to the real encoding name
static wxString GetRealEncodingName(wxString name); static wxString GetRealEncodingName(wxString name);
protected:
/// DOCME
/// DOCME
iconv_t m2w, w2m;
private: private:
// The handful of mutable members here exist because ToWChar and FromWChar
// are const in wxMBConv, but we still need minor mutation for things like
// locks (as iconv is not thread-safe)
wxString wcCharsetName; wxString wcCharsetName;
/// DOCME
wxString mbCharsetName; wxString mbCharsetName;
mutable size_t mbNulLen;
/// DOCME
size_t mbNulLen;
/// DOCME
bool enableSubst; bool enableSubst;
size_t doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const; size_t doConversion(iconv_t cd, char *dst, size_t dstSize, char *src, size_t srcSize) const;
@ -117,56 +127,43 @@ private:
void *callback_arg, void *callback_arg,
void *convPtr); void *convPtr);
/// DOCME /// Replacement character for characters which do not fit in the target
/// encoding and iconv does not have an appropriate substitute for
char invalidRep[8]; char invalidRep[8];
/// DOCME
size_t invalidRepSize; size_t invalidRepSize;
#ifndef ICONV_POSIX #ifndef ICONV_POSIX
mutable iconv_fallbacks fallbacks;
/// DOCME
iconv_fallbacks fallbacks;
#endif #endif
#if wxUSE_THREADS #if wxUSE_THREADS
mutable wxMutex iconvMutex;
/// DOCME
wxMutex iconvMutex;
#endif #endif
protected:
iconv_wrapper m2w, w2m;
}; };
// Predefined conversion for the current locale. Should be a drop-in replacement for wxConvLocal // Predefined conversion for the current locale, intended to be a drop-in
// replacement for wxConvLocal
extern AegisubCSConv& csConvLocal; extern AegisubCSConv& csConvLocal;
#ifdef HAVE_BIG_ENDIAN #ifdef HAVE_BIG_ENDIAN
# if SIZEOF_WCHAR_T == 4 # if SIZEOF_WCHAR_T == 4
/// DOCME
# define WCHAR_T_ENCODING "UTF-32BE" # define WCHAR_T_ENCODING "UTF-32BE"
# elif SIZEOF_WCHAR_T == 2 # elif SIZEOF_WCHAR_T == 2
/// DOCME
# define WCHAR_T_ENCODING "UTF-16BE" # define WCHAR_T_ENCODING "UTF-16BE"
# endif # endif
#elif defined(HAVE_LITTLE_ENDIAN) #elif defined(HAVE_LITTLE_ENDIAN)
# if SIZEOF_WCHAR_T == 4 # if SIZEOF_WCHAR_T == 4
/// DOCME
# define WCHAR_T_ENCODING "UTF-32LE" # define WCHAR_T_ENCODING "UTF-32LE"
# elif SIZEOF_WCHAR_T == 2 # elif SIZEOF_WCHAR_T == 2
/// DOCME
# define WCHAR_T_ENCODING "UTF-16LE" # define WCHAR_T_ENCODING "UTF-16LE"
# endif # endif
#else #else
# if SIZEOF_WCHAR_T == 4 # if SIZEOF_WCHAR_T == 4
/// DOCME
# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-32BE" : "UTF-32LE") # define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-32BE" : "UTF-32LE")
# elif SIZEOF_WCHAR_T == 2 # elif SIZEOF_WCHAR_T == 2
/// DOCME
# define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-16BE" : "UTF-16LE") # define WCHAR_T_ENCODING ((Endian::MachineToBig((uint32_t)1) == 1) ? "UTF-16BE" : "UTF-16LE")
# endif # endif
#endif #endif

View file

@ -1,4 +1,4 @@
// Copyright (c) 2005, Rodrigo Braz Monteiro // Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne
// All rights reserved. // All rights reserved.
// //
// Redistribution and use in source and binary forms, with or without // Redistribution and use in source and binary forms, with or without
@ -51,13 +51,6 @@
#endif #endif
#include "text_file_reader.h" #include "text_file_reader.h"
/// @brief DOCME
/// @param filename
/// @param enc
/// @param trim
/// @return
///
TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim) TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim)
: encoding(enc), conv((iconv_t)-1), trim(trim), readComplete(false), currout(0), outptr(0), currentLine(0) { : encoding(enc), conv((iconv_t)-1), trim(trim), readComplete(false), currout(0), outptr(0), currentLine(0) {
#ifdef __WINDOWS__ #ifdef __WINDOWS__
@ -65,29 +58,22 @@ TextFileReader::TextFileReader(wxString filename, wxString enc, bool trim)
#else #else
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary); file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
#endif #endif
if (!file.is_open()) { if (!file.is_open()) throw L"Failed opening file for reading.";
throw _T("Failed opening file for reading.");
}
if (encoding.IsEmpty()) encoding = GetEncoding(filename); if (encoding.IsEmpty()) encoding = GetEncoding(filename);
if (encoding == _T("binary")) return; if (encoding == L"binary") return;
encoding = AegisubCSConv::GetRealEncodingName(encoding); encoding = AegisubCSConv::GetRealEncodingName(encoding);
conv = iconv_open(WCHAR_T_ENCODING, encoding.ToAscii()); conv = iconv_open(WCHAR_T_ENCODING, encoding.ToAscii());
if (conv == (iconv_t)-1) {
throw wxString::Format(L"Character set '%s' is not supported.", enc.c_str());
}
} }
/// @brief DOCME
///
TextFileReader::~TextFileReader() { TextFileReader::~TextFileReader() {
if (conv != (iconv_t)-1) iconv_close(conv); if (conv != (iconv_t)-1) iconv_close(conv);
} }
wxString TextFileReader::GetEncoding(wxString const& filename) {
/// @brief DOCME
/// @param filename
/// @return
///
wxString TextFileReader::GetEncoding(const wxString filename) {
// Prepare // Prepare
unsigned char b[4]; unsigned char b[4];
memset(b, 0, sizeof(b)); memset(b, 0, sizeof(b));
@ -100,27 +86,27 @@ wxString TextFileReader::GetEncoding(const wxString filename) {
ifile.open(wxFNCONV(filename)); ifile.open(wxFNCONV(filename));
#endif #endif
if (!ifile.is_open()) { if (!ifile.is_open()) {
return _T("unknown"); return L"unknown";
} }
ifile.read(reinterpret_cast<char *>(b),4); ifile.read(reinterpret_cast<char *>(b),4);
ifile.close(); ifile.close();
// Try to get the byte order mark from them // Try to get the byte order mark from them
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return _T("UTF-8"); if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return _T("UTF-32LE"); else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return _T("UTF-32BE"); else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
else if (b[0] == 0xFF && b[1] == 0xFE) return _T("UTF-16LE"); else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
else if (b[0] == 0xFE && b[1] == 0xFF) return _T("UTF-16BE"); else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return _T("UTF-7"); else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
// Try to guess UTF-16 // Try to guess UTF-16
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return _T("UTF-16BE"); else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return _T("UTF-16LE"); else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
// If any of the first four bytes are under 0x20 (the first printable character), // If any of the first four bytes are under 0x20 (the first printable character),
// except for 9-13 range, assume binary // except for 9-13 range, assume binary
for (int i=0;i<4;i++) { for (int i=0;i<4;i++) {
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return _T("binary"); if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
} }
#ifdef WITH_UNIVCHARDET #ifdef WITH_UNIVCHARDET
@ -129,14 +115,10 @@ wxString TextFileReader::GetEncoding(const wxString filename) {
return det.GetEncoding(filename); return det.GetEncoding(filename);
#else #else
// Fall back to local // Fall back to local
return _T("Local"); return L"local";
#endif #endif
} }
/// @brief DOCME
/// @return
///
wchar_t TextFileReader::GetWChar() { wchar_t TextFileReader::GetWChar() {
// If there's already some converted characters waiting, return the next one // If there's already some converted characters waiting, return the next one
if (++currout < outptr) { if (++currout < outptr) {
@ -174,7 +156,7 @@ wchar_t TextFileReader::GetWChar() {
// adding one byte to the input buffer until either it succeeds or we add enough bytes to // adding one byte to the input buffer until either it succeeds or we add enough bytes to
// complete any character // complete any character
if (++bytesAdded > 3) if (++bytesAdded > 3)
throw wxString::Format(_T("Invalid input character found near line %u"), currentLine); throw wxString::Format(L"Invalid input character found near line %u", currentLine);
file.read(inptr + inbytesleft, 1); file.read(inptr + inbytesleft, 1);
inbytesleft++; inbytesleft++;
@ -183,34 +165,27 @@ wchar_t TextFileReader::GetWChar() {
if (outptr > outbuf) if (outptr > outbuf)
return *currout; return *currout;
throw wxString::Format(_T("Invalid input character found near line %u"), currentLine); throw wxString::Format(L"Invalid input character found near line %u", currentLine);
} }
/// @brief DOCME
/// @return
///
wxString TextFileReader::ReadLineFromFile() { wxString TextFileReader::ReadLineFromFile() {
wxString buffer; wxString buffer;
size_t bufAlloc = 1024; buffer.Alloc(1024);
buffer.Alloc(bufAlloc);
currentLine++; currentLine++;
// Read a line // Read a line
wchar_t ch; wchar_t ch;
size_t len = 0; bool first = true;
// This doesn't work for \r-delimited files, but it's very unlikely
// that we'll run into one of those
for (ch = GetWChar(); ch != L'\n' && ch != 0; ch = GetWChar()) { for (ch = GetWChar(); ch != L'\n' && ch != 0; ch = GetWChar()) {
if (ch == L'\r') continue; if (ch == L'\r') continue;
// Skip the BOM -- we don't need it as the encoding is already known // Skip the BOM -- we don't need it as the encoding is already known
// and it sometimes causes conversion problems // and it sometimes causes conversion problems
if (ch == 0xFEFF && len == 0) continue; if (ch == 0xFEFF && first) continue;
if (len >= bufAlloc - 1) {
bufAlloc *= 2;
buffer.Alloc(bufAlloc);
}
buffer += ch; buffer += ch;
len++; first = false;
} }
if (ch == 0) if (ch == 0)
readComplete = true; readComplete = true;
@ -223,36 +198,10 @@ wxString TextFileReader::ReadLineFromFile() {
return buffer; return buffer;
} }
/// @brief DOCME
/// @return
///
bool TextFileReader::HasMoreLines() { bool TextFileReader::HasMoreLines() {
return !readComplete; return !readComplete;
} }
/// @brief DOCME
/// @param encoding
/// @return
///
void TextFileReader::EnsureValid(wxString enc) {
if (enc == _T("binary")) return;
enc = AegisubCSConv::GetRealEncodingName(enc);
iconv_t cd = iconv_open(WCHAR_T_ENCODING, enc.ToAscii());
bool canOpen = cd != (iconv_t)-1;
iconv_close(cd);
if (!canOpen) {
throw wxString::Format(_T("Character set %s is not supported."), enc.c_str());
}
}
/// @brief DOCME
///
wxString TextFileReader::GetCurrentEncoding() { wxString TextFileReader::GetCurrentEncoding() {
return encoding; return encoding;
} }

View file

@ -1,4 +1,4 @@
// Copyright (c) 2005, Rodrigo Braz Monteiro // Copyright (c) 2010, Rodrigo Braz Monteiro
// All rights reserved. // All rights reserved.
// //
// Redistribution and use in source and binary forms, with or without // Redistribution and use in source and binary forms, with or without
@ -45,62 +45,53 @@
#include <wx/string.h> #include <wx/string.h>
#endif #endif
/// DOCME
/// @class TextFileReader /// @class TextFileReader
/// @brief DOCME /// @brief A line-based text file reader
///
/// DOCME
class TextFileReader { class TextFileReader {
private: private:
/// Encoding of the file being read
/// DOCME
wxString encoding; wxString encoding;
/// DOCME
std::ifstream file; std::ifstream file;
/// DOCME
iconv_t conv; iconv_t conv;
/// DOCME
bool trim; bool trim;
/// DOCME
bool readComplete; bool readComplete;
// Iconv buffers and state
/// DOCME
wchar_t outbuf[256]; wchar_t outbuf[256];
/// DOCME
wchar_t *currout; wchar_t *currout;
/// DOCME
wchar_t *outptr; wchar_t *outptr;
/// DOCME
size_t outbytesleft; size_t outbytesleft;
/// Current line number
/// DOCME
unsigned int currentLine; unsigned int currentLine;
/// @brief Read a single wchar_t from the file
wchar_t GetWChar(); wchar_t GetWChar();
TextFileReader(const TextFileReader&); TextFileReader(const TextFileReader&);
TextFileReader& operator=(const TextFileReader&); TextFileReader& operator=(const TextFileReader&);
public: public:
TextFileReader(wxString filename,wxString encoding=_T(""), bool trim=true); /// @brief Constructor
/// @param filename File to open
/// @param encoding Encoding to use, or empty to autodetect
/// @param trim Whether to trim whitespace from lines read
TextFileReader(wxString filename,wxString encoding=L"", bool trim=true);
/// @brief Destructor
~TextFileReader(); ~TextFileReader();
/// @brief Read a line from the file
/// @return The line, possibly trimmed
wxString ReadLineFromFile(); wxString ReadLineFromFile();
/// @brief Check if there are any more lines to read
bool HasMoreLines(); bool HasMoreLines();
static void EnsureValid(const wxString encoding); /// @brief Get the file encoding used by this reader
/// @return "unknown", "binary", or a character encoding name
wxString GetCurrentEncoding(); wxString GetCurrentEncoding();
static wxString GetEncoding(const wxString filename);
/// @brief Attempt to detect a file's encoding
/// @param filename The file to check
/// @return "unknown", "binary", or a character encoding name
static wxString GetEncoding(wxString const& filename);
}; };