diff --git a/aegisub/build/tests_vs2008/tests_vs2008.vcproj b/aegisub/build/tests_vs2008/tests_vs2008.vcproj index 34ad939ed..539537a6a 100644 --- a/aegisub/build/tests_vs2008/tests_vs2008.vcproj +++ b/aegisub/build/tests_vs2008/tests_vs2008.vcproj @@ -232,6 +232,10 @@ RelativePath="..\..\tests\libaegisub_iconv.cpp" > + + diff --git a/aegisub/libaegisub/include/libaegisub/charset_conv.h b/aegisub/libaegisub/include/libaegisub/charset_conv.h index 2c1c17e9e..53392cfa6 100644 --- a/aegisub/libaegisub/include/libaegisub/charset_conv.h +++ b/aegisub/libaegisub/include/libaegisub/charset_conv.h @@ -18,6 +18,8 @@ /// @brief Wrapper for libiconv to present a more C++-friendly API /// @ingroup libaegisub +#pragma once + #ifndef LAGI_PRE #include #include diff --git a/aegisub/libaegisub/include/libaegisub/line_iterator.h b/aegisub/libaegisub/include/libaegisub/line_iterator.h new file mode 100644 index 000000000..cf0c5b2a4 --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/line_iterator.h @@ -0,0 +1,218 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file line_iterator.h +/// @brief An iterator over lines in a stream +/// @ingroup libaegisub + +#pragma once + +#if !defined(AGI_PRE) && !defined(LAGI_PRE) +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include + +#include +#endif + +#include + +namespace agi { + +/// @class line_iterator +/// @brief An iterator over lines in a stream +template +class line_iterator : public std::iterator { + std::istream *stream; ///< Stream to iterator over + bool valid; ///< Are there any more values to read? + OutputType value; ///< Value to return when this is dereference + std::string encoding; ///< Encoding of source stream + std::tr1::shared_ptr conv; + int cr; ///< CR character in the source encoding + int lf; ///< LF character in the source encoding + int width; ///< width of LF character in the source encoding + + /// @brief Convert a string to the output type + /// @param str Line read from the file + /// + /// line_iterator users can either ensure that operator>> is defined for + /// their desired output type or simply provide a specialization of this + /// method which does the conversion. + inline bool convert(std::string &str); + /// Called after construction for specializations that need to do things + void init() { }; + /// @brief Get the next line from the stream + /// @param[out] str String to fill with the next line + void getline(std::string &str); + + /// @brief Get the next value from the stream + void next(); +public: + /// @brief Constructor + /// @param stream The stream to read from. The calling code is responsible + /// for ensuring that the stream remains valid for the + /// lifetime of the iterator and that it get cleaned up. + /// @param encoding Encoding of the text read from the stream + line_iterator(std::istream &stream, std::string encoding = "utf-8") + : stream(&stream) + , valid(true) + , encoding(encoding) + , cr(0) + , lf(0) + , width(0) + { + agi::charset::IconvWrapper c("utf-8", encoding.c_str()); + c.Convert("\r", 1, reinterpret_cast(&cr), sizeof(int)); + c.Convert("\n", 1, reinterpret_cast(&lf), sizeof(int)); + width = c.RequiredBufferSize("\n"); + + if (encoding != "utf-8") { + conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "utf-8")); + } + init(); + ++(*this); + } + /// @brief Invalid iterator constructor; use for end iterator + line_iterator() + : stream(0) + , valid(false) + { + } + /// @brief Copy constructor + /// @param that line_iterator to copy from + line_iterator(line_iterator const& that) + : stream(that.stream) + , valid(that.valid) + , value(that.value) + , encoding(that.encoding) + , conv(that.conv) + , cr(that.cr) + , lf(that.lf) + , width(that.width) + { + } + OutputType operator*() const { + return value; + } + line_iterator& operator++() { + next(); + return *this; + } + line_iterator operator++(int) { + line_iterator tmp(*this); + ++(*this); + return tmp; + } + bool operator==(line_iterator const& rgt) const { + return valid == rgt.valid; + } + bool operator!=(line_iterator const& rgt) const { + return !operator==(rgt); + } + + // typedefs needed by some stl algorithms + typedef OutputType* pointer; + typedef OutputType& reference; + typedef const OutputType* const_pointer; + typedef const OutputType& const_reference; + + line_iterator operator=(line_iterator that) { + using std::swap; + swap(*this, that); + return *this; + } + void swap(line_iterator &that) throw() { + using std::swap; + swap(stream, that.stream); + swap(valid, that.valid); + swap(value, that.value); + swap(encoding, that.encoding); + swap(conv, that.conv); + swap(lf, that.lf); + swap(cr, that.cr); + swap(width, that.width); + } +}; + +template +void line_iterator::getline(std::string &str) { + union { + int32_t chr; + char buf[4]; + }; + + for (;;) { + chr = 0; +#ifdef _WIN32 + int read = stream->rdbuf()->_Sgetn_s(buf, 4, width); +#else + int read = stream->rdbuf()->sgetn(buf, width); +#endif + if (read < width) { + for (int i = 0; i < read; i++) { + str += buf[i]; + } + stream->setstate(std::ios::eofbit); + return; + } + if (chr == cr) continue; + if (chr == lf) return; + for (int i = 0; i < read; i++) { + str += buf[i]; + } + } +} + +template +void line_iterator::next() { + if (!valid) return; + if (!stream->good()) { + valid = false; + return; + } + std::string str; + getline(str); + if (conv.get()) { + str = conv->Convert(str); + } + if (!convert(str)) { + next(); + return; + } +} + +template +inline bool line_iterator::convert(std::string &str) { + std::istringstream ss(str); + ss >> value; + return !ss.fail(); +} +template<> +inline bool line_iterator::convert(std::string &str) { + value = str; + return true; +} + +template +void swap(agi::line_iterator &lft, agi::line_iterator &rgt) { + lft.swap(rgt); +} + +} diff --git a/aegisub/libaegisub/include/libaegisub/log.h b/aegisub/libaegisub/include/libaegisub/log.h index 9703f136a..7e22747fb 100644 --- a/aegisub/libaegisub/include/libaegisub/log.h +++ b/aegisub/libaegisub/include/libaegisub/log.h @@ -24,6 +24,7 @@ #include #include +#include #ifdef __DEPRECATED // Dodge GCC warnings # undef __DEPRECATED # include diff --git a/aegisub/libaegisub/lagi_pre.h b/aegisub/libaegisub/lagi_pre.h index d010a5105..1fbcc7ebe 100644 --- a/aegisub/libaegisub/lagi_pre.h +++ b/aegisub/libaegisub/lagi_pre.h @@ -28,7 +28,11 @@ #include #include #include +#ifdef _WIN32 #include +#else +#include +#endif #include #include diff --git a/aegisub/src/agi_pre.h b/aegisub/src/agi_pre.h index 17d817d77..175b349e4 100644 --- a/aegisub/src/agi_pre.h +++ b/aegisub/src/agi_pre.h @@ -72,8 +72,13 @@ #include #include #include +#ifdef _WIN32 #include +#else +#include +#endif #include +#include #include #include #include diff --git a/aegisub/src/frame_main.cpp b/aegisub/src/frame_main.cpp index 928510c48..c2b8a77f8 100644 --- a/aegisub/src/frame_main.cpp +++ b/aegisub/src/frame_main.cpp @@ -57,7 +57,6 @@ #ifdef WITH_AVISYNTH #include "avisynth_wrap.h" #endif -#include "charset_conv.h" #include "compat.h" #include "dialog_detached_video.h" #include "dialog_search_replace.h" diff --git a/aegisub/src/frame_main_events.cpp b/aegisub/src/frame_main_events.cpp index 71cfad2fd..a8d7cdce4 100644 --- a/aegisub/src/frame_main_events.cpp +++ b/aegisub/src/frame_main_events.cpp @@ -54,7 +54,6 @@ #ifdef WITH_AUTOMATION #include "auto4_base.h" #endif -#include "charset_conv.h" #include "compat.h" #include "dialog_about.h" #include "dialog_attachments.h" diff --git a/aegisub/src/text_file_reader.cpp b/aegisub/src/text_file_reader.cpp index 039cd3993..25725858c 100644 --- a/aegisub/src/text_file_reader.cpp +++ b/aegisub/src/text_file_reader.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne +// Copyright (c) 2010, Thomas Goyne // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -43,120 +43,54 @@ #include #include #include + +#include #endif +#include #include -#include "charset_conv.h" #include "charset_detect.h" +#include "compat.h" #include "text_file_reader.h" TextFileReader::TextFileReader(wxString const& filename, wxString encoding, bool trim) -: isBinary(false) -, conv() -, trim(trim) -, readComplete(false) -, currout(0) -, outptr(0) -, currentLine(0) +: trim(trim) +, isBinary(false) { -#ifdef __WINDOWS__ - file.open(filename.wc_str(),std::ios::in | std::ios::binary); -#else - file.open(wxFNCONV(filename),std::ios::in | std::ios::binary); -#endif - if (!file.is_open()) throw L"Failed opening file for reading."; - - if (encoding.IsEmpty()) encoding = CharSetDetect::GetEncoding(filename); + if (encoding.empty()) encoding = CharSetDetect::GetEncoding(filename); if (encoding == L"binary") { isBinary = true; return; } - conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "wchar_t")); + file.reset(agi::io::Open(STD_STR(filename))); + iter = agi::line_iterator(*file, STD_STR(encoding)); } TextFileReader::~TextFileReader() { } -wchar_t TextFileReader::GetWChar() { - // If there's already some converted characters waiting, return the next one - if (++currout < outptr) { - return *currout; - } - - if (file.eof()) return 0; - - // Otherwise convert another block - char inbuf[64]; - char *inptr = inbuf; - size_t inbytesleft = sizeof(inbuf) - 4; - int bytesAdded = 0; - memset(inbuf, 0, inbytesleft); - - outptr = outbuf; - outbytesleft = sizeof(outbuf); - currout = outbuf; - - file.read(inbuf, inbytesleft); - inbytesleft = file.gcount(); - if (inbytesleft == 0) - return 0; - - do { - // Without this const_cast the wrong overload is chosen - size_t ret = conv->Convert(const_cast(&inptr), &inbytesleft, reinterpret_cast(&outptr), &outbytesleft); - if (ret != (size_t)-1) break; - - int err = errno; - // If 64 chars do not fit into 256 wchar_ts the environment is so bizzare that doing - // anything is probably futile - assert(err != E2BIG); - - // (Hopefully) the edge of the buffer happened to split a multibyte character, so keep - // adding one byte to the input buffer until either it succeeds or we add enough bytes to - // complete any character - if (++bytesAdded > 3) - throw wxString::Format(L"Invalid input character found near line %u", currentLine); - - file.read(inptr + inbytesleft, 1); - inbytesleft++; - } while (!file.eof() && file.gcount()); - - if (outptr > outbuf) - return *currout; - - throw wxString::Format(L"Invalid input character found near line %u", currentLine); -} - wxString TextFileReader::ReadLineFromFile() { - wxString buffer; - buffer.Alloc(1024); - - currentLine++; - // Read a line - wchar_t ch; - bool first = true; - // This doesn't work for \r deliminated files, but it's very unlikely - // that we'll run into one of those - for (ch = GetWChar(); ch != L'\n' && ch != 0; ch = GetWChar()) { - if (ch == L'\r') continue; - // Skip the BOM -- we don't need it as the encoding is already known - // and it sometimes causes conversion problems - if (ch == 0xFEFF && first) continue; - - buffer += ch; - first = false; - } - if (ch == 0) - readComplete = true; - - if (trim) { - buffer.Trim(true); - buffer.Trim(false); - } - return buffer; + wxString str = *iter; + ++iter; + if (trim) str.Trim(true).Trim(false); + if (str.StartsWith(L"\uFEFF")) str = str.Mid(1); + return str; } -bool TextFileReader::HasMoreLines() { - return !readComplete; +namespace agi { +#ifdef _WIN32 + template<> void line_iterator::init() { + conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "utf-16le")); + } + template<> bool line_iterator::convert(std::string &str) { + value = wxString(str.c_str(), wxMBConvUTF16LE(), str.size()); + return true; + } +#else + template<> bool line_iterator::convert(std::string &str) { + value = str; + return true; + } +#endif } diff --git a/aegisub/src/text_file_reader.h b/aegisub/src/text_file_reader.h index 1f6c9bf6a..1a48461d7 100644 --- a/aegisub/src/text_file_reader.h +++ b/aegisub/src/text_file_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2010, Rodrigo Braz Monteiro +// Copyright (c) 2010, Thomas Goyne // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -44,31 +44,15 @@ #include #endif -namespace agi { namespace charset { - class IconvWrapper; -} } +#include /// @class TextFileReader /// @brief A line-based text file reader class TextFileReader { -private: - bool isBinary; - std::ifstream file; - std::auto_ptr conv; + std::auto_ptr file; bool trim; - bool readComplete; - - // Iconv buffers and state - wchar_t outbuf[256]; - wchar_t *currout; - wchar_t *outptr; - size_t outbytesleft; - - /// Current line number - unsigned int currentLine; - - /// @brief Read a single wchar_t from the file - wchar_t GetWChar(); + bool isBinary; + agi::line_iterator iter; TextFileReader(const TextFileReader&); TextFileReader& operator=(const TextFileReader&); @@ -86,6 +70,6 @@ public: /// @return The line, possibly trimmed wxString ReadLineFromFile(); /// @brief Check if there are any more lines to read - bool HasMoreLines(); - bool IsBinary() { return isBinary; } + bool HasMoreLines() const { return iter != agi::line_iterator(); } + bool IsBinary() const { return isBinary; } }; diff --git a/aegisub/tests/Makefile.am b/aegisub/tests/Makefile.am index 48c6b65ae..f85d0d1d6 100644 --- a/aegisub/tests/Makefile.am +++ b/aegisub/tests/Makefile.am @@ -13,8 +13,9 @@ run_SOURCES = \ libaegisub_access.cpp \ libaegisub_cajun.cpp \ libaegisub_iconv.cpp \ - libaegisub_util.cpp \ - libaegisub_mru.cpp + libaegisub_line_iterator.cpp \ + libaegisub_mru.cpp \ + libaegisub_util.cpp run_SOURCES += \ *.h diff --git a/aegisub/tests/libaegisub_iconv.cpp b/aegisub/tests/libaegisub_iconv.cpp index 74b0b896e..1de3f3eb3 100644 --- a/aegisub/tests/libaegisub_iconv.cpp +++ b/aegisub/tests/libaegisub_iconv.cpp @@ -138,10 +138,15 @@ TEST(lagi_iconv, wchar_tSupport) { EXPECT_NO_THROW(IconvWrapper("UTF-8", "wchar_t")); } -TEST(lagi_iconv, pretty_names) { +TEST(lagi_iconv, Roundtrip) { std::vector names = GetEncodingsList >(); for (std::vector::iterator cur = names.begin(); cur != names.end(); ++cur) { EXPECT_NO_THROW(IconvWrapper("utf-8", cur->c_str())); EXPECT_NO_THROW(IconvWrapper(cur->c_str(), "utf-8")); + EXPECT_EQ( + "Jackdaws love my big sphinx of quartz", + IconvWrapper(cur->c_str(), "utf-8").Convert( + IconvWrapper("utf-8", cur->c_str()).Convert( + "Jackdaws love my big sphinx of quartz"))); } } diff --git a/aegisub/tests/libaegisub_line_iterator.cpp b/aegisub/tests/libaegisub_line_iterator.cpp new file mode 100644 index 000000000..900ae95df --- /dev/null +++ b/aegisub/tests/libaegisub_line_iterator.cpp @@ -0,0 +1,87 @@ +// Copyright (c) 2010, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +/// @file libaegisub_line_iterator.cpp +/// @brief agi::line_iterator tests +/// @ingroup + +#include + +#include +#include +#include +#include +#include + +#include "main.h" +#include "util.h" + +using agi::line_iterator; + +using namespace util; + +template +struct varg_type { + typedef T type; +}; +template<> struct varg_type { + typedef const char * type; +}; + +template +void expect_eq(const char (&str)[N], const char *charset, int num, ...) { + std::string string(str, N - 1); + agi::charset::IconvWrapper conv("utf-8", charset); + string = conv.Convert(string); + std::stringstream ss(string); + line_iterator iter; + EXPECT_NO_THROW(iter = line_iterator(ss, charset)); + va_list argp; + va_start(argp, num); + for (; num > 0; --num) { + EXPECT_FALSE(iter == line_iterator()); + EXPECT_EQ(*iter, va_arg(argp, typename varg_type::type)); + EXPECT_NO_THROW(++iter); + } + va_end(argp); + EXPECT_TRUE(iter == line_iterator()); +} + +TEST(lagi_line, int) { + std::vector charsets = agi::charset::GetEncodingsList >(); + for (std::vector::iterator cur = charsets.begin(); cur != charsets.end(); ++cur) { + expect_eq("1\n2\n3\n4", cur->c_str(), 4, 1, 2, 3, 4); + expect_eq("1\n2\n3\n4\n", cur->c_str(), 4, 1, 2, 3, 4); + expect_eq("1\n2\nb\n3\n4", cur->c_str(), 4, 1, 2, 3, 4); + expect_eq("1.0\n2.0\n3.0\n4.0", cur->c_str(), 4, 1, 2, 3, 4); + expect_eq(" 0x16 \n 09 \n -2", cur->c_str(), 3, 0, 9, -2); + } +} +TEST(lagi_line, double) { + std::vector charsets = agi::charset::GetEncodingsList >(); + for (std::vector::iterator cur = charsets.begin(); cur != charsets.end(); ++cur) { + expect_eq("1.0\n2.0", cur->c_str(), 2, 1.0, 2.0); + expect_eq("#1.0\n\t2.5", cur->c_str(), 1, 2.5); + } +} +TEST(lagi_line, string) { + std::vector charsets = agi::charset::GetEncodingsList >(); + for (std::vector::iterator cur = charsets.begin(); cur != charsets.end(); ++cur) { + expect_eq("line 1\nline 2\nline 3", cur->c_str(), 3, "line 1", "line 2", "line 3"); + expect_eq(" white space ", cur->c_str(), 1, " white space "); + expect_eq("blank\n\nlines\n", cur->c_str(), 4, "blank", "", "lines", ""); + } +} diff --git a/aegisub/tests/util.h b/aegisub/tests/util.h index 9d7b1625b..a23c47a6f 100644 --- a/aegisub/tests/util.h +++ b/aegisub/tests/util.h @@ -19,11 +19,28 @@ /// @ingroup util #include +#include + +#include + namespace util { void copy(const std::string from, const std::string to); void remove(const std::string& file); +template +static std::vector make_vector(int len, ...) { + std::vector vec(len); + + va_list argp; + va_start(argp, len); + for (int i = 0; i < len; i++) { + vec[i] = va_arg(argp, T); + } + va_end(argp); + return vec; +} + } // namespace util