Aegisub/aegisub/src/text_file_reader.cpp

// Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of the Aegisub Group nor the names of its contributors
//     may be used to endorse or promote products derived from this software
//     without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Aegisub Project http://www.aegisub.org/
//
// $Id$

/// @file text_file_reader.cpp
/// @brief Read plain text files line by line
/// @ingroup utility
///

#include "config.h"

#ifndef AGI_PRE
#include <assert.h>
#include <errno.h>

#include <algorithm>
#include <fstream>
#include <string>
#endif

#include <libaegisub/log.h>

#include "charset_conv.h"
#include "charset_detect.h"
#include "text_file_reader.h"

/// @brief Open a file for reading and prepare conversion of its contents to wchar_t
/// @param filename Path of the file to open
/// @param encoding Encoding of the file; when empty it is obtained from CharSetDetect::GetEncoding
/// @param trim Whether ReadLineFromFile strips whitespace from both ends of each line
///
/// Throws a wide string literal if the file cannot be opened. If the encoding
/// is "binary" the reader is only flagged as binary and no converter is created.
TextFileReader::TextFileReader(wxString const& filename, wxString encoding, bool trim)
: isBinary(false)
, conv()
, trim(trim)
, readComplete(false)
, currout(0)
, outptr(0)
, currentLine(0)
{
	// Always read raw bytes; decoding is done by the converter, not the stream
	const std::ios::openmode mode = std::ios::in | std::ios::binary;
#ifdef __WINDOWS__
	file.open(filename.wc_str(), mode);
#else
	file.open(wxFNCONV(filename), mode);
#endif
	if (!file.is_open()) {
		throw L"Failed opening file for reading.";
	}

	// No encoding supplied by the caller, so ask the detector
	if (encoding.IsEmpty()) {
		encoding = CharSetDetect::GetEncoding(filename);
	}

	if (encoding == L"binary") {
		isBinary = true;
	}
	else {
		conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "wchar_t"));
	}
}

/// @brief Destructor
///
/// The body is intentionally empty; no explicit cleanup is performed here.
TextFileReader::~TextFileReader() {
}

/// @brief Get the next character of the file, converted to wchar_t
/// @return The next character, or 0 when no more input is available
///
/// Input is read and converted one block at a time into outbuf; subsequent
/// calls hand out the already converted characters until the block is used up.
/// Throws a wxString naming the current line if the input cannot be converted.
///
/// NOTE(review): conv is only created by the constructor for non-binary files,
/// so this presumably must not be called on a reader flagged as binary -- confirm
/// against callers.
wchar_t TextFileReader::GetWChar() {
	// If there's already some converted characters waiting, return the next one.
	// currout points at the character returned by the previous call and is null
	// until the first block has been converted. It is only advanced while it
	// stays inside the converted data, so no arithmetic is done on a null
	// pointer and it never runs past outptr on repeated calls at end of file.
	if (currout && currout + 1 < outptr) {
		return *++currout;
	}

	if (file.eof()) return 0;

	// Otherwise convert another block. The last four bytes of inbuf are kept in
	// reserve so that a multibyte character split by the block edge can be
	// completed below.
	char    inbuf[64];
	char    *inptr = inbuf;
	size_t  inbytesleft = sizeof(inbuf) - 4;
	int     bytesAdded = 0;
	std::fill(inbuf, inbuf + sizeof(inbuf), '\0');

	outptr       = outbuf;
	outbytesleft = sizeof(outbuf);
	currout      = outbuf;

	file.read(inbuf, inbytesleft);
	inbytesleft = file.gcount();
	// Nothing left to read (this is also the empty file case)
	if (inbytesleft == 0)
		return 0;

	do {
		// Without this const_cast the wrong overload is chosen
		size_t ret = conv->Convert(const_cast<const char**>(&inptr), &inbytesleft, reinterpret_cast<char **>(&outptr), &outbytesleft);
		if (ret != static_cast<size_t>(-1)) break;

		// If a single input block does not fit into outbuf the environment is so
		// bizarre that doing anything is probably futile
		assert(errno != E2BIG);

		// (Hopefully) the edge of the buffer happened to split a multibyte character, so keep
		// adding one byte to the input buffer until either it succeeds or we add enough bytes to
		// complete any character
		if (++bytesAdded > 3)
			throw wxString::Format(L"Invalid input character found near line %u", currentLine);

		// inptr has been advanced past everything converted so far, so the
		// unconverted tail ends at inptr + inbytesleft; append the new byte there
		file.read(inptr + inbytesleft, 1);
		inbytesleft++;
	} while (!file.eof() && file.gcount());

	// Hand out the first converted character if the block produced any output
	if (outptr > outbuf)
		return *currout;

	throw wxString::Format(L"Invalid input character found near line %u", currentLine);
}

/// @brief Read the next line of the file
/// @return The line without its line terminator, trimmed on both ends if trimming was requested
///
/// Marks the reader as complete when GetWChar reports the end of the input.
wxString TextFileReader::ReadLineFromFile() {
	wxString line;
	line.Alloc(1024);

	++currentLine;

	// Lines are split on \n only and every \r is dropped, so files that use a
	// bare \r as the line separator are not split correctly. Such files are
	// expected to be very rare.
	bool atLineStart = true;
	wchar_t c;
	while ((c = GetWChar()) != 0 && c != L'\n') {
		// A BOM in front of the first kept character carries no information, as
		// the encoding is already known, and it sometimes causes conversion
		// problems, so it is dropped along with carriage returns
		const bool isLeadingBom = atLineStart && c == 0xFEFF;
		if (c != L'\r' && !isLeadingBom) {
			line += c;
			atLineStart = false;
		}
	}

	// A zero from GetWChar means there is nothing more to read
	if (c == 0) {
		readComplete = true;
	}

	if (trim) {
		line.Trim(true).Trim(false);
	}
	return line;
}

/// @brief Check whether there may be more lines to read
/// @return false once ReadLineFromFile has seen the end of the input, true otherwise
bool TextFileReader::HasMoreLines() {
	const bool finished = readComplete;
	return !finished;
}
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`// Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`// All rights reserved.`
			`//`
			`// Redistribution and use in source and binary forms, with or without`
			`// modification, are permitted provided that the following conditions are met:`
			`//`
			`// * Redistributions of source code must retain the above copyright notice,`
			`// this list of conditions and the following disclaimer.`
			`// * Redistributions in binary form must reproduce the above copyright notice,`
			`// this list of conditions and the following disclaimer in the documentation`
			`// and/or other materials provided with the distribution.`
			`// * Neither the name of the Aegisub Group nor the names of its contributors`
			`// may be used to endorse or promote products derived from this software`
			`// without specific prior written permission.`
			`//`
			`// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`// POSSIBILITY OF SUCH DAMAGE.`
			`//`
Switch all headers to using Doxygen and cleanup contact info * Swap old email + website address with 'Aegisub Project http://www.aegisub.org/' * Set categories for all files (jfs) * Add descriptions for each file (jfs) * Add $Id$ keyword Originally committed to SVN as r3310. 2009-07-29 07:43:02 +02:00			`// Aegisub Project http://www.aegisub.org/`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`//`
Switch all headers to using Doxygen and cleanup contact info * Swap old email + website address with 'Aegisub Project http://www.aegisub.org/' * Set categories for all files (jfs) * Add descriptions for each file (jfs) * Add $Id$ keyword Originally committed to SVN as r3310. 2009-07-29 07:43:02 +02:00			`// $Id$`

			`/// @file text_file_reader.cpp`
			`/// @brief Read plain text files line by line`
			`/// @ingroup utility`
			`///`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Change how configuration works. * move win32/config0.h -> config/config_windows0.h * move win32/stdint.h -> msvc/stdint.h * move posix/defines.h -> config/config_unix.h * add config.h - brings in config_(windows\|unix).h as required * add config.h to .cpp Self-contain FFmpegSource2 with required function defines wrapped with __UNIX__ instead of the now removed defines.h. * Edit aegisub_vs2008.vcproj to remove explicit inclusion of config.h, also change VCPreBuildEventTool to create windows_config.h from windows_config0.h. Add msvc to include path. * Change configure to create ./acconf.h instead of posix/acconf.h This will allow us to create a more standard and platform agnostic way of configuration aegsisub during build time. Originally committed to SVN as r2621. 2009-01-04 07:31:48 +01:00			`#include "config.h"`

Fix all the headers in .cpp, this includes: Wrapping all headers that are in agi_pre.h with AGI_PRE. * Sorting alphabetically. Originally committed to SVN as r3515. 2009-09-10 15:06:40 +02:00			`#ifndef AGI_PRE`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`#include <assert.h>`
A few minor cleanups to the new charset conversion code. Originally committed to SVN as r3159. 2009-07-18 02:58:13 +02:00			`#include <errno.h>`

Fix all the headers in .cpp, this includes: Wrapping all headers that are in agi_pre.h with AGI_PRE. * Sorting alphabetically. Originally committed to SVN as r3515. 2009-09-10 15:06:40 +02:00			`#include <algorithm>`
			`#include <fstream>`
			`#include <string>`
			`#endif`
Added shb's incomplete perl support code (doesn't build on VC++ yet) Originally committed to SVN as r1741. 2008-01-16 19:29:29 +01:00
Convert a bunch of wxLog* to the new logging method. Originally committed to SVN as r4399. 2010-06-01 10:21:30 +02:00			`#include <libaegisub/log.h>`

Fix all the headers in .cpp, this includes: Wrapping all headers that are in agi_pre.h with AGI_PRE. * Sorting alphabetically. Originally committed to SVN as r3515. 2009-09-10 15:06:40 +02:00			`#include "charset_conv.h"`
Instead of falling back to your local charset, Aegisub will now use the "universalchardet" library (the one used by the Mozilla project) to autodetect the character set of non-Unicode files. Originally committed to SVN as r1028. 2007-04-08 08:01:41 +02:00			`#include "charset_detect.h"`
Fix all the headers in .cpp, this includes: Wrapping all headers that are in agi_pre.h with AGI_PRE. * Sorting alphabetically. Originally committed to SVN as r3515. 2009-09-10 15:06:40 +02:00			`#include "text_file_reader.h"`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Move most character set conversion code to libaegisub and make everything use the new conversion functionality. Originally committed to SVN as r4423. 2010-06-03 22:32:25 +02:00			`TextFileReader::TextFileReader(wxString const& filename, wxString encoding, bool trim)`
			`: isBinary(false)`
			`, conv()`
			`, trim(trim)`
			`, readComplete(false)`
			`, currout(0)`
			`, outptr(0)`
			`, currentLine(0)`
			`{`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`#ifdef __WINDOWS__`
			`file.open(filename.wc_str(),std::ios::in \| std::ios::binary);`
			`#else`
			`file.open(wxFNCONV(filename),std::ios::in \| std::ios::binary);`
			`#endif`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`if (!file.is_open()) throw L"Failed opening file for reading.";`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Simplify charset detection Originally committed to SVN as r4419. 2010-06-03 22:31:43 +02:00			`if (encoding.IsEmpty()) encoding = CharSetDetect::GetEncoding(filename);`
Move most character set conversion code to libaegisub and make everything use the new conversion functionality. Originally committed to SVN as r4423. 2010-06-03 22:32:25 +02:00			`if (encoding == L"binary") {`
			`isBinary = true;`
			`return;`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`}`
Move most character set conversion code to libaegisub and make everything use the new conversion functionality. Originally committed to SVN as r4423. 2010-06-03 22:32:25 +02:00			`conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "wchar_t"));`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`}`

			`TextFileReader::~TextFileReader() {`
			`}`

Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`wchar_t TextFileReader::GetWChar() {`
			`// If there's already some converted characters waiting, return the next one`
			`if (++currout < outptr) {`
			`return *currout;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`}`

Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`if (file.eof()) return 0;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`// Otherwise convert another block`
			`char inbuf[64];`
			`char *inptr = inbuf;`
			`size_t inbytesleft = sizeof(inbuf) - 4;`
			`int bytesAdded = 0;`
			`memset(inbuf, 0, inbytesleft);`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`outptr = outbuf;`
			`outbytesleft = sizeof(outbuf);`
			`currout = outbuf;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`file.read(inbuf, inbytesleft);`
			`inbytesleft = file.gcount();`
Fix a regression introduced by Plorkyeran's iconv patch that caused the text file reader to throw an exception when trying to read an empty file. Originally committed to SVN as r3146. 2009-07-16 17:10:40 +02:00			`if (inbytesleft == 0)`
			`return 0;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`do {`
Move most character set conversion code to libaegisub and make everything use the new conversion functionality. Originally committed to SVN as r4423. 2010-06-03 22:32:25 +02:00			`// Without this const_cast the wrong overload is chosen`
			`size_t ret = conv->Convert(const_cast<const char>(&inptr), &inbytesleft, reinterpret_cast<char >(&outptr), &outbytesleft);`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`if (ret != (size_t)-1) break;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`int err = errno;`
			`// If 64 chars do not fit into 256 wchar_ts the environment is so bizzare that doing`
			`// anything is probably futile`
			`assert(err != E2BIG);`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`// (Hopefully) the edge of the buffer happened to split a multibyte character, so keep`
			`// adding one byte to the input buffer until either it succeeds or we add enough bytes to`
			`// complete any character`
			`if (++bytesAdded > 3)`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`throw wxString::Format(L"Invalid input character found near line %u", currentLine);`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`file.read(inptr + inbytesleft, 1);`
			`inbytesleft++;`
A few minor cleanups to the new charset conversion code. Originally committed to SVN as r3159. 2009-07-18 02:58:13 +02:00			`} while (!file.eof() && file.gcount());`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`if (outptr > outbuf)`
			`return *currout;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`throw wxString::Format(L"Invalid input character found near line %u", currentLine);`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`}`

Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`wxString TextFileReader::ReadLineFromFile() {`
			`wxString buffer;`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`buffer.Alloc(1024);`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00
			`currentLine++;`
			`// Read a line`
			`wchar_t ch;`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`bool first = true;`
			`// This doesn't work for \r deliminated files, but it's very unlikely`
			`// that we'll run into one of those`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`for (ch = GetWChar(); ch != L'\n' && ch != 0; ch = GetWChar()) {`
			`if (ch == L'\r') continue;`
A few minor cleanups to the new charset conversion code. Originally committed to SVN as r3159. 2009-07-18 02:58:13 +02:00			`// Skip the BOM -- we don't need it as the encoding is already known`
			`// and it sometimes causes conversion problems`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`if (ch == 0xFEFF && first) continue;`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00
			`buffer += ch;`
Add some documentation for AegisubCSConv and TextFileReader. Originally committed to SVN as r4036. 2010-01-24 19:56:51 +01:00			`first = false;`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`}`
			`if (ch == 0)`
			`readComplete = true;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`if (trim) {`
			`buffer.Trim(true);`
			`buffer.Trim(false);`
			`}`
			`return buffer;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`}`

			`bool TextFileReader::HasMoreLines() {`
Replaced most wx-based charset conversions with a custom iconv-based conversion. Closes #639, #666, #837, #849 and #877. Originally committed to SVN as r3137. 2009-07-14 23:28:49 +02:00			`return !readComplete;`
Originally committed to SVN as r2. 2006-01-16 22:02:54 +01:00			`}`