// Copyright (c) 2010, Rodrigo Braz Monteiro, Thomas Goyne
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the Aegisub Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Aegisub Project http://www.aegisub.org/
//
// $Id$

/// @file text_file_reader.cpp
/// @brief Read plain text files line by line
/// @ingroup utility
///
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-01-04 07:31:48 +01:00
|
|
|
#include "config.h"
|
|
|
|
|
2009-09-10 15:06:40 +02:00
|
|
|
#ifndef AGI_PRE
|
2009-07-14 23:28:49 +02:00
|
|
|
#include <assert.h>
|
2009-07-18 02:58:13 +02:00
|
|
|
#include <errno.h>
|
|
|
|
|
2009-09-10 15:06:40 +02:00
|
|
|
#include <algorithm>
|
|
|
|
#include <fstream>
|
|
|
|
#include <string>
|
|
|
|
#endif
|
2008-01-16 19:29:29 +01:00
|
|
|
|
2010-06-01 10:21:30 +02:00
|
|
|
#include <libaegisub/log.h>
|
|
|
|
|
2009-09-10 15:06:40 +02:00
|
|
|
#include "charset_conv.h"
|
2007-04-08 08:01:41 +02:00
|
|
|
#include "charset_detect.h"
|
2009-09-10 15:06:40 +02:00
|
|
|
#include "text_file_reader.h"
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2010-06-03 22:32:25 +02:00
|
|
|
/// @brief Open a file for reading and set up character set conversion
/// @param filename Path of the file to open
/// @param encoding Character set of the file. When empty, it is obtained from
///                 CharSetDetect::GetEncoding(). The value L"binary" flags the
///                 reader as binary and no converter is created.
/// @param trim     Stored for ReadLineFromFile(), which trims both ends of
///                 each returned line when this is set
///
/// Throws a wide string literal (const wchar_t *) if the file cannot be opened.
TextFileReader::TextFileReader(wxString const& filename, wxString encoding, bool trim)
: isBinary(false)
, conv()
, trim(trim)
, readComplete(false)
, currout(0)
, outptr(0)
, currentLine(0)
{
	// The stream is always opened in binary mode; only the form of the
	// filename handed to open() differs between platforms.
	const std::ios::openmode mode = std::ios::in | std::ios::binary;
#ifdef __WINDOWS__
	file.open(filename.wc_str(), mode);
#else
	file.open(wxFNCONV(filename), mode);
#endif
	if (!file.is_open())
		throw L"Failed opening file for reading.";

	if (encoding.IsEmpty())
		encoding = CharSetDetect::GetEncoding(filename);

	// Binary files get no converter; everything else is converted to wchar_t
	isBinary = (encoding == L"binary");
	if (!isBinary)
		conv.reset(new agi::charset::IconvWrapper(encoding.c_str(), "wchar_t"));
}
|
|
|
|
|
|
|
|
/// @brief Destructor
///
/// The body is intentionally empty; nothing is released explicitly here.
TextFileReader::~TextFileReader() {}
|
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
wchar_t TextFileReader::GetWChar() {
|
|
|
|
// If there's already some converted characters waiting, return the next one
|
|
|
|
if (++currout < outptr) {
|
|
|
|
return *currout;
|
2006-01-16 22:02:54 +01:00
|
|
|
}
|
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
if (file.eof()) return 0;
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
// Otherwise convert another block
|
|
|
|
char inbuf[64];
|
|
|
|
char *inptr = inbuf;
|
|
|
|
size_t inbytesleft = sizeof(inbuf) - 4;
|
|
|
|
int bytesAdded = 0;
|
|
|
|
memset(inbuf, 0, inbytesleft);
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
outptr = outbuf;
|
|
|
|
outbytesleft = sizeof(outbuf);
|
|
|
|
currout = outbuf;
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
file.read(inbuf, inbytesleft);
|
|
|
|
inbytesleft = file.gcount();
|
2009-07-16 17:10:40 +02:00
|
|
|
if (inbytesleft == 0)
|
|
|
|
return 0;
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
do {
|
2010-06-03 22:32:25 +02:00
|
|
|
// Without this const_cast the wrong overload is chosen
|
|
|
|
size_t ret = conv->Convert(const_cast<const char**>(&inptr), &inbytesleft, reinterpret_cast<char **>(&outptr), &outbytesleft);
|
2009-07-14 23:28:49 +02:00
|
|
|
if (ret != (size_t)-1) break;
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
int err = errno;
|
|
|
|
// If 64 chars do not fit into 256 wchar_ts the environment is so bizzare that doing
|
|
|
|
// anything is probably futile
|
|
|
|
assert(err != E2BIG);
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
// (Hopefully) the edge of the buffer happened to split a multibyte character, so keep
|
|
|
|
// adding one byte to the input buffer until either it succeeds or we add enough bytes to
|
|
|
|
// complete any character
|
|
|
|
if (++bytesAdded > 3)
|
2010-01-24 19:56:51 +01:00
|
|
|
throw wxString::Format(L"Invalid input character found near line %u", currentLine);
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
file.read(inptr + inbytesleft, 1);
|
|
|
|
inbytesleft++;
|
2009-07-18 02:58:13 +02:00
|
|
|
} while (!file.eof() && file.gcount());
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
if (outptr > outbuf)
|
|
|
|
return *currout;
|
2006-01-16 22:02:54 +01:00
|
|
|
|
2010-01-24 19:56:51 +01:00
|
|
|
throw wxString::Format(L"Invalid input character found near line %u", currentLine);
|
2006-01-16 22:02:54 +01:00
|
|
|
}
|
|
|
|
|
2009-07-14 23:28:49 +02:00
|
|
|
/// @brief Read the next line of the file
/// @return The line without its terminator; whitespace is trimmed from both
///         ends when the reader was constructed with trim set
///
/// Sets readComplete once GetWChar() returns 0, which is what
/// HasMoreLines() reports on.
wxString TextFileReader::ReadLineFromFile() {
	wxString line;
	line.Alloc(1024);

	// Advance the counter before reading, as GetWChar() reports it in errors
	currentLine++;

	// A byte order mark is only skipped while nothing has been stored yet
	bool atLineStart = true;

	// Lines are split on '\n' only and any '\r' is simply dropped. This does
	// not work for '\r'-delimited files, but it's very unlikely that we'll run
	// into one of those.
	wchar_t ch = GetWChar();
	while (ch != L'\n' && ch != 0) {
		// Skip the BOM -- we don't need it as the encoding is already known
		// and it sometimes causes conversion problems
		const bool leadingBom = (ch == 0xFEFF && atLineStart);
		if (ch != L'\r' && !leadingBom) {
			line += ch;
			atLineStart = false;
		}
		ch = GetWChar();
	}

	// A 0 from GetWChar() means there is nothing more to read
	if (ch == 0)
		readComplete = true;

	if (trim)
		line.Trim(true).Trim(false);

	return line;
}
|
|
|
|
|
|
|
|
bool TextFileReader::HasMoreLines() {
|
2009-07-14 23:28:49 +02:00
|
|
|
return !readComplete;
|
2006-01-16 22:02:54 +01:00
|
|
|
}
|