// Copyright (c) 2005, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of the Aegisub Group nor the names of its contributors
//     may be used to endorse or promote products derived from this software
//     without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// -----------------------------------------------------------------------------
//
// AEGISUB
//
// Website: http://aegisub.cellosoft.com
// Contact: mailto:zeratul@cellosoft.com
//
|
|
|
|
|
|
|
|
|
|
|
|
///////////
|
|
|
|
// Headers
|
|
|
|
#include <algorithm>
|
|
|
|
#include <string>
|
|
|
|
#include <wx/wfstream.h>
|
|
|
|
#include "text_file_reader.h"
|
2008-03-18 04:44:00 +01:00
|
|
|
using namespace Athenasub;
|
2008-03-08 08:52:20 +01:00
|
|
|
|
|
|
|
#ifdef WITH_UNIVCHARDET
|
|
|
|
#include "charset_detect.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
///////////////
|
|
|
|
// Constructor
|
2008-11-13 03:13:48 +01:00
|
|
|
// Builds a reader on top of an already-open input stream.
//   stream   - stream the raw data is read from; bound to the `file` member
//   enc      - name of the encoding; "binary" skips all decoder setup
//   _trim    - stored in `trim` and consulted when a line is returned
//   prefetch - currently has no effect: `threaded` is forced to false below
TextFileReader::TextFileReader(wxInputStream &stream,String enc,bool _trim,bool prefetch)
: file(stream)
{
	// Setup
	trim = _trim;
	threaded = prefetch && false;	// prefetch thread is deliberately disabled
	thread = NULL;

	// Give the decoding flags defined values up front. The "binary" early
	// return below skips SetEncodingConfiguration(), which is the only other
	// place that assigns them, yet ActuallyReadLine() branches on Is16.
	swap = false;
	Is16 = false;
	isUtf8 = false;

	// Set encoding
	encoding = enc.GetWxString();
	if (encoding == _T("binary")) return;
	SetEncodingConfiguration();
}
|
|
|
|
|
|
|
|
|
|
|
|
//////////////
|
|
|
|
// Destructor
|
2008-03-14 09:00:55 +01:00
|
|
|
// Takes the reader's mutex and, if a prefetch thread object exists, calls
// Delete() on it.
// NOTE(review): PrefetchThread::Entry() locks the same mutex; confirm that
// wxThread::Delete() cannot end up waiting on the worker while the lock is
// held here.
TextFileReader::~TextFileReader()
{
	wxCriticalSectionLocker locker(mutex);
	if (thread != NULL) {
		thread->Delete();
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////
|
|
|
|
// Set encoding configuration
|
2008-03-14 09:00:55 +01:00
|
|
|
void TextFileReader::SetEncodingConfiguration()
|
|
|
|
{
|
2008-03-08 08:52:20 +01:00
|
|
|
// Set encoding configuration
|
|
|
|
swap = false;
|
|
|
|
Is16 = false;
|
2008-11-13 03:23:46 +01:00
|
|
|
isUtf8 = false;
|
2008-03-13 19:06:13 +01:00
|
|
|
//conv = shared_ptr<wxMBConv>();
|
2008-03-08 08:52:20 +01:00
|
|
|
if (encoding == _T("UTF-8")) {
|
2008-03-13 04:42:27 +01:00
|
|
|
conv = shared_ptr<wxMBConv> (new wxMBConvUTF8);
|
2008-11-13 03:23:46 +01:00
|
|
|
isUtf8 = true;
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
|
|
|
else if (encoding == _T("UTF-16LE")) {
|
|
|
|
Is16 = true;
|
|
|
|
}
|
|
|
|
else if (encoding == _T("UTF-16BE")) {
|
|
|
|
Is16 = true;
|
|
|
|
swap = true;
|
|
|
|
}
|
|
|
|
else if (encoding == _T("UTF-7")) {
|
2008-03-13 04:42:27 +01:00
|
|
|
conv = shared_ptr<wxMBConv>(new wxCSConv(encoding));
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
|
|
|
else if (encoding == _T("Local")) {
|
2008-03-13 04:42:27 +01:00
|
|
|
conv = shared_ptr<wxMBConv> (wxConvCurrent,NullDeleter());
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
|
|
|
else {
|
2008-03-13 04:42:27 +01:00
|
|
|
conv = shared_ptr<wxMBConv> (new wxCSConv(encoding));
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
2008-03-15 09:36:52 +01:00
|
|
|
|
|
|
|
// Allocate buffer
|
|
|
|
if (!Is16) buffer1.Alloc(4096);
|
|
|
|
else buffer2.Alloc(4096);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////
|
|
|
|
// Helper functions
|
2008-11-13 03:23:46 +01:00
|
|
|
// Turns a null-terminated narrow buffer into a String. When isUtf8 is set the
// bytes are handed to String directly; otherwise they are decoded through
// `conv` by way of wxString.
String GetString(char *read,shared_ptr<wxMBConv> conv,bool isUtf8)
{
	if (!isUtf8) return wxString(read,*conv);
	return String(read);
}
|
|
|
|
// Wide-character overload: builds the String from a wxString made of the
// buffer. The converter and the UTF-8 flag are accepted only so that both
// overloads can be called the same way; neither is used.
String GetString(wchar_t *read,shared_ptr<wxMBConv> /*conv*/,bool /*isUtf8*/)
{
	return wxString(read);
}
|
2008-03-15 09:36:52 +01:00
|
|
|
// Exchanges the first two bytes of the object representation of `a`.
// Any further bytes of a wider wchar_t are left untouched.
inline void Swap(wchar_t &a) {
	char *bytes = (char*) &a;
	std::swap(bytes[0],bytes[1]);
}
|
|
|
|
// Single-byte overload: there is nothing to exchange, so this does nothing.
inline void Swap(char &) {}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////
|
|
|
|
// Parse a line
|
|
|
|
template <typename T>
|
2008-11-13 03:23:46 +01:00
|
|
|
void ParseLine(FastBuffer<T> &_buffer,wxInputStream &file,String &stringBuffer,shared_ptr<wxMBConv> conv,bool swap,bool isUtf8)
|
2008-03-15 09:36:52 +01:00
|
|
|
{
|
|
|
|
// Look for a new line
|
|
|
|
int newLinePos = -1;
|
|
|
|
T newLineChar = 0;
|
|
|
|
size_t size = _buffer.GetSize();
|
|
|
|
|
|
|
|
// Find first line break
|
|
|
|
if (size) _buffer.FindLineBreak(0,size,newLinePos,newLineChar);
|
|
|
|
|
|
|
|
// If no line breaks were found, load more data into file
|
|
|
|
while (newLinePos == -1) {
|
|
|
|
// Read 2048 bytes
|
2008-03-17 03:10:10 +01:00
|
|
|
const size_t readBytes = 1024;
|
2008-03-15 09:36:52 +01:00
|
|
|
const size_t read = readBytes/sizeof(T);
|
|
|
|
size_t oldSize = _buffer.GetSize();
|
|
|
|
T *ptr = _buffer.GetWritePtr(read);
|
|
|
|
file.Read(ptr,readBytes);
|
|
|
|
size_t lastRead = file.LastRead()/sizeof(T);
|
|
|
|
_buffer.AssumeSize(_buffer.GetSize()+lastRead-read);
|
|
|
|
|
|
|
|
// Swap
|
|
|
|
if (swap) {
|
|
|
|
T* ptr2 = ptr;
|
|
|
|
for (size_t i=0;i<lastRead;i++) {
|
|
|
|
Swap(*ptr2++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find line break
|
|
|
|
_buffer.FindLineBreak(oldSize,lastRead+oldSize,newLinePos,newLineChar);
|
|
|
|
|
|
|
|
// End of file, force a line break
|
|
|
|
if (file.Eof() && newLinePos == -1) newLinePos = (int) _buffer.GetSize();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Found newline
|
|
|
|
if (newLinePos != -1) {
|
|
|
|
T *read = _buffer.GetMutableReadPtr();
|
|
|
|
// Replace newline with null character and convert to proper charset
|
|
|
|
if (newLinePos) {
|
|
|
|
read[newLinePos] = 0;
|
2008-11-13 03:23:46 +01:00
|
|
|
stringBuffer = GetString(read,conv,isUtf8);
|
2008-03-15 09:36:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Remove an extra character if the new is the complement of \n,\r (13^7=10, 10^7=13)
|
|
|
|
if (read[newLinePos+1] == (newLineChar ^ 7)) newLinePos++;
|
|
|
|
_buffer.ShiftLeft(newLinePos+1);
|
|
|
|
}
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////
|
|
|
|
// Reads a line from file
|
2008-03-18 04:44:00 +01:00
|
|
|
// Reads and decodes the next line straight from the stream, bypassing the
// prefetch cache. The result is trimmed when `trim` is set and has a leading
// byte-order mark skipped when one is detected.
Athenasub::String TextFileReader::ActuallyReadLine()
{
	const size_t initialCapacity = 1024;
	String stringBuffer;
	stringBuffer.reserve(initialCapacity);

	// UTF-16 goes through the wide buffer, everything else through the narrow one
	if (Is16) ParseLine<wchar_t>(buffer2,file,stringBuffer,conv,swap,false);
	else ParseLine<char>(buffer1,file,stringBuffer,conv,false,isUtf8);

	// Remove BOM
	// NOTE(review): the test looks at one element (0xFEFF) but the skip is 3
	// units - confirm this matches how String indexes its contents.
	size_t startPos = 0;
	if (stringBuffer.Length() > 0 && stringBuffer[0] == 0xFEFF) startPos = 3;

	// Trim
	if (trim) return String(String::StringTrim(stringBuffer,startPos));
	if (startPos) return String(stringBuffer.c_str() + startPos);
	return stringBuffer;
}
|
|
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////
|
|
|
|
// Checks if there's more to read
|
2008-03-14 09:00:55 +01:00
|
|
|
bool TextFileReader::HasMoreLines()
|
|
|
|
{
|
2008-03-15 01:29:17 +01:00
|
|
|
if (cache.size()) return true;
|
2008-03-14 09:00:55 +01:00
|
|
|
wxCriticalSectionLocker locker(mutex);
|
2008-03-15 09:36:52 +01:00
|
|
|
return (!file.Eof() || buffer1.GetSize() || buffer2.GetSize());
|
2008-03-08 08:52:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
|
|
// Ensure that charset is valid
|
2008-03-18 04:44:00 +01:00
|
|
|
// Rejects character sets this reader cannot handle.
//   enc - name of the character set to check
// Throws a pointer to a C string message when enc is "unknown", "UTF-32BE"
// or "UTF-32LE"; returns normally for every other name.
void TextFileReader::EnsureValid(Athenasub::String enc)
{
	if (enc == "unknown" || enc == "UTF-32BE" || enc == "UTF-32LE") {
		// The thrown value is only a pointer into this string, so the string
		// has to outlive stack unwinding. A plain local would be destroyed
		// before any handler runs, leaving the handler with a dangling
		// pointer. Static storage keeps the message valid until the next
		// rejected call (which also means concurrent failures share it).
		static String error;
		error = String("Character set ");
		error += enc;
		error += " is not supported.";
		throw error.c_str();
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
///////////////////////////
|
|
|
|
// Get encoding being used
|
2008-03-14 09:00:55 +01:00
|
|
|
// Returns the encoding name stored by the constructor, converted to a String.
String TextFileReader::GetCurrentEncoding()
{
	String current = encoding.c_str();
	return current;
}
|
2008-03-14 09:00:55 +01:00
|
|
|
|
|
|
|
|
|
|
|
///////////////////////////
|
|
|
|
// Reads a line from cache
|
|
|
|
// Returns the next line. Without prefetching this reads straight from the
// stream; with prefetching the line is served from the cache, which is
// topped up here when empty and by a PrefetchThread when it runs low.
String TextFileReader::ReadLineFromFile()
{
	// Not threaded, just read it
	if (!threaded) return ActuallyReadLine();

	// Make sure the cache holds at least one line
	{
		wxCriticalSectionLocker fillLock(mutex);
		if (!cache.size()) {
			cache.push_back(ActuallyReadLine());
		}
	}

	String line;
	{
		// Retrieve from cache
		wxCriticalSectionLocker takeLock(mutex);
		if (cache.size()) {
			line = cache.front();
			cache.pop_front();
		}

		// Start a thread to prefetch more when fewer than 3 lines remain and
		// no thread is currently registered
		const bool needPrefetch = (cache.size() < 3) && (thread == NULL);
		if (needPrefetch) {
			thread = new PrefetchThread(this);
			thread->Create();
			thread->Run();
		}
	}

	return line;
}
|
|
|
|
|
|
|
|
|
|
|
|
////////////////
|
|
|
|
// Thread entry
|
|
|
|
// Thread body: keeps the parent's cache stocked with up to 6 lines, polling
// every 50 ms, until the stream reaches its end or the thread is asked to
// stop. Clears the parent's `thread` pointer on the way out.
wxThread::ExitCode PrefetchThread::Entry()
{
	bool moreToRead = true;
	do {
		// Asked to terminate: unregister and leave right away.
		// This path does not take the parent's mutex.
		if (TestDestroy()) {
			parent->thread = NULL;
			return 0;
		}

		{
			wxCriticalSectionLocker locker(parent->mutex);
			if (parent->cache.size() < 6) {
				if (parent->file.Eof()) moreToRead = false;
				else parent->cache.push_back(parent->ActuallyReadLine());
			}
		}

		Sleep(50);
	} while (moreToRead);

	// Die
	wxCriticalSectionLocker locker(parent->mutex);
	parent->thread = NULL;
	return 0;
}
|