Made gorgonsub's UTF-8/ASCII parser much faster, resulting in a 2x subtitles reading speedup for such files.

Originally committed to SVN as r2060.
This commit is contained in:
Rodrigo Braz Monteiro 2008-03-15 03:24:38 +00:00
parent 98d5794f20
commit d6d3f8aecb
6 changed files with 196 additions and 33 deletions

View file

@ -176,6 +176,10 @@
RelativePath=".\include\aegilib\exception.h"
>
</File>
<File
RelativePath=".\include\aegilib\fastbuffer.h"
>
</File>
<File
RelativePath=".\include\aegilib\format.h"
>

View file

@ -0,0 +1,104 @@
// Copyright (c) 2005, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the Aegisub Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// -----------------------------------------------------------------------------
//
// AEGISUB
//
// Website: http://aegisub.cellosoft.com
// Contact: mailto:zeratul@cellosoft.com
//
#pragma once
#include <algorithm>
#include <cstddef>
#include <vector>
#include "utils.h"
namespace Gorgonsub {
// Fast buffer class
//
// A growable, contiguous buffer of T that keeps a logical "stored size"
// (_size) separate from the size of the underlying std::vector, so data can
// be appended through a raw write pointer and consumed from the front
// without reallocating on every operation.
//
// Invariant: buffer.size() >= _size at all times. GetWritePtr() additionally
// keeps 4 spare entries after the stored data, so callers may peek or write
// a terminator just past the end of the stored region.
template <typename T>
class FastBuffer {
private:
	std::vector<T> buffer;	// Backing storage; may be larger than _size
	size_t _size;			// Number of entries currently considered stored

public:
	// Constructor: starts with no stored entries and no allocated storage
	FastBuffer() : _size(0) {}

	// Gets the stored size (in entries, not bytes)
	size_t GetSize() const { return _size; }

	// Shifts all the buffer left, destroying the first "steps" entries.
	// If steps exceeds the stored size, everything is discarded.
	void ShiftLeft(size_t steps) {
		if (steps > _size) steps = _size;
		// Source and destination overlap, so memcpy is not allowed here.
		// std::copy is well defined when the destination starts before the
		// source, and it copies whole elements rather than raw bytes.
		if (steps > 0) std::copy(buffer.begin()+steps,buffer.begin()+_size,buffer.begin());
		_size -= steps;
	}

	// Get a read pointer to the start of the storage.
	// Returns NULL if no storage has been allocated yet.
	const T* GetReadPtr() const { return buffer.empty() ? NULL : &buffer[0]; }

	// Get a non-const read pointer to the start of the storage.
	// Returns NULL if no storage has been allocated yet.
	T* GetMutableReadPtr() { return buffer.empty() ? NULL : &buffer[0]; }

	// Get a write pointer to a new area of the specified size.
	// The stored size grows by "size" immediately; use AssumeSize() afterwards
	// if fewer entries were actually written. Pointers obtained earlier may
	// be invalidated, since the storage can be reallocated.
	T* GetWritePtr(size_t size) {
		const size_t oldSize = _size;
		_size += size;
		// Keep 4 spare entries after the stored data
		if (buffer.size() < _size+4) buffer.resize(_size+4);
		return &buffer[oldSize];
	}

	// Assume that the buffer has a certain size, discarding anything beyond it.
	// This can only shrink the stored size, never grow it.
	void AssumeSize(size_t size) {
		if (size < _size) _size = size;
	}

	// Pre-allocates memory for at least "size" entries.
	// Never shrinks the storage, so stored data always stays backed by it.
	void Alloc(size_t size) {
		if (buffer.size() < size) buffer.resize(size);
	}

	// Finds the first line break ('\n' or '\r') in the range [start,end).
	// The range is clamped to the stored size.
	// On success, pos receives its index and character the character found.
	// If none is found, pos is set to -1 and character to 0.
	void FindLineBreak(size_t start,size_t end,int &pos,T &character) const {
		pos = -1;
		character = 0;
		if (end > _size) end = _size;
		const T lineFeed = '\n';
		const T carriageReturn = '\r';
		for (size_t i=start;i<end;i++) {
			const T chr = buffer[i];
			if (chr == lineFeed || chr == carriageReturn) {
				pos = (int)i;
				character = chr;
				return;
			}
		}
	}
};
};

View file

@ -80,7 +80,7 @@ namespace Gorgonsub {
String IntegerToString(int value);
String PrettySize(int bytes);
// Fast string write
// Fast string functions
inline void WriteText(wxChar *&dst,const wxChar *src,size_t len,size_t &pos) {
memcpy(dst,src,len*sizeof(wxChar));
dst += len;
@ -92,4 +92,5 @@ namespace Gorgonsub {
pos++;
}
void WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t &pos);
const wxChar *StringTrim(wxString &str,size_t start);
};

View file

@ -56,6 +56,7 @@ TextFileReader::TextFileReader(wxInputStream &stream,Gorgonsub::String enc,bool
trim = _trim;
threaded = prefetch && false;
thread = NULL;
_buffer.Alloc(4096);
// Set encoding
encoding = enc.c_str();
@ -107,9 +108,9 @@ void TextFileReader::SetEncodingConfiguration()
// Reads a line from file
Gorgonsub::String TextFileReader::ActuallyReadLine()
{
wxString wxbuffer;
wxString stringBuffer;
size_t bufAlloc = 1024;
wxbuffer.Alloc(bufAlloc);
stringBuffer.Alloc(bufAlloc);
std::string buffer = "";
// Read UTF-16 line from file
@ -135,48 +136,68 @@ Gorgonsub::String TextFileReader::ActuallyReadLine()
ch = *((wchar_t*)charbuffer);
if (len >= bufAlloc - 1) {
bufAlloc *= 2;
wxbuffer.Alloc(bufAlloc);
stringBuffer.Alloc(bufAlloc);
}
wxbuffer += ch;
stringBuffer += ch;
len++;
}
// Remove line breaks
len = stringBuffer.Length();
for (size_t i=0;i<len;i++) {
if (stringBuffer[i] == _T('\r') || stringBuffer[i] == _T('\n')) stringBuffer[i] = _T(' ');
}
}
// Read ASCII/UTF-8 line from file
else {
//getline(file,buffer);
//wxbuffer.Clear();
//if (buffer.length()) wxbuffer = wxString(buffer.c_str(),*conv);
char temp = 0;
std::string buff;
while (temp != '\n' && !file.Eof()) {
file.Read(&temp,1);
if (temp != '\r') {
buff += temp;
}
}
if (buff.size()) wxbuffer = wxString(buff.c_str(),*conv);
// Look for a new line
int newLinePos = -1;
char newLineChar = 0;
size_t size = _buffer.GetSize();
// Find first line break
if (size) _buffer.FindLineBreak(0,size,newLinePos,newLineChar);
// If no line breaks were found, load more data from the file into the buffer
while (newLinePos == -1) {
// Read 2048 bytes
const size_t read = 2048;
size_t oldSize = _buffer.GetSize();
char *ptr = _buffer.GetWritePtr(read);
file.Read(ptr,read);
size_t lastRead = file.LastRead();
_buffer.AssumeSize(_buffer.GetSize()+lastRead-read);
// Find line break
_buffer.FindLineBreak(oldSize,lastRead+oldSize,newLinePos,newLineChar);
// End of file, force a line break
if (file.Eof() && newLinePos == -1) newLinePos = (int) _buffer.GetSize();
}
// Remove line breaks
//wxbuffer.Replace(_T("\r"),_T("\0"));
//wxbuffer.Replace(_T("\n"),_T("\0"));
size_t len=wxbuffer.Length();
for (size_t i=0;i<len;i++) {
if (wxbuffer[i] == _T('\r') || wxbuffer[i] == _T('\n')) wxbuffer[i] = _T(' ');
// Found newline
if (newLinePos != -1) {
// Replace newline with null character and convert to proper charset
char *read = _buffer.GetMutableReadPtr();
if (newLinePos) {
read[newLinePos] = 0;
stringBuffer = wxString(read,*conv);
}
// Remove an extra character if the next character is the complement of the line break found, i.e. \r after \n or \n after \r (13^7=10, 10^7=13)
if (read[newLinePos+1] == (newLineChar ^ 7)) newLinePos++;
_buffer.ShiftLeft(newLinePos+1);
}
}
// Remove BOM
if (wxbuffer.Length() > 0 && wxbuffer[0] == 0xFEFF) {
wxbuffer = wxbuffer.Mid(1);
}
size_t startPos = 0;
if (stringBuffer.Length() > 0 && stringBuffer[0] == 0xFEFF) startPos = 1;
// Trim
if (trim) {
wxbuffer.Trim(true);
wxbuffer.Trim(false);
}
return Gorgonsub::String(wxbuffer.c_str());
if (trim) return String(StringTrim(stringBuffer,startPos));
return String(stringBuffer.c_str() + startPos);
}
@ -186,7 +207,7 @@ bool TextFileReader::HasMoreLines()
{
if (cache.size()) return true;
wxCriticalSectionLocker locker(mutex);
return (!file.Eof());
return (!file.Eof() || _buffer.GetSize());
}

View file

@ -39,6 +39,7 @@
// Headers
#include "Gorgonsub.h"
#include "fastbuffer.h"
#include <wx/stream.h>
@ -51,6 +52,8 @@ namespace Gorgonsub {
wxCriticalSection mutex;
std::list<String> cache;
FastBuffer<char> _buffer;
wxString encoding;
wxInputStream &file;
shared_ptr<wxMBConv> conv;

View file

@ -112,3 +112,33 @@ void Gorgonsub::WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t
pos++;
}
}
/////////////////
// Trim a string
//
// Skips leading spaces starting at startPos and cuts off trailing spaces by
// writing a null character over the first space of the trailing run.
// Only the space character (' ') is treated as trimmable.
//
// str      - string to trim; it is modified in place when trailing spaces
//            are removed (a null character is written into it).
// startPos - index at which the text to trim begins; values past the end of
//            the string are clamped to its length.
//
// Returns a pointer into str's own storage, pointing at the first character
// kept. The pointer is only valid while str is alive and unmodified.
const wxChar *Gorgonsub::StringTrim(wxString &str,size_t startPos)
{
	const size_t len = str.Length();

	// Never start beyond the end of the string
	size_t start = startPos;
	if (start > len) start = len;

	// Skip leading spaces
	while (start < len && str[start] == ' ') start++;

	// Walk back over trailing spaces; end becomes the index of the first
	// space in the trailing run, or len if there is none
	size_t end = len;
	while (end > start && str[end-1] == ' ') end--;

	// Terminate the string where the trailing spaces begin
	if (end < len) str[end] = 0;

	return str.c_str() + start;
}