forked from mia/Aegisub
Made gorgonsub's UTF-8/ASCII parser much faster, resulting in a 2x subtitles reading speedup for such files.
Originally committed to SVN as r2060.
This commit is contained in:
parent
98d5794f20
commit
d6d3f8aecb
6 changed files with 196 additions and 33 deletions
|
@ -176,6 +176,10 @@
|
||||||
RelativePath=".\include\aegilib\exception.h"
|
RelativePath=".\include\aegilib\exception.h"
|
||||||
>
|
>
|
||||||
</File>
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath=".\include\aegilib\fastbuffer.h"
|
||||||
|
>
|
||||||
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath=".\include\aegilib\format.h"
|
RelativePath=".\include\aegilib\format.h"
|
||||||
>
|
>
|
||||||
|
|
104
aegilib/include/aegilib/fastbuffer.h
Normal file
104
aegilib/include/aegilib/fastbuffer.h
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
// Copyright (c) 2005, Rodrigo Braz Monteiro
|
||||||
|
// All rights reserved.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are met:
|
||||||
|
//
|
||||||
|
// * Redistributions of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer.
|
||||||
|
// * Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
// this list of conditions and the following disclaimer in the documentation
|
||||||
|
// and/or other materials provided with the distribution.
|
||||||
|
// * Neither the name of the Aegisub Group nor the names of its contributors
|
||||||
|
// may be used to endorse or promote products derived from this software
|
||||||
|
// without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
// POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// AEGISUB
|
||||||
|
//
|
||||||
|
// Website: http://aegisub.cellosoft.com
|
||||||
|
// Contact: mailto:zeratul@cellosoft.com
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include <cstddef>
#include <cstring>
#include <vector>
#include "utils.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace Gorgonsub {
|
||||||
|
// Fast buffer class
//
// A contiguous, growable buffer of T that tracks a logical size (_size)
// separately from the storage held by the underlying std::vector. Data is
// appended through GetWritePtr() and consumed from the front with ShiftLeft().
//
// Invariant: buffer.size() >= _size at all times.
//
// T is expected to be a plain character-like type: ShiftLeft() moves elements
// with memmove and FindLineBreak() compares them against '\n' and '\r'.
template <typename T>
class FastBuffer {
private:
	// Extra entries kept allocated past the logical end, so that callers may
	// touch a few entries at or just after index GetSize() (e.g. to write a
	// terminator) without running off the storage.
	enum { PADDING = 4 };

	std::vector<T> buffer;	// Backing storage; may be larger than _size
	size_t _size;		// Number of valid entries at the front of buffer

public:
	// Constructor: starts empty, with no storage allocated
	FastBuffer() : _size(0) {}

	// Gets the stored size (number of valid entries, not the allocated size)
	size_t GetSize() const { return _size; }

	// Shifts all the buffer left, destroying the first `steps` entries.
	// `steps` is clamped to the stored size, so over-shifting just empties
	// the buffer.
	void ShiftLeft(size_t steps) {
		if (steps > _size) steps = _size;
		const size_t remaining = _size - steps;

		// Source and destination overlap, so memmove (not memcpy) is required,
		// and the length must be given in bytes rather than in entries.
		// Skipped when there is nothing to move, which also avoids indexing
		// an empty vector.
		if (steps != 0 && remaining != 0) {
			std::memmove(&buffer[0],&buffer[steps],remaining*sizeof(T));
		}
		_size = remaining;
	}

	// Get a read pointer to the start of the data.
	// Returns a null pointer if no storage has been allocated yet.
	const T* GetReadPtr() const { return buffer.empty() ? 0 : &buffer[0]; }

	// Get a non-const read pointer to the start of the data.
	// Returns a null pointer if no storage has been allocated yet.
	T* GetMutableReadPtr() { return buffer.empty() ? 0 : &buffer[0]; }

	// Get a write pointer to a new area of the specified size, appended after
	// the current data. The stored size grows by `size` immediately; use
	// AssumeSize() afterwards if fewer entries were actually written.
	// Growing the storage may invalidate previously returned pointers.
	T* GetWritePtr(size_t size) {
		const size_t oldSize = _size;
		_size += size;
		if (buffer.size() < _size+PADDING) buffer.resize(_size+PADDING);
		return &buffer[oldSize];
	}

	// Assume that the buffer has a certain size, discarding anything beyond
	// it. This can only shrink the stored size, never grow it.
	void AssumeSize(size_t size) {
		if (size < _size) _size = size;
	}

	// Pre-allocates memory for at least `size` entries.
	// Never shrinks the storage, so the stored data always stays backed.
	void Alloc(size_t size) {
		if (buffer.size() < size) buffer.resize(size);
	}

	// Finds the first line break ('\n' or '\r') in the range [start,end).
	// `end` is clamped to the stored size. On success `pos` receives the
	// index of the line break and `character` the character found; otherwise
	// `pos` is -1 and `character` is 0.
	void FindLineBreak(size_t start,size_t end,int &pos,T &character) const {
		pos = -1;
		character = 0;
		if (end > _size) end = _size;

		const T lineFeed = '\n';
		const T carriageReturn = '\r';
		for (size_t i=start;i<end;i++) {
			const T chr = buffer[i];
			if (chr == lineFeed || chr == carriageReturn) {
				pos = static_cast<int>(i);
				character = chr;
				return;
			}
		}
	}
};
|
||||||
|
};
|
|
@ -80,7 +80,7 @@ namespace Gorgonsub {
|
||||||
String IntegerToString(int value);
|
String IntegerToString(int value);
|
||||||
String PrettySize(int bytes);
|
String PrettySize(int bytes);
|
||||||
|
|
||||||
// Fast string write
|
// Fast string functions
|
||||||
inline void WriteText(wxChar *&dst,const wxChar *src,size_t len,size_t &pos) {
|
inline void WriteText(wxChar *&dst,const wxChar *src,size_t len,size_t &pos) {
|
||||||
memcpy(dst,src,len*sizeof(wxChar));
|
memcpy(dst,src,len*sizeof(wxChar));
|
||||||
dst += len;
|
dst += len;
|
||||||
|
@ -92,4 +92,5 @@ namespace Gorgonsub {
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
void WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t &pos);
|
void WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t &pos);
|
||||||
|
const wxChar *StringTrim(wxString &str,size_t start);
|
||||||
};
|
};
|
||||||
|
|
|
@ -56,6 +56,7 @@ TextFileReader::TextFileReader(wxInputStream &stream,Gorgonsub::String enc,bool
|
||||||
trim = _trim;
|
trim = _trim;
|
||||||
threaded = prefetch && false;
|
threaded = prefetch && false;
|
||||||
thread = NULL;
|
thread = NULL;
|
||||||
|
_buffer.Alloc(4096);
|
||||||
|
|
||||||
// Set encoding
|
// Set encoding
|
||||||
encoding = enc.c_str();
|
encoding = enc.c_str();
|
||||||
|
@ -107,9 +108,9 @@ void TextFileReader::SetEncodingConfiguration()
|
||||||
// Reads a line from file
|
// Reads a line from file
|
||||||
Gorgonsub::String TextFileReader::ActuallyReadLine()
|
Gorgonsub::String TextFileReader::ActuallyReadLine()
|
||||||
{
|
{
|
||||||
wxString wxbuffer;
|
wxString stringBuffer;
|
||||||
size_t bufAlloc = 1024;
|
size_t bufAlloc = 1024;
|
||||||
wxbuffer.Alloc(bufAlloc);
|
stringBuffer.Alloc(bufAlloc);
|
||||||
std::string buffer = "";
|
std::string buffer = "";
|
||||||
|
|
||||||
// Read UTF-16 line from file
|
// Read UTF-16 line from file
|
||||||
|
@ -135,48 +136,68 @@ Gorgonsub::String TextFileReader::ActuallyReadLine()
|
||||||
ch = *((wchar_t*)charbuffer);
|
ch = *((wchar_t*)charbuffer);
|
||||||
if (len >= bufAlloc - 1) {
|
if (len >= bufAlloc - 1) {
|
||||||
bufAlloc *= 2;
|
bufAlloc *= 2;
|
||||||
wxbuffer.Alloc(bufAlloc);
|
stringBuffer.Alloc(bufAlloc);
|
||||||
}
|
}
|
||||||
wxbuffer += ch;
|
stringBuffer += ch;
|
||||||
len++;
|
len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Remove line breaks
|
||||||
|
len = stringBuffer.Length();
|
||||||
|
for (size_t i=0;i<len;i++) {
|
||||||
|
if (stringBuffer[i] == _T('\r') || stringBuffer[i] == _T('\n')) stringBuffer[i] = _T(' ');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read ASCII/UTF-8 line from file
|
// Read ASCII/UTF-8 line from file
|
||||||
else {
|
else {
|
||||||
//getline(file,buffer);
|
// Look for a new line
|
||||||
//wxbuffer.Clear();
|
int newLinePos = -1;
|
||||||
//if (buffer.length()) wxbuffer = wxString(buffer.c_str(),*conv);
|
char newLineChar = 0;
|
||||||
char temp = 0;
|
size_t size = _buffer.GetSize();
|
||||||
std::string buff;
|
|
||||||
while (temp != '\n' && !file.Eof()) {
|
|
||||||
file.Read(&temp,1);
|
|
||||||
if (temp != '\r') {
|
|
||||||
buff += temp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (buff.size()) wxbuffer = wxString(buff.c_str(),*conv);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove line breaks
|
// Find first line break
|
||||||
//wxbuffer.Replace(_T("\r"),_T("\0"));
|
if (size) _buffer.FindLineBreak(0,size,newLinePos,newLineChar);
|
||||||
//wxbuffer.Replace(_T("\n"),_T("\0"));
|
|
||||||
size_t len=wxbuffer.Length();
|
// If no line breaks were found, load more data from the file into the buffer
|
||||||
for (size_t i=0;i<len;i++) {
|
while (newLinePos == -1) {
|
||||||
if (wxbuffer[i] == _T('\r') || wxbuffer[i] == _T('\n')) wxbuffer[i] = _T(' ');
|
// Read 2048 bytes
|
||||||
|
const size_t read = 2048;
|
||||||
|
size_t oldSize = _buffer.GetSize();
|
||||||
|
char *ptr = _buffer.GetWritePtr(read);
|
||||||
|
file.Read(ptr,read);
|
||||||
|
size_t lastRead = file.LastRead();
|
||||||
|
_buffer.AssumeSize(_buffer.GetSize()+lastRead-read);
|
||||||
|
|
||||||
|
// Find line break
|
||||||
|
_buffer.FindLineBreak(oldSize,lastRead+oldSize,newLinePos,newLineChar);
|
||||||
|
|
||||||
|
// End of file, force a line break
|
||||||
|
if (file.Eof() && newLinePos == -1) newLinePos = (int) _buffer.GetSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Found newline
|
||||||
|
if (newLinePos != -1) {
|
||||||
|
// Replace newline with null character and convert to proper charset
|
||||||
|
char *read = _buffer.GetMutableReadPtr();
|
||||||
|
if (newLinePos) {
|
||||||
|
read[newLinePos] = 0;
|
||||||
|
stringBuffer = wxString(read,*conv);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also skip the next character if it is the complementary line break, i.e. the other of \n,\r (13^7=10, 10^7=13)
|
||||||
|
if (read[newLinePos+1] == (newLineChar ^ 7)) newLinePos++;
|
||||||
|
_buffer.ShiftLeft(newLinePos+1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove BOM
|
// Remove BOM
|
||||||
if (wxbuffer.Length() > 0 && wxbuffer[0] == 0xFEFF) {
|
size_t startPos = 0;
|
||||||
wxbuffer = wxbuffer.Mid(1);
|
if (stringBuffer.Length() > 0 && stringBuffer[0] == 0xFEFF) startPos = 1;
|
||||||
}
|
|
||||||
|
|
||||||
// Trim
|
// Trim
|
||||||
if (trim) {
|
if (trim) return String(StringTrim(stringBuffer,startPos));
|
||||||
wxbuffer.Trim(true);
|
return String(stringBuffer.c_str() + startPos);
|
||||||
wxbuffer.Trim(false);
|
|
||||||
}
|
|
||||||
return Gorgonsub::String(wxbuffer.c_str());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -186,7 +207,7 @@ bool TextFileReader::HasMoreLines()
|
||||||
{
|
{
|
||||||
if (cache.size()) return true;
|
if (cache.size()) return true;
|
||||||
wxCriticalSectionLocker locker(mutex);
|
wxCriticalSectionLocker locker(mutex);
|
||||||
return (!file.Eof());
|
return (!file.Eof() || _buffer.GetSize());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
|
|
||||||
// Headers
|
// Headers
|
||||||
#include "Gorgonsub.h"
|
#include "Gorgonsub.h"
|
||||||
|
#include "fastbuffer.h"
|
||||||
#include <wx/stream.h>
|
#include <wx/stream.h>
|
||||||
|
|
||||||
|
|
||||||
|
@ -51,6 +52,8 @@ namespace Gorgonsub {
|
||||||
wxCriticalSection mutex;
|
wxCriticalSection mutex;
|
||||||
|
|
||||||
std::list<String> cache;
|
std::list<String> cache;
|
||||||
|
FastBuffer<char> _buffer;
|
||||||
|
|
||||||
wxString encoding;
|
wxString encoding;
|
||||||
wxInputStream &file;
|
wxInputStream &file;
|
||||||
shared_ptr<wxMBConv> conv;
|
shared_ptr<wxMBConv> conv;
|
||||||
|
|
|
@ -112,3 +112,33 @@ void Gorgonsub::WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////
|
||||||
|
// Trim a string
|
||||||
|
const wxChar *Gorgonsub::StringTrim(wxString &str,size_t startPos)
|
||||||
|
{
|
||||||
|
size_t len = str.Length();
|
||||||
|
size_t start = startPos;
|
||||||
|
size_t end = len;
|
||||||
|
bool isStart = true;
|
||||||
|
bool isEnd = false;
|
||||||
|
wxChar cur;
|
||||||
|
for (size_t i=start;i<len;i++) {
|
||||||
|
cur = str[i];
|
||||||
|
if (isStart)
|
||||||
|
if (cur == ' ') start++;
|
||||||
|
else isStart = false;
|
||||||
|
if (isEnd)
|
||||||
|
if (cur != ' ') isEnd = false;
|
||||||
|
else {
|
||||||
|
if (cur == ' ') {
|
||||||
|
isEnd = true;
|
||||||
|
end = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
startPos = start;
|
||||||
|
if (isEnd) str[end] = 0;
|
||||||
|
return str.c_str() + startPos;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue