forked from mia/Aegisub
Made gorgonsub's UTF-8/ASCII parser much faster, resulting in a 2x subtitles reading speedup for such files.
Originally committed to SVN as r2060.
This commit is contained in:
parent
98d5794f20
commit
d6d3f8aecb
6 changed files with 196 additions and 33 deletions
|
@ -176,6 +176,10 @@
|
|||
RelativePath=".\include\aegilib\exception.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\include\aegilib\fastbuffer.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\include\aegilib\format.h"
|
||||
>
|
||||
|
|
104
aegilib/include/aegilib/fastbuffer.h
Normal file
104
aegilib/include/aegilib/fastbuffer.h
Normal file
|
@ -0,0 +1,104 @@
|
|||
// Copyright (c) 2005, Rodrigo Braz Monteiro
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
// * Neither the name of the Aegisub Group nor the names of its contributors
|
||||
// may be used to endorse or promote products derived from this software
|
||||
// without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
// POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// AEGISUB
|
||||
//
|
||||
// Website: http://aegisub.cellosoft.com
|
||||
// Contact: mailto:zeratul@cellosoft.com
|
||||
//
|
||||
|
||||
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
namespace Gorgonsub {
|
||||
// Fast buffer class
|
||||
// Fast buffer class
//
// A growable FIFO-style buffer backed by a std::vector. Data is appended
// through GetWritePtr(), consumed from the front through GetReadPtr() and
// discarded with ShiftLeft(). The logical ("stored") size is tracked
// separately from the vector's size so that the storage can be reused
// without reallocating.
//
// Elements are moved with memmove, so T must be trivially copyable
// (e.g. char, wchar_t, int).
template <typename T>
class FastBuffer {
private:
	// Number of spare elements kept allocated past the stored size, so that
	// callers may touch a few elements beyond the end of the data (for
	// example to write a terminator or peek at the next element).
	static const size_t slack = 4;

	std::vector<T> buffer;	// Backing storage; always at least _size+slack once written to
	size_t _size;			// Number of valid elements at the start of buffer

public:
	// Constructor
	FastBuffer() : _size(0) {}

	// Gets the stored size
	size_t GetSize() const { return _size; }

	// Shifts the whole buffer left, destroying the first "steps" entries.
	// If steps exceeds the stored size, the buffer simply becomes empty.
	void ShiftLeft(size_t steps) {
		if (steps > _size) steps = _size;
		const size_t remaining = _size - steps;

		// Source and destination overlap, so memmove (not memcpy) is required,
		// and the length must be expressed in bytes, not elements.
		if (steps > 0 && remaining > 0) {
			memmove(&buffer[0],&buffer[steps],remaining * sizeof(T));
		}
		_size = remaining;
	}

	// Get a read pointer (null if no storage has been allocated yet)
	const T* GetReadPtr() const {
		if (buffer.empty()) return 0;
		return &buffer[0];
	}

	// Get a non-const read pointer (null if no storage has been allocated yet)
	T* GetMutableReadPtr() {
		if (buffer.empty()) return 0;
		return &buffer[0];
	}

	// Get a write pointer to a new area of the specified size.
	// The stored size grows by "size" immediately; use AssumeSize() afterwards
	// to give back whatever part of the area was not actually filled.
	// The returned pointer is invalidated by the next call that grows the buffer.
	T* GetWritePtr(size_t size) {
		const size_t oldSize = _size;
		_size += size;
		if (buffer.size() < _size + slack) buffer.resize(_size + slack);
		return &buffer[oldSize];
	}

	// Assume that the buffer has a certain size, discarding anything beyond it.
	// This can only shrink the stored size, never grow it.
	void AssumeSize(size_t size) {
		if (size < _size) _size = size;
	}

	// Pre-allocates memory for at least "size" elements.
	// Never shrinks the storage, so data already stored is preserved.
	void Alloc(size_t size) {
		if (buffer.size() < size) buffer.resize(size);
	}

	// Finds the first line break ('\n' or '\r') in the range [start,end).
	// On success, pos receives its index and character receives which of the
	// two was found. If none is found, pos is -1 and character is 0.
	// The range is clamped to the stored size.
	void FindLineBreak(size_t start,size_t end,int &pos,T &character) {
		pos = -1;
		character = 0;
		if (end > _size) end = _size;

		const T lineFeed = '\n';
		const T carriageReturn = '\r';
		for (size_t i=start;i<end;i++) {
			const T chr = buffer[i];
			if (chr == lineFeed || chr == carriageReturn) {
				pos = (int)i;
				character = chr;
				return;
			}
		}
	}
};
|
||||
};
|
|
@ -80,7 +80,7 @@ namespace Gorgonsub {
|
|||
String IntegerToString(int value);
|
||||
String PrettySize(int bytes);
|
||||
|
||||
// Fast string write
|
||||
// Fast string functions
|
||||
inline void WriteText(wxChar *&dst,const wxChar *src,size_t len,size_t &pos) {
|
||||
memcpy(dst,src,len*sizeof(wxChar));
|
||||
dst += len;
|
||||
|
@ -92,4 +92,5 @@ namespace Gorgonsub {
|
|||
pos++;
|
||||
}
|
||||
void WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t &pos);
|
||||
const wxChar *StringTrim(wxString &str,size_t start);
|
||||
};
|
||||
|
|
|
@ -56,6 +56,7 @@ TextFileReader::TextFileReader(wxInputStream &stream,Gorgonsub::String enc,bool
|
|||
trim = _trim;
|
||||
threaded = prefetch && false;
|
||||
thread = NULL;
|
||||
_buffer.Alloc(4096);
|
||||
|
||||
// Set encoding
|
||||
encoding = enc.c_str();
|
||||
|
@ -107,9 +108,9 @@ void TextFileReader::SetEncodingConfiguration()
|
|||
// Reads a line from file
|
||||
Gorgonsub::String TextFileReader::ActuallyReadLine()
|
||||
{
|
||||
wxString wxbuffer;
|
||||
wxString stringBuffer;
|
||||
size_t bufAlloc = 1024;
|
||||
wxbuffer.Alloc(bufAlloc);
|
||||
stringBuffer.Alloc(bufAlloc);
|
||||
std::string buffer = "";
|
||||
|
||||
// Read UTF-16 line from file
|
||||
|
@ -135,48 +136,68 @@ Gorgonsub::String TextFileReader::ActuallyReadLine()
|
|||
ch = *((wchar_t*)charbuffer);
|
||||
if (len >= bufAlloc - 1) {
|
||||
bufAlloc *= 2;
|
||||
wxbuffer.Alloc(bufAlloc);
|
||||
stringBuffer.Alloc(bufAlloc);
|
||||
}
|
||||
wxbuffer += ch;
|
||||
stringBuffer += ch;
|
||||
len++;
|
||||
}
|
||||
|
||||
// Remove line breaks
|
||||
len = stringBuffer.Length();
|
||||
for (size_t i=0;i<len;i++) {
|
||||
if (stringBuffer[i] == _T('\r') || stringBuffer[i] == _T('\n')) stringBuffer[i] = _T(' ');
|
||||
}
|
||||
}
|
||||
|
||||
// Read ASCII/UTF-8 line from file
|
||||
else {
|
||||
//getline(file,buffer);
|
||||
//wxbuffer.Clear();
|
||||
//if (buffer.length()) wxbuffer = wxString(buffer.c_str(),*conv);
|
||||
char temp = 0;
|
||||
std::string buff;
|
||||
while (temp != '\n' && !file.Eof()) {
|
||||
file.Read(&temp,1);
|
||||
if (temp != '\r') {
|
||||
buff += temp;
|
||||
}
|
||||
}
|
||||
if (buff.size()) wxbuffer = wxString(buff.c_str(),*conv);
|
||||
}
|
||||
// Look for a new line
|
||||
int newLinePos = -1;
|
||||
char newLineChar = 0;
|
||||
size_t size = _buffer.GetSize();
|
||||
|
||||
// Remove line breaks
|
||||
//wxbuffer.Replace(_T("\r"),_T("\0"));
|
||||
//wxbuffer.Replace(_T("\n"),_T("\0"));
|
||||
size_t len=wxbuffer.Length();
|
||||
for (size_t i=0;i<len;i++) {
|
||||
if (wxbuffer[i] == _T('\r') || wxbuffer[i] == _T('\n')) wxbuffer[i] = _T(' ');
|
||||
// Find first line break
|
||||
if (size) _buffer.FindLineBreak(0,size,newLinePos,newLineChar);
|
||||
|
||||
// If no line breaks were found, load more data from the file into the buffer
|
||||
while (newLinePos == -1) {
|
||||
// Read 2048 bytes
|
||||
const size_t read = 2048;
|
||||
size_t oldSize = _buffer.GetSize();
|
||||
char *ptr = _buffer.GetWritePtr(read);
|
||||
file.Read(ptr,read);
|
||||
size_t lastRead = file.LastRead();
|
||||
_buffer.AssumeSize(_buffer.GetSize()+lastRead-read);
|
||||
|
||||
// Find line break
|
||||
_buffer.FindLineBreak(oldSize,lastRead+oldSize,newLinePos,newLineChar);
|
||||
|
||||
// End of file, force a line break
|
||||
if (file.Eof() && newLinePos == -1) newLinePos = (int) _buffer.GetSize();
|
||||
}
|
||||
|
||||
// Found newline
|
||||
if (newLinePos != -1) {
|
||||
// Replace newline with null character and convert to proper charset
|
||||
char *read = _buffer.GetMutableReadPtr();
|
||||
if (newLinePos) {
|
||||
read[newLinePos] = 0;
|
||||
stringBuffer = wxString(read,*conv);
|
||||
}
|
||||
|
||||
// Remove an extra character if the new is the complement of \n,\r (13^7=10, 10^7=13)
|
||||
if (read[newLinePos+1] == (newLineChar ^ 7)) newLinePos++;
|
||||
_buffer.ShiftLeft(newLinePos+1);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove BOM
|
||||
if (wxbuffer.Length() > 0 && wxbuffer[0] == 0xFEFF) {
|
||||
wxbuffer = wxbuffer.Mid(1);
|
||||
}
|
||||
size_t startPos = 0;
|
||||
if (stringBuffer.Length() > 0 && stringBuffer[0] == 0xFEFF) startPos = 1;
|
||||
|
||||
// Trim
|
||||
if (trim) {
|
||||
wxbuffer.Trim(true);
|
||||
wxbuffer.Trim(false);
|
||||
}
|
||||
return Gorgonsub::String(wxbuffer.c_str());
|
||||
if (trim) return String(StringTrim(stringBuffer,startPos));
|
||||
return String(stringBuffer.c_str() + startPos);
|
||||
}
|
||||
|
||||
|
||||
|
@ -186,7 +207,7 @@ bool TextFileReader::HasMoreLines()
|
|||
{
|
||||
if (cache.size()) return true;
|
||||
wxCriticalSectionLocker locker(mutex);
|
||||
return (!file.Eof());
|
||||
return (!file.Eof() || _buffer.GetSize());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
|
||||
// Headers
|
||||
#include "Gorgonsub.h"
|
||||
#include "fastbuffer.h"
|
||||
#include <wx/stream.h>
|
||||
|
||||
|
||||
|
@ -51,6 +52,8 @@ namespace Gorgonsub {
|
|||
wxCriticalSection mutex;
|
||||
|
||||
std::list<String> cache;
|
||||
FastBuffer<char> _buffer;
|
||||
|
||||
wxString encoding;
|
||||
wxInputStream &file;
|
||||
shared_ptr<wxMBConv> conv;
|
||||
|
|
|
@ -112,3 +112,33 @@ void Gorgonsub::WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t
|
|||
pos++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/////////////////
|
||||
// Trim a string
|
||||
// Trims leading and trailing spaces from a string, in place.
//
// str      - String to trim. If it has trailing spaces, the first of them is
//            overwritten with a null character; the string's length is not
//            otherwise adjusted.
// startPos - Index at which the text of interest begins (anything before it
//            is skipped). Values past the end of the string are clamped.
//
// Returns a pointer into str's own character data, at the first non-space
// character at or after startPos. The pointer is only valid while str is
// alive and unmodified. Only the space character (' ') is trimmed.
const wxChar *Gorgonsub::StringTrim(wxString &str,size_t startPos)
{
	const size_t len = str.Length();

	// Skip leading spaces
	size_t start = startPos;
	if (start > len) start = len;
	while (start < len) {
		const wxChar cur = str[start];
		if (cur != ' ') break;
		start++;
	}

	// Walk back over trailing spaces
	size_t end = len;
	while (end > start) {
		const wxChar cur = str[end-1];
		if (cur != ' ') break;
		end--;
	}

	// Terminate the text at the first trailing space, if any
	if (end < len) str[end] = 0;
	return str.c_str() + start;
}
|
||||
|
|
Loading…
Reference in a new issue