Made gorgonsub's UTF-8/ASCII parser much faster, resulting in a 2x speedup when reading subtitles from such files.

Originally committed to SVN as r2060.
2008-03-15 03:24:38 +00:00 · 2008-03-15 03:24:38 +00:00 · d6d3f8aecb
commit d6d3f8aecb
parent 98d5794f20
6 changed files with 196 additions and 33 deletions
--- a/aegilib/aegilib.vcproj
+++ b/aegilib/aegilib.vcproj
@ -176,6 +176,10 @@
 				RelativePath=".\include\aegilib\exception.h"
 				>
 			</File>
+			<File
+				RelativePath=".\include\aegilib\fastbuffer.h"
+				>
+			</File>
 			<File
 				RelativePath=".\include\aegilib\format.h"
 				>
--- a/aegilib/include/aegilib/fastbuffer.h
+++ b/aegilib/include/aegilib/fastbuffer.h
@ -0,0 +1,104 @@
+// Copyright (c) 2005, Rodrigo Braz Monteiro
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//   * Redistributions of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//   * Redistributions in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//   * Neither the name of the Aegisub Group nor the names of its contributors
+//     may be used to endorse or promote products derived from this software
+//     without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------------
+//
+// AEGISUB
+//
+// Website: http://aegisub.cellosoft.com
+// Contact: mailto:zeratul@cellosoft.com
+//
+
+
+#pragma once
+#include <vector>
+#include "utils.h"
+
+
+namespace Gorgonsub {
+	// Fast buffer class
+	//
+	// A growable array of T backed by a std::vector, where the number of
+	// valid ("stored") entries is tracked separately in _size. The vector is
+	// only ever grown, and GetWritePtr keeps at least 4 spare entries past
+	// the stored data, so buffer.size() >= _size always holds.
+	//
+	// Entries are moved with memmove, so T must be a type that can safely be
+	// copied byte-wise (e.g. char).
+	template <typename T>
+	class FastBuffer {
+	private:
+		std::vector<T> buffer;	// Backing storage; may be larger than _size
+		size_t _size;		// Number of valid entries at the front of buffer
+
+	public:
+		// Constructor: starts with no stored entries and no storage
+		FastBuffer() : _size(0) {}
+
+		// Gets the stored size (number of valid entries, not the allocated size)
+		size_t GetSize() const { return _size; }
+
+		// Shifts all the buffer left, destroying the first steps entries.
+		// steps is clamped to the stored size, so over-shifting just empties
+		// the buffer.
+		void ShiftLeft(size_t steps) {
+			steps = Min(_size,steps);
+			const size_t remaining = _size - steps;
+
+			// Source and destination ranges overlap, so memmove is required
+			// (memcpy on overlapping ranges is undefined behaviour). The
+			// length is in bytes, hence the sizeof(T) factor. Skipped when
+			// there is nothing to move, which also avoids indexing the vector
+			// when it has no storage.
+			if (steps && remaining) memmove(&buffer[0],&buffer[steps],remaining*sizeof(T));
+			_size = remaining;
+		}
+
+		// Get a read pointer to the start of the buffer.
+		// The buffer must already have storage (see Alloc/GetWritePtr); the
+		// pointer may be invalidated by a later Alloc or GetWritePtr call,
+		// since those can resize the underlying vector.
+		const T* GetReadPtr() const { return &buffer[0]; }
+
+		// Get a non-const read pointer; same preconditions as GetReadPtr
+		T* GetMutableReadPtr() { return &buffer[0]; }
+
+		// Get a write pointer to a new area of the specified size.
+		// The stored size grows by size immediately; callers that write fewer
+		// entries should follow up with AssumeSize. Storage is grown so that
+		// at least 4 entries exist past the new stored size.
+		T* GetWritePtr(size_t size) {
+			const size_t oldSize = _size;
+			_size += size;
+			if (buffer.size() < _size+4) buffer.resize(_size+4);
+			return &buffer[oldSize];
+		}
+
+		// Assume that the buffer has a certain size, discarding anything
+		// beyond it. This can only shrink the stored size, never grow it.
+		void AssumeSize(size_t size) {
+			_size = Min(size,_size);
+		}
+
+		// Pre-Allocates memory for at least size entries.
+		// Never shrinks the storage, so the stored entries (and the padding
+		// after them) always stay backed by the vector.
+		void Alloc(size_t size) {
+			if (buffer.size() < size) buffer.resize(size);
+		}
+
+		// Finds the first line break ('\n' or '\r') in [start,end).
+		// end is clamped to the stored size so only valid entries are read.
+		// On success pos receives its index and character the entry found;
+		// otherwise pos is -1 and character is 0.
+		void FindLineBreak(size_t start,size_t end,int &pos,T &character) const {
+			pos = -1;
+			character = 0;
+			const T lineFeed = '\n';
+			const T carriageReturn = '\r';
+			end = Min(end,_size);
+			for (size_t i=start;i<end;i++) {
+				const T chr = buffer[i];
+				if (chr == lineFeed || chr == carriageReturn) {
+					pos = (int)i;
+					character = chr;
+					return;
+				}
+			}
+		}
+	};
+};
--- a/aegilib/include/aegilib/utils.h
+++ b/aegilib/include/aegilib/utils.h
@ -80,7 +80,7 @@ namespace Gorgonsub {
 	String IntegerToString(int value);
 	String PrettySize(int bytes);

-	// Fast string write
+	// Fast string functions
 	inline void WriteText(wxChar *&dst,const wxChar *src,size_t len,size_t &pos) {
 		memcpy(dst,src,len*sizeof(wxChar));
 		dst += len;
@ -92,4 +92,5 @@ namespace Gorgonsub {
 		pos++;
 	}
 	void WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t &pos);
+	const wxChar *StringTrim(wxString &str,size_t start);
 };
--- a/aegilib/src/text_file_reader.cpp
+++ b/aegilib/src/text_file_reader.cpp
@ -56,6 +56,7 @@ TextFileReader::TextFileReader(wxInputStream &stream,Gorgonsub::String enc,bool
 	trim = _trim;
 	threaded = prefetch && false;
 	thread = NULL;
+	_buffer.Alloc(4096);

 	// Set encoding
 	encoding = enc.c_str();
@ -107,9 +108,9 @@ void TextFileReader::SetEncodingConfiguration()
 // Reads a line from file
 Gorgonsub::String TextFileReader::ActuallyReadLine()
 {
-	wxString wxbuffer;
+	wxString stringBuffer;
 	size_t bufAlloc = 1024;
-	wxbuffer.Alloc(bufAlloc);
+	stringBuffer.Alloc(bufAlloc);
 	std::string buffer = "";

 	// Read UTF-16 line from file
@ -135,48 +136,68 @@ Gorgonsub::String TextFileReader::ActuallyReadLine()
 			ch = *((wchar_t*)charbuffer);
 			if (len >= bufAlloc - 1) {
 				bufAlloc *= 2;
-				wxbuffer.Alloc(bufAlloc);
+				stringBuffer.Alloc(bufAlloc);
 			}
-			wxbuffer += ch;
+			stringBuffer += ch;
 			len++;
 		}
+
+		// Remove line breaks
+		len = stringBuffer.Length();
+		for (size_t i=0;i<len;i++) {
+			if (stringBuffer[i] == _T('\r') || stringBuffer[i] == _T('\n')) stringBuffer[i] = _T(' ');
+		}
 	}

 	// Read ASCII/UTF-8 line from file
 	else {
-		//getline(file,buffer);
-		//wxbuffer.Clear();
-		//if (buffer.length()) wxbuffer = wxString(buffer.c_str(),*conv);
-		char temp = 0;
-		std::string buff;
-		while (temp != '\n' && !file.Eof()) {
-			file.Read(&temp,1);
-			if (temp != '\r') {
-				buff += temp;
-			}
-		}
-		if (buff.size()) wxbuffer = wxString(buff.c_str(),*conv);
+		// Look for a new line
+		int newLinePos = -1;
+		char newLineChar = 0;
+		size_t size = _buffer.GetSize();
+
+		// Find first line break
+		if (size) _buffer.FindLineBreak(0,size,newLinePos,newLineChar);
+
+		// If no line breaks were found, load more data from the file into the buffer
+		while (newLinePos == -1) {
+			// Read 2048 bytes
+			const size_t read = 2048;
+			size_t oldSize = _buffer.GetSize();
+			char *ptr = _buffer.GetWritePtr(read);
+			file.Read(ptr,read);
+			size_t lastRead = file.LastRead();
+			_buffer.AssumeSize(_buffer.GetSize()+lastRead-read);
+
+			// Find line break
+			_buffer.FindLineBreak(oldSize,lastRead+oldSize,newLinePos,newLineChar);
+
+			// End of file, force a line break
+			if (file.Eof() && newLinePos == -1) newLinePos = (int) _buffer.GetSize();
 		}

-	// Remove line breaks
-	//wxbuffer.Replace(_T("\r"),_T("\0"));
-	//wxbuffer.Replace(_T("\n"),_T("\0"));
-	size_t len=wxbuffer.Length();
-	for (size_t i=0;i<len;i++) {
-		if (wxbuffer[i] == _T('\r') || wxbuffer[i] == _T('\n')) wxbuffer[i] = _T(' ');
+		// Found newline
+		if (newLinePos != -1) {
+			// Replace newline with null character and convert to proper charset
+			char *read = _buffer.GetMutableReadPtr();
+			if (newLinePos) {
+				read[newLinePos] = 0;
+				stringBuffer = wxString(read,*conv);
+			}
+
+			// Also consume the next character if it is the other half of a \r\n or \n\r pair (13^7=10, 10^7=13)
+			if (read[newLinePos+1] == (newLineChar ^ 7)) newLinePos++;
+			_buffer.ShiftLeft(newLinePos+1);
+		}
 	}

 	// Remove BOM
-	if (wxbuffer.Length() > 0 && wxbuffer[0] == 0xFEFF) {
-		wxbuffer = wxbuffer.Mid(1);
-	}
+	size_t startPos = 0;
+	if (stringBuffer.Length() > 0 && stringBuffer[0] == 0xFEFF) startPos = 1;

 	// Trim
-	if (trim) {
-		wxbuffer.Trim(true);
-		wxbuffer.Trim(false);
-	}
-	return Gorgonsub::String(wxbuffer.c_str());
+	if (trim) return String(StringTrim(stringBuffer,startPos));
+	return String(stringBuffer.c_str() + startPos);
 }


@ -186,7 +207,7 @@ bool TextFileReader::HasMoreLines()
 {
 	if (cache.size()) return true;
 	wxCriticalSectionLocker locker(mutex);
-	return (!file.Eof());
+	return (!file.Eof() || _buffer.GetSize());
 }


--- a/aegilib/src/text_file_reader.h
+++ b/aegilib/src/text_file_reader.h
@ -39,6 +39,7 @@

 // Headers
 #include "Gorgonsub.h"
+#include "fastbuffer.h"
 #include <wx/stream.h>


@ -51,6 +52,8 @@ namespace Gorgonsub {
 		wxCriticalSection mutex;

 		std::list<String> cache;
+		FastBuffer<char> _buffer;
+
 		wxString encoding;
 		wxInputStream &file;
 		shared_ptr<wxMBConv> conv;
--- a/aegilib/src/utils.cpp
+++ b/aegilib/src/utils.cpp
@ -112,3 +112,33 @@ void Gorgonsub::WriteNumber(wxChar *&dst,wxChar *temp,int number,int pad,size_t
 		pos++;
 	}
 }
+
+
+/////////////////
+// Trim a string
+//
+// Skips leading spaces starting at startPos and cuts off trailing spaces,
+// then returns a pointer to the first remaining character.
+//
+// str      - string to trim. If it ends in spaces, a NUL is written over the
+//            first trailing space; the string object itself is not resized.
+// startPos - index to start from (e.g. 1 to skip a leading BOM). Values past
+//            the end of the string are clamped to its length.
+//
+// Returns a pointer obtained from str.c_str(), offset to the first non-space
+// character (or to the terminator if nothing is left).
+//
+// Only the space character ' ' is trimmed; tabs and other whitespace are kept.
+const wxChar *Gorgonsub::StringTrim(wxString &str,size_t startPos)
+{
+	const size_t len = str.Length();
+
+	// Skip leading spaces (clamping protects the pointer arithmetic below)
+	size_t start = startPos;
+	if (start > len) start = len;
+	while (start < len && str[start] == ' ') start++;
+
+	// Walk back over trailing spaces; never crosses start, so a string made
+	// only of spaces yields an empty result
+	size_t end = len;
+	while (end > start && str[end-1] == ' ') end--;
+
+	// Terminate at the first trailing space, if there is one. The write is
+	// done before c_str() is taken so the returned pointer reflects it.
+	if (end < len) str[end] = 0;
+	return str.c_str() + start;
+}