Extract word-splitting logic from the syntax highlighter

2012-11-06 16:26:00 -08:00 · 2012-11-06 16:26:00 -08:00 · 88fdee726b
commit 88fdee726b
parent 24c21dd425
13 changed files with 482 additions and 230 deletions
--- a/aegisub/libaegisub/ass/dialogue_parser.cpp
+++ b/aegisub/libaegisub/ass/dialogue_parser.cpp
@ -26,49 +26,76 @@
 namespace {
 typedef std::vector<agi::ass::DialogueToken> TokenVec;
-namespace dt = agi::ass::DialogueTokenType;
+using namespace agi::ass;
-namespace ss = agi::ass::SyntaxStyle;
+namespace dt = DialogueTokenType;
 namespace ss = SyntaxStyle;
 class SyntaxHighlighter {
 	TokenVec ranges;
 	std::string const& text;
 	agi::SpellChecker *spellchecker;
 	agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
 	void SetStyling(int len, int type) {
 		if (ranges.size() && ranges.back().type == type)
 			ranges.back().length += len;
 		else
-			ranges.push_back(agi::ass::DialogueToken(type, len));
+			ranges.push_back(DialogueToken(type, len));
 	}
-	void CheckWord(int start, int end) {
+public:
-		int len = end - start;
+	SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
-		if (!len) return;
+	: text(text)
 	, spellchecker(spellchecker)
 	{ }
-		if (!spellchecker->CheckWord(text.substr(start, len)))
+	TokenVec Highlight(TokenVec const& tokens, bool template_line) {
-			SetStyling(len, ss::SPELLING);
+		if (tokens.empty()) return ranges;
-		else
+
-			SetStyling(len, ss::NORMAL);
+		size_t pos = 0;
 		for (size_t i = 0; i < tokens.size(); ++i) {
 			size_t len = tokens[i].length;
 			switch (tokens[i].type) {
 				case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break;
 				case dt::ERROR:      SetStyling(len, ss::ERROR);      break;
 				case dt::ARG:        SetStyling(len, ss::PARAMETER);  break;
 				case dt::COMMENT:    SetStyling(len, ss::COMMENT);    break;
 				case dt::WHITESPACE: SetStyling(len, ss::NORMAL);     break;
 				case dt::DRAWING:    SetStyling(len, ss::DRAWING);    break;
 				case dt::TEXT:       SetStyling(len, ss::NORMAL);     break;
 				case dt::TAG_NAME:   SetStyling(len, ss::TAG);        break;
 				case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
 					SetStyling(len, ss::PUNCTUATION);
 					break;
 				case dt::OVR_BEGIN: case dt::OVR_END:
 					SetStyling(len, ss::OVERRIDE);
 					break;
 				case dt::WORD:
 					if (spellchecker && !spellchecker->CheckWord(text.substr(pos, len)))
 						SetStyling(len, ss::SPELLING);
 					else
 						SetStyling(len, ss::NORMAL);
 					break;
 			}
 			pos += len;
 			// karaoke templater
 		}
 		return ranges;
 	}
 };
-	int NextChar(int pos, int len, int& char_len) {
+class WordSplitter {
-		int chr = 0;
+	std::string const& text;
-		char *inptr = const_cast<char *>(&text[pos]);
+	std::vector<DialogueToken> &tokens;
-		size_t inlen = len;
+	agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
-		char *outptr = (char *)&chr;
+	size_t last_ovr_end;
-		size_t outlen = sizeof chr;
+	size_t pos;
 	bool in_drawing;
-		iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
+	bool IsWordSep(int chr) {
-		if (outlen != 0)
+		static const int delims[] = {
 			return 0;
 		char_len = len - inlen;
 		return chr;
 	}
 	void StyleSpellCheck(int pos, int len) {
 		const int delims[] = {
 			0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
 			0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
 			0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
@ -86,38 +113,72 @@ class SyntaxHighlighter {
 			0xff5b, 0xff5d, 0xff5e
 		};
-		int chrlen = 0;
+		return std::binary_search(std::begin(delims), std::end(delims), chr);
-		int start = pos;
+	}
 		for (; len > 0; pos += chrlen, len -= chrlen) {
 			int chr = NextChar(pos, len, chrlen);
 			if (!chr) return;
-			if (std::binary_search(std::begin(delims), std::end(delims), chr)) {
+	int NextChar(int pos, int len, int& char_len) {
-				CheckWord(start, pos);
+		int chr = 0;
-				SetStyling(1, ss::NORMAL);
+		char *inptr = const_cast<char *>(&text[pos]);
-				start = pos + 1;
+		size_t inlen = len;
-			}
+		char *outptr = (char *)&chr;
 		size_t outlen = sizeof chr;
 		iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
 		if (outlen != 0)
 			return 0;
 		char_len = len - inlen;
 		return chr;
 	}
 	/// Give the trailing len units of token i the requested type
 	/// @param[in,out] i Index of the token being examined; moved to the newly inserted token when a split happens
 	/// @param type Token type the tail should have
 	/// @param len Length of the tail which should have that type
 	void SwitchTo(size_t &i, int type, int len) {
 		const bool already_typed = tokens[i].type == type;
 		if (already_typed)
 			return;
 		// The tail is the entire token, so it can simply be retagged in place
 		if (tokens[i].length == static_cast<size_t>(len)) {
 			tokens[i].type = type;
 			return;
 		}
 		// Otherwise carve the tail off into a new token placed right after
 		// this one and continue from that new token
 		tokens.insert(tokens.begin() + (i + 1), DialogueToken(type, len));
 		tokens[i].length -= len;
 		// NOTE(review): presumably keeps last_ovr_end pointing at the same
 		// token now that one more token precedes it -- confirm
 		++last_ovr_end;
 		++i;
 	}
 	void SplitText(size_t &i) {
 		if (in_drawing) {
 			tokens[i].type = dt::DRAWING;
 			return;
 		}
-		CheckWord(start, pos);
+		int chrlen = 0;
 		int len = tokens[i].length;
 		int tpos = pos;
 		for (; len > 0; tpos += chrlen, len -= chrlen) {
 			int chr = NextChar(tpos, len, chrlen);
 			if (!chr) return;
 			if (IsWordSep(chr))
 				SwitchTo(i, dt::TEXT, len);
 			else
 				SwitchTo(i, dt::WORD, len);
 		}
 	}
 public:
-	SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
+	WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
 	: text(text)
-	, spellchecker(spellchecker)
+	, tokens(tokens)
 	, utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close)
 	, last_ovr_end(0)
 	, pos(0)
 	, in_drawing(false)
 	{ }
-	TokenVec Highlight(TokenVec const& tokens, bool template_line) {
+	void SplitWords() {
-		if (tokens.empty()) return ranges;
+		if (tokens.empty()) return;
 		bool in_drawing = false;
 		size_t pos = 0;
 		// VSFilter treats unclosed override blocks as plain text, so pretend
 		// all tokens after the last override block are TEXT
 		size_t last_ovr_end = 0;
 		for (size_t i = tokens.size(); i > 0; --i) {
 			if (tokens[i - 1].type == dt::OVR_END) {
 				last_ovr_end = i - 1;
@ -127,30 +188,14 @@ public:
 		for (size_t i = 0; i < tokens.size(); ++i) {
 			size_t len = tokens[i].length;
-			switch (i > last_ovr_end ? dt::TEXT : tokens[i].type) {
+			switch (tokens[i].type) {
-				case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break;
+				case dt::LINE_BREAK: break;
-				case dt::ERROR:      SetStyling(len, ss::ERROR); break;
+				case dt::TEXT: SplitText(i); break;
 				case dt::ARG:        SetStyling(len, ss::PARAMETER); break;
 				case dt::COMMENT:    SetStyling(len, ss::COMMENT); break;
 				case dt::WHITESPACE: SetStyling(len, ss::NORMAL); break;
 				case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
 					SetStyling(len, ss::PUNCTUATION);
 					break;
 				case dt::OVR_BEGIN: case dt::OVR_END:
 					SetStyling(len, ss::OVERRIDE);
 					break;
 				case dt::TEXT:
 					if (in_drawing)
 						SetStyling(len, ss::DRAWING);
 					else if (spellchecker)
 						StyleSpellCheck(pos, len);
 					else
 						SetStyling(len, ss::NORMAL);
 					break;
 				case dt::TAG_NAME:
-					SetStyling(len, ss::TAG);
+					if (i > last_ovr_end) {
 						SplitText(i);
 						break;
 					}
 					if (len != 1 || i + 1 >= tokens.size() || text[pos] != 'p')
 						break;
@ -170,13 +215,14 @@ public:
 							break;
 					}
 					break;
 				default:
 					if (i > last_ovr_end)
 						SplitText(i);
 					break;
 			}
 			pos += len;
 			// karaoke templater
 		}
 		return ranges;
 	}
 };
 }
@ -188,5 +234,9 @@ std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<
 	return SyntaxHighlighter(text, spellchecker).Highlight(tokens, template_line);
 }
 void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens) {
 	WordSplitter(str, tokens).SplitWords();
 }
 }
 }
--- a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
+++ b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
@ -27,6 +27,7 @@ namespace agi {
 		namespace DialogueTokenType {
 			enum {
 				TEXT = 1000,
 				WORD,
 				LINE_BREAK,
 				OVR_BEGIN,
 				OVR_END,
@ -38,7 +39,8 @@ namespace agi {
 				ARG,
 				ERROR,
 				COMMENT,
-				WHITESPACE
+				WHITESPACE,
 				DRAWING
 			};
 		}
@ -66,8 +68,13 @@ namespace agi {
 			DialogueToken(int type, size_t length) : type(type), length(length) { }
 		};
 		/// Tokenize the passed string as the body of a dialogue line
 		std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str);
 		/// Split the words in the TEXT tokens of the lexed line into their
 		/// own tokens and convert the body of drawings to DRAWING tokens
 		void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens);
 		std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<DialogueToken> const& tokens, bool template_line, SpellChecker *spellchecker);
 	}
 }
--- a/aegisub/src/dialog_spellchecker.cpp
+++ b/aegisub/src/dialog_spellchecker.cpp
@ -47,6 +47,7 @@
 #include "subs_edit_ctrl.h"
 #include "utils.h"
 #include <libaegisub/ass/dialogue_parser.h>
 #include <libaegisub/exception.h>
 #include <libaegisub/spellchecker.h>
@ -168,7 +169,7 @@ void DialogSpellChecker::OnReplace(wxCommandEvent&) {
 }
 void DialogSpellChecker::OnReplaceAll(wxCommandEvent&) {
-	auto_replace[orig_word->GetValue()] = replace_word->GetValue();
+	auto_replace[from_wx(orig_word->GetValue())] = from_wx(replace_word->GetValue());
 	Replace();
 	FindNext();
@ -179,7 +180,7 @@ void DialogSpellChecker::OnIgnore(wxCommandEvent&) {
 }
 void DialogSpellChecker::OnIgnoreAll(wxCommandEvent&) {
-	auto_ignore.insert(orig_word->GetValue());
+	auto_ignore.emplace(from_wx(orig_word->GetValue()));
 	FindNext();
 }
@ -247,19 +248,22 @@ bool DialogSpellChecker::FindNext() {
 bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int *commit_id) {
 	if (active_line->Comment && skip_comments->GetValue()) return false;
-	IntPairVector results;
+	std::string text = from_wx(active_line->Text);
-	GetWordBoundaries(active_line->Text, results);
+	auto tokens = agi::ass::TokenizeDialogueBody(text);
 	agi::ass::SplitWords(text, tokens);
-	int shift = 0;
+	word_start = 0;
-	for (auto const& result : results) {
+	for (auto const& tok : tokens) {
-		word_start = result.first + shift;
+		word_start += tok.length;
 		if (tok.type != agi::ass::DialogueTokenType::WORD) continue;
 		if (word_start < start_pos) continue;
 		word_end = result.second + shift;
 		wxString word = active_line->Text.Mid(word_start, word_end - word_start);
-		if (auto_ignore.count(word) || spellchecker->CheckWord(from_wx(word))) continue;
+		word_len = tok.length;
 		std::string word = text.substr(word_start, word_len);
-		std::map<wxString, wxString>::const_iterator auto_rep = auto_replace.find(word);
+		if (auto_ignore.count(word) || spellchecker->CheckWord(word)) continue;
 		auto auto_rep = auto_replace.find(word);
 		if (auto_rep == auto_replace.end()) {
 #ifdef __WXGTK__
 			// http://trac.wxwidgets.org/ticket/14369
@ -274,9 +278,10 @@ bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int
 			return true;
 		}
-		active_line->Text = active_line->Text.Left(word_start) + auto_rep->second + active_line->Text.Mid(word_end);
+		text.replace(word_start, word_len, auto_rep->second);
 		active_line->Text = from_wx(text);
 		*commit_id = context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT, *commit_id);
-		shift += auto_rep->second.size() - auto_rep->first.size();
+		word_start += auto_rep->second.size() - auto_rep->first.size();
 	}
 	return false;
 }
@ -285,23 +290,23 @@ void DialogSpellChecker::Replace() {
 	AssDialogue *active_line = context->selectionController->GetActiveLine();
 	// Only replace if the user hasn't changed the selection to something else
-	if (active_line->Text.Mid(word_start, word_end - word_start) == orig_word->GetValue()) {
+	if (active_line->Text.Mid(word_start, word_len) == orig_word->GetValue()) {
-		active_line->Text = active_line->Text.Left(word_start) + replace_word->GetValue() + active_line->Text.Mid(word_end);
+		active_line->Text.replace(word_start, word_len, replace_word->GetValue());
 		context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT);
 		context->textSelectionController->SetInsertionPoint(word_start + replace_word->GetValue().size());
 	}
 }
-void DialogSpellChecker::SetWord(wxString const& word) {
+void DialogSpellChecker::SetWord(std::string const& word) {
-	orig_word->SetValue(word);
+	orig_word->SetValue(to_wx(word));
-	wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(from_wx(word)));
+	wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(word));
-	replace_word->SetValue(suggestions.size() ? suggestions[0] : word);
+	replace_word->SetValue(suggestions.size() ? suggestions[0] : to_wx(word));
 	suggest_list->Clear();
 	suggest_list->Append(suggestions);
-	context->textSelectionController->SetSelection(word_start, word_end);
+	context->textSelectionController->SetSelection(word_start, word_start + word_len);
-	context->textSelectionController->SetInsertionPoint(word_end);
+	context->textSelectionController->SetInsertionPoint(word_start + word_len);
-	add_button->Enable(spellchecker->CanAddWord(from_wx(word)));
+	add_button->Enable(spellchecker->CanAddWord(word));
 }
--- a/aegisub/src/dialog_spellchecker.h
+++ b/aegisub/src/dialog_spellchecker.h
@ -48,16 +48,16 @@ class DialogSpellChecker : public wxDialog {
 	agi::scoped_ptr<agi::SpellChecker> spellchecker; ///< The spellchecking engine
 	/// Words which the user has indicated should always be corrected
-	std::map<wxString,wxString> auto_replace;
+	std::map<std::string, std::string> auto_replace;
 	/// Words which the user has temporarily added to the dictionary
-	std::set<wxString> auto_ignore;
+	std::set<std::string> auto_ignore;
 	/// Dictionaries available
 	wxArrayString dictionary_lang_codes;
 	int word_start; ///< Start index of the current misspelled word
-	int word_end;   ///< End index of the current misspelled word
+	int word_len;   ///< Length of the current misspelled word
 	wxTextCtrl *orig_word;    ///< The word being corrected
 	wxTextCtrl *replace_word; ///< The replacement that will be used if "Replace" is clicked
@ -83,7 +83,7 @@ class DialogSpellChecker : public wxDialog {
 	bool CheckLine(AssDialogue *active_line, int start_pos, int *commit_id);
 	/// Set the current word to be corrected
-	void SetWord(wxString const& word);
+	void SetWord(std::string const& word);
 	/// Correct the currently selected word
 	void Replace();
--- a/aegisub/src/scintilla_text_ctrl.cpp
+++ b/aegisub/src/scintilla_text_ctrl.cpp
@ -82,32 +82,6 @@ void ScintillaTextCtrl::SetUnicodeStyling(int start,int length,int style) {
 	SetStyling(len,style);
 }
 /// @brief Get boundaries of word at position
 void ScintillaTextCtrl::GetBoundsOfWordAtPosition(int pos,int &start,int &end) {
 	IntPairVector results;
 	GetWordBoundaries(GetText(), results);
 	// Get boundaries
 	for (auto const& result : results) {
 		if (result.first <= pos && result.second >= pos) {
 			start = result.first;
 			end = result.second;
 			return;
 		}
 	}
 	// Word not found
 	start = 0;
 	end = 0;
 }
 /// @brief Get the text of the word at the specified position
 /// @param pos Position to look up
 /// @return The control's text between the bounds reported by GetBoundsOfWordAtPosition
 wxString ScintillaTextCtrl::GetWordAtPosition(int pos) {
 	int word_begin = 0;
 	int word_finish = 0;
 	GetBoundsOfWordAtPosition(pos, word_begin, word_finish);
 	const int word_length = word_finish - word_begin;
 	return GetText().Mid(word_begin, word_length);
 }
 /// @brief Set selection, unicode-aware
 void ScintillaTextCtrl::SetSelectionU(int start, int end) {
 	SetSelection(GetUnicodePosition(start),GetUnicodePosition(end));
--- a/aegisub/src/scintilla_text_ctrl.h
+++ b/aegisub/src/scintilla_text_ctrl.h
@ -34,6 +34,8 @@
 #ifndef AGI_PRE
 #include <wx/stc/stc.h>
 #include <string>
 #endif
 /// DOCME
@ -46,8 +48,6 @@ class ScintillaTextCtrl : public wxStyledTextCtrl {
 	void OnMouseWheel(wxMouseEvent& evt);
 public:
 	wxString GetWordAtPosition(int pos);
 	void GetBoundsOfWordAtPosition(int pos,int &start,int &end);
 	int GetUnicodePosition(int pos);
 	int GetReverseUnicodePosition(int pos);
--- a/aegisub/src/subs_edit_ctrl.cpp
+++ b/aegisub/src/subs_edit_ctrl.cpp
@ -213,6 +213,7 @@ void SubsTextEditCtrl::UpdateStyle() {
 		line_text = move(text);
 	}
 	tokenized_line = agi::ass::TokenizeDialogueBody(line_text);
 	agi::ass::SplitWords(line_text, tokenized_line);
 	cursor_pos = -1;
 	UpdateCallTip();
@ -298,15 +299,13 @@ void SubsTextEditCtrl::Paste() {
 void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
 	wxPoint pos = event.GetPosition();
 	int activePos;
-	if (pos == wxDefaultPosition) {
+	if (pos == wxDefaultPosition)
 		activePos = GetCurrentPos();
-	}
+	else
 	else {
 		activePos = PositionFromPoint(ScreenToClient(pos));
 	}
-	currentWordPos = GetReverseUnicodePosition(activePos);
+	currentWordPos = GetBoundsOfWordAtPosition(activePos);
-	currentWord = from_wx(GetWordAtPosition(currentWordPos));
+	currentWord = line_text.substr(currentWordPos.first, currentWordPos.second);
 	wxMenu menu;
 	if (!currentWord.empty()) {
@ -431,27 +430,22 @@ void SubsTextEditCtrl::OnAddToDictionary(wxCommandEvent &) {
 void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) {
 	std::string suggestion;
 	int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS;
-	if (sugIdx >= 0) {
+	if (sugIdx >= 0)
-		suggestion = lagi_wxString(thesSugs[sugIdx]);
+		suggestion = from_wx(thesSugs[sugIdx]);
-	}
+	else
 	else {
 		suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS];
 	}
 	// Strip suggestion of parenthesis
 	size_t pos = suggestion.find("(");
 	if (pos != suggestion.npos)
 		suggestion.resize(pos - 1);
-	// Get boundaries of text being replaced
+	// line_text needs to get cleared before SetTextRaw to ensure it gets reparsed
-	int start, end;
+	std::string new_text;
-	GetBoundsOfWordAtPosition(currentWordPos, start, end);
+	swap(line_text, new_text);
 	SetTextRaw(new_text.replace(currentWordPos.first, currentWordPos.second, suggestion).c_str());
-	wxString text = GetText();
+	SetSelection(currentWordPos.first, currentWordPos.first + suggestion.size());
 	SetText(text.Left(std::max(0, start)) + to_wx(suggestion) + text.Mid(end));
 	// Set selection
 	SetSelectionU(start, start+suggestion.size());
 	SetFocus();
 }
@ -480,3 +474,17 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) {
 	UpdateStyle();
 }
 /// Find the word token of the tokenized line which contains a position
 /// @param pos Offset into the line, in the same units as the token lengths
 /// @return (start offset, length) of the WORD token containing pos, or
 ///         (0, 0) if pos is inside a non-word token or past the last token
 std::pair<int, int> SubsTextEditCtrl::GetBoundsOfWordAtPosition(int pos) {
 	int tok_start = 0;
 	for (auto const& tok : tokenized_line) {
 		const int tok_len = (int)tok.length;
 		// pos is an absolute offset, so it has to be compared against the
 		// offset at which this token ends, not against the token's own
 		// length; otherwise every position past the first few bytes is
 		// attributed to the wrong token
 		if (tok_start + tok_len > pos) {
 			if (tok.type == agi::ass::DialogueTokenType::WORD)
 				return std::make_pair(tok_start, tok_len);
 			return std::make_pair(0, 0);
 		}
 		tok_start += tok_len;
 	}
 	return std::make_pair(0, 0);
 }
--- a/aegisub/src/subs_edit_ctrl.h
+++ b/aegisub/src/subs_edit_ctrl.h
@ -68,7 +68,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl {
 	std::string currentWord;
 	/// The beginning of the word right-clicked on, for spellchecker replacing
-	int currentWordPos;
+	std::pair<int, int> currentWordPos;
 	/// Spellchecker suggestions for the last right-clicked word
 	std::vector<std::string> sugs;
@ -129,5 +129,7 @@ public:
 	void SetTextTo(wxString const& text);
 	void Paste();
 	std::pair<int, int> GetBoundsOfWordAtPosition(int pos);
 	DECLARE_EVENT_TABLE()
 };
--- a/aegisub/src/utils.cpp
+++ b/aegisub/src/utils.cpp
@ -118,75 +118,6 @@ int SmallestPowerOf2(int x) {
 	return x;
 }
 /// Collect the index ranges of the words in the body of an ASS dialogue line
 /// @param text Text to scan
 /// @param[out] results Receives one pair per word: the index of its first
 ///                     character and the index just past its last character
 /// @param start First index in text to check; it is reused below as the
 ///              start of the word currently being scanned
 /// @param end Index to stop at, or a negative value to scan to the end of text
 ///
 /// Characters inside {...} blocks are never reported as words, and nothing
 /// is reported while draw mode (toggled by \p<number> inside a block) is on.
 void GetWordBoundaries(wxString const& text, IntPairVector &results, int start, int end) {
 	int depth = 0;
 	bool in_draw_mode = false;
 	if (end < 0) end = text.size();
 	// Delimiters. This table is searched with std::binary_search below, so it
 	// has to stay in ascending order.
 	const wxUniChar delims[] = {
 		0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
 		0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
 		0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
 		0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2,
 		0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab,
 		0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f,
 		0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d,
 		0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031,
 		0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9,
 		0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422,
 		0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001,
 		0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e,
 		0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018,
 		0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a,
 		0xff5b, 0xff5d, 0xff5e
 	};
 	// The loop runs one step past end; that extra step sees a synthetic '.'
 	// (which is in the delimiter table) so a word running up to end is closed
 	// by the delimiter branch.
 	for (int i = start; i < end + 1; ++i) {
 		// Current character
 		wxUniChar cur = i < end ? text[i] : wxUniChar('.');
 		// Increase depth
 		if (cur == '{') {
 			depth++;
 			// Entering the outermost block ends any word in progress
 			if (depth == 1 && start != i && !in_draw_mode)
 				results.push_back(std::make_pair(start, i));
 		}
 		// Decrease depth
 		else if (cur == '}') {
 			// depth is not clamped, so a stray '}' makes it negative
 			depth--;
 			start = i + 1;
 		}
 		else if (depth > 0) {
 			// Check for draw mode
 			if (cur == '\\' && i + 1 < end && text[i + 1] == 'p') {
 				i += 2;
 				// Eat leading zeros
 				while (i < end && text[i] == '0') ++i;
 				// Draw mode is on only if a digit remains after the zeros
 				in_draw_mode = i < end && text[i] >= '0' && text[i] <= '9';
 				if (!in_draw_mode) --i;
 			}
 		}
 		else if (!in_draw_mode) {
 			// Check if it is \n, \N or \h; these two-character sequences end
 			// the current word and are skipped as a unit
 			if (cur == '\\' && i < end-1 && (text[i+1] == 'N' || text[i+1] == 'n' || text[i+1] == 'h')) {
 				if (start != i)
 					results.push_back(std::make_pair(start, i));
 				start = i + 2;
 				i++;
 			}
 			// Check for standard delimiters
 			else if (std::binary_search(delims, delims + sizeof(delims) / sizeof(delims[0]), cur)) {
 				if (start != i)
 					results.push_back(std::make_pair(start, i));
 				start = i + 1;
 			}
 		}
 	}
 }
 bool IsWhitespace(wchar_t c)
 {
 	const wchar_t whitespaces[] = {
--- a/aegisub/src/utils.h
+++ b/aegisub/src/utils.h
@ -49,8 +49,6 @@
 class wxMouseEvent;
 class wxWindow;
 typedef std::vector<std::pair<int,int> > IntPairVector;
 /// @brief Make a path relative to reference
 wxString MakeRelativePath(wxString path,wxString reference);
 /// @brief Extract original path from relative
@ -64,16 +62,6 @@ wxString PrettySize(int bytes);
 /// Algorithm from http://bob.allegronetwork.com/prog/tricks.html
 int SmallestPowerOf2(int x);
 /// Get the indices in text which are the beginnings of words
 /// @param text Text to split into words
 /// @param[out] results Vector of indices which are the beginnings of words
 /// @param start First index in text to check
 /// @param end Last index in text to check, or -1 for end
 ///
 /// This is ASS-specific and not a general purpose word boundary finder; words
 /// within override blocks or drawing blocks are ignored
 void GetWordBoundaries(wxString const& text, IntPairVector &results, int start=0, int end=-1);
 /// Check if wchar 'c' is a whitespace character
 bool IsWhitespace(wchar_t c);
--- a/aegisub/tests/Makefile
+++ b/aegisub/tests/Makefile
@ -25,14 +25,15 @@ SRC = \
 		libaegisub_iconv.cpp \
 		libaegisub_keyframe.cpp \
 		libaegisub_line_iterator.cpp \
 		libaegisub_line_wrap.cpp \
 		libaegisub_option.cpp \
 		libaegisub_mru.cpp \
 		libaegisub_signals.cpp \
 		libaegisub_thesaurus.cpp \
 		libaegisub_util.cpp \
 		libaegisub_vfr.cpp \
-		libaegisub_line_wrap.cpp
+		libaegisub_word_split.cpp
-		
+
 HEADER = \
 	*.h
--- a/aegisub/tests/libaegisub_syntax_highlight.cpp
+++ b/aegisub/tests/libaegisub_syntax_highlight.cpp
@ -0,0 +1,151 @@
 // Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
 //
 // Permission to use, copy, modify, and distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
 //
 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include <libaegisub/ass/dialogue_parser.h>
 #include <libaegisub/spellchecker.h>
 #include "main.h"
 class MockSpellChecker : public agi::SpellChecker {
 	void AddWord(std::string const&) { }
 	bool CanAddWord(std::string const&) { return false; }
 	std::vector<std::string> GetSuggestions(std::string const&) { return std::vector<std::string>(); }
 	std::vector<std::string> GetLanguageList() { return std::vector<std::string>(); }
 	bool CheckWord(std::string const& word) { return word != "incorrect"; }
 };
 using namespace agi::ass;
 namespace dt = DialogueTokenType;
 namespace ss = SyntaxStyle;
 class lagi_syntax : public libagi { };
 // Highlighting without a spell checker: no tokens produce no style ranges,
 // and a single zero-length TEXT token produces one NORMAL range.
 TEST(lagi_syntax, empty) {
 	std::string text;
 	std::vector<DialogueToken> tokens;
 	EXPECT_TRUE(SyntaxHighlight(text, tokens, false, 0).empty());
 	tokens.emplace_back(dt::TEXT, 0);
 	auto syntax = SyntaxHighlight(text, tokens, false, 0);
 	EXPECT_EQ(1u, syntax.size());
 	EXPECT_EQ(ss::NORMAL, syntax[0].type);
 }
 // Test helper: tokenizes arg1 with TokenizeDialogueBody, runs SplitWords on
 // the result and highlights it using a MockSpellChecker. The remaining
 // arguments are statements (normally expect_style calls) which run with
 // `styles` and `token_index` in scope; afterwards the macro checks that
 // token_index reached the end of `styles`, i.e. no style range was left
 // unchecked.
 #define tok_str(arg1, ...) do { \
 	MockSpellChecker spellchecker; \
 	std::string str = arg1; \
 	std::vector<DialogueToken> tok = TokenizeDialogueBody(str); \
 	SplitWords(str, tok); \
 	std::vector<DialogueToken> styles = SyntaxHighlight(str, tok, false, &spellchecker); \
 	size_t token_index = 0; \
 	__VA_ARGS__ \
 	EXPECT_EQ(token_index, styles.size()); \
 } while(false)
 // Test helper for use inside tok_str: checks that the style range at
 // token_index has the expected type and length, then advances token_index.
 // If there is no range left, only the EXPECT_LT failure is reported and
 // token_index is not advanced.
 #define expect_style(expected_type, expected_len) do { \
 	EXPECT_LT(token_index, styles.size()); \
 	if (token_index < styles.size()) { \
 		EXPECT_EQ(expected_type, styles[token_index].type); \
 		EXPECT_EQ(expected_len, styles[token_index].length); \
 		++token_index; \
 	} \
 } while(false)
 // Only the word rejected by the mock checker gets the SPELLING style. The
 // neighbouring spaces merge into the adjacent NORMAL ranges, which is why
 // those are 8 long rather than 7.
 TEST(lagi_syntax, spellcheck) {
 	tok_str("correct incorrect correct",
 		expect_style(ss::NORMAL, 8u);
 		expect_style(ss::SPELLING, 9u);
 		expect_style(ss::NORMAL, 8u);
 	);
 }
 // Text between {\p1} and {\p} is styled as one DRAWING range and is not
 // spell checked; text before and after it is checked as usual.
 TEST(lagi_syntax, drawing) {
 	tok_str("incorrect{\\p1}m 10 10{\\p}correct",
 		expect_style(ss::SPELLING, 9u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::TAG, 1u);
 		expect_style(ss::PARAMETER, 1u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::DRAWING, 7u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::TAG, 1u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::NORMAL, 7u);
 	);
 }
 // A \t tag with a nested \clip tag. Note that the input closes only one of
 // its two parentheses before the block ends; the expected ranges below walk
 // the block character group by character group, followed by the plain text.
 TEST(lagi_syntax, transform) {
 	tok_str("{\\t(0, 0, \\clip(0,0,10,10)}clipped text",
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::TAG, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::PARAMETER, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::PARAMETER, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::TAG, 4u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::PARAMETER, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::PARAMETER, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::PARAMETER, 2u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::PARAMETER, 2u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::NORMAL, 12u);
 	);
 }
 // The second override block is never closed, so its contents are expected
 // to be handled as plain text: "{\" is NORMAL and the word after it is
 // spell checked, whereas the same word in the closed block is a TAG.
 TEST(lagi_syntax, unclosed) {
 	tok_str("{\\incorrect}{\\incorrect",
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::PUNCTUATION, 1u);
 		expect_style(ss::TAG, 9u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::NORMAL, 2u);
 		expect_style(ss::SPELLING, 9u);
 	);
 }
 // An override block which contains no tag is styled as a COMMENT.
 TEST(lagi_syntax, comment) {
 	tok_str("abc{def}ghi",
 		expect_style(ss::NORMAL, 3u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::COMMENT, 3u);
 		expect_style(ss::OVERRIDE, 1u);
 		expect_style(ss::NORMAL, 3u);
 	);
 }
 // \N, \n and \h are each styled as a two-character LINE_BREAK; the two
 // consecutive \N at the end merge into a single range of length 4.
 TEST(lagi_syntax, linebreak) {
 	tok_str("a\\Nb\\nc\\hd\\N\\N",
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::LINE_BREAK, 2u);
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::LINE_BREAK, 2u);
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::LINE_BREAK, 2u);
 		expect_style(ss::NORMAL, 1u);
 		expect_style(ss::LINE_BREAK, 4u);
 	);
 }
--- a/aegisub/tests/libaegisub_word_split.cpp
+++ b/aegisub/tests/libaegisub_word_split.cpp
@ -0,0 +1,135 @@
 // Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
 //
 // Permission to use, copy, modify, and distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
 //
 // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "main.h"
 #include <libaegisub/ass/dialogue_parser.h>
 class lagi_word_split : public libagi { };
 using namespace agi::ass;
 namespace dt = DialogueTokenType;
 // Degenerate inputs: an empty token list stays empty, and a single
 // zero-length token (with type 0) is neither removed nor split.
 TEST(lagi_word_split, empty) {
 	std::string text;
 	std::vector<DialogueToken> tokens;
 	SplitWords(text, tokens);
 	EXPECT_TRUE(tokens.empty());
 	tokens.emplace_back(0, 0);
 	SplitWords(text, tokens);
 	EXPECT_EQ(1u, tokens.size());
 }
 // A TEXT token consisting of a single word is retyped to WORD without
 // adding any tokens.
 TEST(lagi_word_split, one_word) {
 	std::string text = "abc";
 	std::vector<DialogueToken> tokens = {{dt::TEXT, 3}};
 	SplitWords(text, tokens);
 	ASSERT_EQ(1u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 }
 // One TEXT token holding two words separated by a space is split into
 // WORD, TEXT (the space) and WORD.
 TEST(lagi_word_split, two_words_space) {
 	std::string text = "abc def";
 	std::vector<DialogueToken> tokens = {{dt::TEXT, 7}};
 	SplitWords(text, tokens);
 	ASSERT_EQ(3u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(3u, tokens[0].length);
 	EXPECT_EQ(dt::TEXT, tokens[1].type);
 	EXPECT_EQ(1u, tokens[1].length);
 	EXPECT_EQ(dt::WORD, tokens[2].type);
 	EXPECT_EQ(3u, tokens[2].length);
 }
 // Words on either side of an already-lexed LINE_BREAK token become WORD
 // tokens while the LINE_BREAK token itself is left as it was.
 TEST(lagi_word_split, two_words_newline) {
 	std::string text = "abc\\Ndef";
 	std::vector<DialogueToken> tokens = {
 		{dt::TEXT, 3},
 		{dt::LINE_BREAK, 2},
 		{dt::TEXT, 3}
 	};
 	SplitWords(text, tokens);
 	ASSERT_EQ(3u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(3u, tokens[0].length);
 	EXPECT_EQ(dt::LINE_BREAK, tokens[1].type);
 	EXPECT_EQ(2u, tokens[1].length);
 	EXPECT_EQ(dt::WORD, tokens[2].type);
 	EXPECT_EQ(3u, tokens[2].length);
 }
 // A non-ASCII separator (U+300A) between two words. The text is UTF-8 and
 // token lengths are byte counts, so the separator is expected to come out
 // as a TEXT token of length 3 and the whole input is 9 long.
 TEST(lagi_word_split, two_words_unicode) {
 	std::string text = u8"abc\u300adef";
 	std::vector<DialogueToken> tokens = {{dt::TEXT, 9}};
 	SplitWords(text, tokens);
 	ASSERT_EQ(3u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(3u, tokens[0].length);
 	EXPECT_EQ(dt::TEXT, tokens[1].type);
 	EXPECT_EQ(3u, tokens[1].length);
 	EXPECT_EQ(dt::WORD, tokens[2].type);
 	EXPECT_EQ(3u, tokens[2].length);
 }
 // The TEXT token following {\p1} is expected to become a single DRAWING
 // token rather than being split into words, and word splitting resumes
 // after {\p0}. Splitting "a b" adds two tokens, hence 15 tokens in total
 // with the drawing body at index 8.
 TEST(lagi_word_split, drawing) {
 	std::string text = "a b{\\p1}m 10{\\p0}c";
 	std::vector<DialogueToken> tokens = {
 		{dt::TEXT, 3},
 		{dt::OVR_BEGIN, 1},
 		{dt::TAG_START, 1},
 		{dt::TAG_NAME, 1},
 		{dt::ARG, 1},
 		{dt::OVR_END, 1},
 		{dt::TEXT, 4},
 		{dt::OVR_BEGIN, 1},
 		{dt::TAG_START, 1},
 		{dt::TAG_NAME, 1},
 		{dt::ARG, 1},
 		{dt::OVR_END, 1},
 		{dt::TEXT, 1}
 	};
 	SplitWords(text, tokens);
 	ASSERT_EQ(15u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(dt::WORD, tokens[2].type);
 	EXPECT_EQ(dt::WORD, tokens[14].type);
 	EXPECT_EQ(dt::DRAWING, tokens[8].type);
 }
 // The override block is never closed, so its tokens are expected to be
 // re-split as plain text: "{" and "\" become TEXT and the tag name "b"
 // becomes a WORD. The token count does not change.
 TEST(lagi_word_split, unclosed_ovr) {
 	std::string text = "a{\\b";
 	std::vector<DialogueToken> tokens = {
 		{dt::TEXT, 1},
 		{dt::OVR_BEGIN, 1},
 		{dt::TAG_START, 1},
 		{dt::TAG_NAME, 1}
 	};
 	SplitWords(text, tokens);
 	ASSERT_EQ(4u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(dt::TEXT, tokens[1].type);
 	EXPECT_EQ(dt::TEXT, tokens[2].type);
 	EXPECT_EQ(dt::WORD, tokens[3].type);
 }