Add a lexer for the body of dialogue lines to libaegisub

2012-10-26 19:03:56 -07:00 · 2012-10-26 19:03:56 -07:00 · 47bafe4b9f
commit 47bafe4b9f
parent bd78692148
5 changed files with 393 additions and 1 deletions
--- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
+++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
@ -542,6 +542,14 @@
 				>
 			</File>
 		</Filter>
+		<Filter
+			Name="ASS"
+			>
+			<File
+				RelativePath="..\..\libaegisub\include\libaegisub\ass\dialogue_parser.h"
+				>
+			</File>
+		</Filter>
 		<File
 			RelativePath="..\..\libaegisub\lagi_pre.h"
 			>
--- a/aegisub/libaegisub/common/parser.cpp
+++ b/aegisub/libaegisub/common/parser.cpp
@ -17,12 +17,15 @@
 #include "parser.h"

 #include "libaegisub/color.h"
+#include "libaegisub/ass/dialogue_parser.h"

 #include <boost/spirit/include/qi.hpp>
 #include <boost/spirit/include/phoenix_core.hpp>
 #include <boost/spirit/include/phoenix_operator.hpp>
 #include <boost/spirit/include/phoenix_fusion.hpp>
 #include <boost/fusion/include/adapt_struct.hpp>
+#include <boost/spirit/include/lex_lexertl.hpp>
+#include <boost/spirit/home/phoenix/statement.hpp>

 BOOST_FUSION_ADAPT_STRUCT(
 	agi::Color,
@ -96,13 +99,95 @@ struct color_grammar : qi::grammar<Iterator, agi::Color()> {
 	}
 };

+/// Lexer rules for the body of a dialogue line.
+///
+/// The rules are split over five lexer states: the default one (this->self,
+/// which the actions below re-enter as "INITIAL") plus "OVR", "ARG",
+/// "TAGSTART" and "TAGNAME". Semantic actions attached to individual rules
+/// switch state by assigning to lex::_state. All token ids come from
+/// agi::ass::DialogueTokenType.
+///
+/// NOTE(review): every state lists its catch-all "." rule last; presumably
+/// rule order decides between equal-length matches -- confirm against the
+/// Spirit.Lex documentation before reordering anything.
+template <typename Lexer>
+struct dialogue_tokens : lex::lexer<Lexer> {
+	/// OPEN_PAREN tokens minus CLOSE_PAREN tokens seen since the last
+	/// OVR_BEGIN. Nothing here stops an unmatched ')' from driving it negative.
+	int paren_depth;
+
+	dialogue_tokens() : paren_depth(0) {
+		using lex::_state;
+		using lex::char_;
+		using lex::string;
+		using namespace boost::phoenix;
+		using namespace agi::ass::DialogueTokenType;
+
+		// Default state: text outside of any override block.
+		// A backslash followed by n, N or h is a LINE_BREAK, '{' opens an
+		// override block (resetting paren_depth) and any other character is TEXT.
+		this->self
+			= string("\\\\[nNh]", LINE_BREAK)
+			| char_('{', OVR_BEGIN)[ref(paren_depth) = 0, _state = "OVR"]
+			| string(".", TEXT)
+			;
+
+		// Inside an override block but not inside a tag: a second '{' is an
+		// ERROR, '}' returns to the default state, a backslash starts a tag and
+		// anything that is not whitespace is COMMENT.
+		this->self("OVR")
+			= char_('{', ERROR)
+			| char_('}', OVR_END)[_state = "INITIAL"]
+			| char_('\\', TAG_START)[_state = "TAGSTART"]
+			| string("\\s+", WHITESPACE)
+			| string(".", COMMENT)
+			;
+
+		// Arguments of a tag. Parentheses are counted so that the ')' which
+		// brings paren_depth back to zero drops back to "OVR"; a backslash
+		// starts the next (possibly nested) tag without touching paren_depth.
+		this->self("ARG")
+			= char_('{', ERROR)
+			| char_('}', OVR_END)[_state = "INITIAL"]
+			| char_('(', OPEN_PAREN)[++ref(paren_depth)]
+			| char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]]
+			| char_('\\', TAG_START)[_state = "TAGSTART"]
+			| char_(',', ARG_SEP)
+			| string("\\s+", WHITESPACE)
+			| string(".", ARG)
+			;
+
+		// Immediately after the backslash of a tag. "r" and "fn" are complete
+		// tag names that go straight to "ARG", skipping "TAGNAME", so whatever
+		// follows them is lexed as argument text. Any other [a-z0-9] character
+		// is the first character of a name that is continued in "TAGNAME".
+		// A character that cannot start a name is COMMENT and returns to "OVR".
+		this->self("TAGSTART")
+			= string("\\s+", WHITESPACE)
+			| string("r|fn", TAG_NAME)[_state = "ARG"]
+			| char_('\\', TAG_START)
+			| char_('}', OVR_END)[_state = "INITIAL"]
+			| string("[a-z0-9]", TAG_NAME)[_state = "TAGNAME"]
+			| string(".", COMMENT)[_state = "OVR"]
+			;
+
+		// After the first character of a tag name: further lowercase letters
+		// finish the name, and every other rule except ')' leaves this state.
+		// The first character that is not part of the name and is not one of
+		// the structural characters below is already an ARG.
+		this->self("TAGNAME")
+			= string("[a-z]+", TAG_NAME)[_state = "ARG"]
+			| char_('(', OPEN_PAREN)[++ref(paren_depth), _state = "ARG"]
+			| char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]]
+			| char_('}', OVR_END)[_state = "INITIAL"]
+			| char_('\\', TAG_START)[_state = "TAGSTART"]
+			| string(".", ARG)[_state = "ARG"]
+			;
+	}
+};
+
 }

-namespace agi { namespace parser {
+namespace agi {
+namespace parser {
+	/// Parse str into dst with color_grammar.
+	/// @return true only if the grammar matched and the match consumed all of str
 	bool parse(Color &dst, std::string const& str) {
 		std::string::const_iterator begin = str.begin();
 		bool parsed = parse(begin, str.end(), color_grammar<std::string::const_iterator>(), dst);
 		return parsed && begin == str.end();
 	}
 }
+
+namespace ass {
+	/// Split the body of a dialogue line into a flat list of typed tokens.
+	///
+	/// The string is run through dialogue_tokens. Consecutive lexer tokens
+	/// with the same id are folded into one DialogueToken whose length is the
+	/// sum of their lengths, counted in chars of str. Lexing stops at the end
+	/// of the input or at the first token that token_is_valid() rejects.
+	///
+	/// @param str Text to tokenize
+	/// @return The tokens in the order they appear in str
+	std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str) {
+		typedef dialogue_tokens<lex::lexertl::actor_lexer<> > tokenizer_type;
+		typedef tokenizer_type::iterator_type token_iterator;
+
+		tokenizer_type tokenizer;
+		std::vector<DialogueToken> tokens;
+
+		// begin() takes its first argument by reference, so it has to be a
+		// named, modifiable pointer
+		char const* cursor = str.c_str();
+		char const* str_end = cursor + str.size();
+
+		token_iterator cur = tokenizer.begin(cursor, str_end);
+		token_iterator stop = tokenizer.end();
+
+		while (cur != stop && token_is_valid(*cur)) {
+			int type = cur->id();
+			ptrdiff_t span = cur->value().end() - cur->value().begin();
+			assert(span > 0);
+
+			// Extend the previous token when the type repeats, otherwise
+			// start a new one
+			if (!tokens.empty() && tokens.back().type == type)
+				tokens.back().length += span;
+			else
+				tokens.push_back(DialogueToken(type, span));
+
+			++cur;
+		}
+
+		return tokens;
+	}
+}
 }
--- a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
+++ b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
@ -0,0 +1,47 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#ifndef LAGI_ASS_DIALOGUE_PARSER_H
+#define LAGI_ASS_DIALOGUE_PARSER_H
+
+#ifndef LAGI_PRE
+#include <string>
+#include <vector>
+#endif
+
+namespace agi {
+	namespace ass {
+		/// Types of the tokens returned by TokenizeDialogueBody
+		namespace DialogueTokenType {
+			// NOTE(review): ERROR is also a macro in the Windows GDI headers;
+			// if those are included before this file the enumerator below
+			// will not compile -- confirm include order on the Windows build.
+			enum {
+				/// Text outside of override blocks
+				TEXT = 1000,
+				/// A backslash followed by n, N or h, outside of override blocks
+				LINE_BREAK,
+				/// The '{' which opens an override block
+				OVR_BEGIN,
+				/// The '}' which closes an override block
+				OVR_END,
+				/// The backslash which introduces a tag
+				TAG_START,
+				/// The name of a tag
+				TAG_NAME,
+				/// '(' in a tag
+				OPEN_PAREN,
+				/// ')' in a tag
+				CLOSE_PAREN,
+				/// ',' between the arguments of a tag
+				ARG_SEP,
+				/// Argument text of a tag
+				ARG,
+				/// A '{' found inside an override block
+				ERROR,
+				/// Text inside an override block which is not part of a tag
+				COMMENT,
+				/// Whitespace inside an override block
+				WHITESPACE
+			};
+		}
+
+		/// A run of the dialogue body which has a single token type
+		struct DialogueToken {
+			/// One of the DialogueTokenType values
+			int type;
+			/// Length of the run, in chars of the tokenized string
+			size_t length;
+			DialogueToken(int type, size_t length) : type(type), length(length) { }
+		};
+
+		/// Tokenize the body of a dialogue line
+		/// @param str Text to tokenize
+		/// @return The tokens in the order they appear in str, with adjacent
+		///         tokens of the same type merged into one
+		std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str);
+	}
+}
+
+#endif
--- a/aegisub/tests/Makefile
+++ b/aegisub/tests/Makefile
@ -20,6 +20,7 @@ SRC = \
 		libaegisub_access.cpp \
 		libaegisub_cajun.cpp \
 		libaegisub_color.cpp \
+		libaegisub_dialogue_lexer.cpp \
 		libaegisub_hotkey.cpp \
 		libaegisub_iconv.cpp \
 		libaegisub_keyframe.cpp \
--- a/aegisub/tests/libaegisub_dialogue_lexer.cpp
+++ b/aegisub/tests/libaegisub_dialogue_lexer.cpp
@ -0,0 +1,251 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#include <libaegisub/ass/dialogue_parser.h>
+
+#include "main.h"
+#include "util.h"
+
+// Fixture type for the dialogue lexer tests.
+// NOTE(review): the tests in this file are declared with TEST rather than
+// TEST_F, so this class does not appear to be used by them -- confirm.
+class lagi_dialogue_lexer : public libagi {
+};
+
+using namespace agi::ass;
+
+// An empty body produces no tokens at all
+TEST(lagi_dialogue_lexer, empty) {
+	ASSERT_TRUE(TokenizeDialogueBody("").empty());
+}
+
+// Tokenize the string arg1 and run the statements passed as the remaining
+// arguments (normally a list of expect_tok calls) against the result.
+// The statements see two locals: `tok`, the vector of tokens, and
+// `token_index`, the index of the next token to check. Once they have run,
+// token_index has to equal tok.size(), i.e. every token which was produced
+// must have been checked.
+#define tok_str(arg1, ...) do { \
+	std::string str = arg1; \
+	std::vector<DialogueToken> tok = TokenizeDialogueBody(str); \
+	size_t token_index = 0; \
+	__VA_ARGS__ \
+	EXPECT_EQ(token_index, tok.size()); \
+} while(false)
+
+// Check that the next unchecked token has the type
+// DialogueTokenType::expected_type and the length expected_len, then move
+// token_index on to the following token. Only usable inside tok_str, which
+// supplies `tok` and `token_index`. When there are no tokens left the
+// EXPECT_LT fails and the other checks are skipped, so tok is never indexed
+// past its end.
+#define expect_tok(expected_type, expected_len) do { \
+	EXPECT_LT(token_index, tok.size()); \
+	if (token_index < tok.size()) { \
+		EXPECT_EQ(DialogueTokenType::expected_type, tok[token_index].type); \
+		EXPECT_EQ(expected_len, tok[token_index].length); \
+		++token_index; \
+	} \
+} while(false)
+
+// Text outside of override blocks, with and without backslash escapes
+TEST(lagi_dialogue_lexer, plain_text) {
+	// A run of ordinary characters comes back as a single TEXT token
+	tok_str("hello there",
+		expect_tok(TEXT, 11);
+	);
+
+	// A line break escape splits the text into TEXT, LINE_BREAK, TEXT
+	tok_str("hello\\Nthere",
+		expect_tok(TEXT, 5);
+		expect_tok(LINE_BREAK, 2);
+		expect_tok(TEXT, 5);
+	);
+
+	// Two adjacent escapes are merged into one LINE_BREAK of length 4.
+	// Backslash-k is not an escape, so it is part of the TEXT which follows.
+	tok_str("hello\\n\\h\\kthere",
+		expect_tok(TEXT, 5);
+		expect_tok(LINE_BREAK, 4);
+		expect_tok(TEXT, 7);
+	);
+}
+
+// Well-formed override blocks containing one or more tags
+TEST(lagi_dialogue_lexer, basic_override_tags) {
+	// Single-letter tag with a single-character argument, before and after
+	// a run of text
+	tok_str("{\\b1}bold text{\\b0}",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(ARG, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 9);
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(ARG, 1);
+		expect_tok(OVR_END, 1);
+	);
+
+	// The argument of fn starts right after the name and contains spaces:
+	// each word is an ARG and each space a WHITESPACE
+	tok_str("{\\fnComic Sans MS}text",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 2);
+		expect_tok(ARG, 5);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 4);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 4);
+	);
+
+	// Parenthesized, comma-separated arguments
+	tok_str("{\\pos(0,0)}a",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 3);
+		expect_tok(OPEN_PAREN, 1);
+		expect_tok(ARG, 1);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(ARG, 1);
+		expect_tok(CLOSE_PAREN, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+
+	// The same tag with whitespace around each argument
+	tok_str("{\\pos( 0 , 0 )}a",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 3);
+		expect_tok(OPEN_PAREN, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(CLOSE_PAREN, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+
+	// Several tags in one block, including names which start with a digit
+	tok_str("{\\c&HFFFFFF&\\2c&H0000FF&\\3c&H000000&}a",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(ARG, 9);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 2);
+		expect_tok(ARG, 9);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 2);
+		expect_tok(ARG, 9);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+
+	// A tag nested in the argument list of another tag. The two closing
+	// parentheses are adjacent, so they are merged into one CLOSE_PAREN of
+	// length 2.
+	tok_str("{\\t(0,100,\\clip(1, m 0 0 l 10 10 10 20))}a",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(OPEN_PAREN, 1);
+		expect_tok(ARG, 1);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(ARG, 3);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 4);
+		expect_tok(OPEN_PAREN, 1);
+		expect_tok(ARG, 1);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(CLOSE_PAREN, 2);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+}
+
+// Tokens are only merged with an immediately preceding token of the same
+// type: the two TAG_START and the two TAG_NAME tokens of this unterminated
+// block alternate, so all four are reported separately
+TEST(lagi_dialogue_lexer, merging) {
+	tok_str("{\\b\\b",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+	);
+}
+
+// Whitespace between every element of an override block is reported as
+// WHITESPACE tokens and the tag around it is still recognized
+TEST(lagi_dialogue_lexer, whitespace) {
+	tok_str("{ \\ fn Comic Sans MS }asd",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(TAG_NAME, 2);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 5);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 4);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(ARG, 2);
+		expect_tok(WHITESPACE, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 3);
+	);
+}
+
+// Text inside an override block which does not belong to a tag
+TEST(lagi_dialogue_lexer, comment) {
+	// A block with no tag in it at all
+	tok_str("{a}b",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(COMMENT, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+
+	// Comment text followed by a tag in the same block
+	tok_str("{a\\b}c",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(COMMENT, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+}
+
+// Input which is not well-formed still has to be tokenized in full
+TEST(lagi_dialogue_lexer, malformed) {
+	// A closing brace without an open block is plain text
+	tok_str("}",
+		expect_tok(TEXT, 1);
+	);
+
+	// An opening brace inside an override block is an ERROR
+	tok_str("{{",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(ERROR, 1);
+	);
+
+	// Missing closing parenthesis: the brace still ends the block
+	tok_str("{\\pos(0,0}a",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 3);
+		expect_tok(OPEN_PAREN, 1);
+		expect_tok(ARG, 1);
+		expect_tok(ARG_SEP, 1);
+		expect_tok(ARG, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 1);
+	);
+
+	// A backslash with no tag name after it, directly before the end of
+	// the block
+	tok_str("{\\b1\\}asdf",
+		expect_tok(OVR_BEGIN, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(TAG_NAME, 1);
+		expect_tok(ARG, 1);
+		expect_tok(TAG_START, 1);
+		expect_tok(OVR_END, 1);
+		expect_tok(TEXT, 4);
+	);
+}