Use boost.locale for spellchecker word splitting

This is currently only correct when the UI language is the same as the subtitle language, because the global locale is used for boundary analysis, but it should still never be worse than the small hardcoded table of word separators it replaces. Closes #1206.
2013-02-01 09:29:34 -08:00 · 2013-02-01 09:29:34 -08:00 · 10a88dfb52
commit 10a88dfb52
parent 5efba3fda1
2 changed files with 37 additions and 59 deletions
--- a/aegisub/libaegisub/ass/dialogue_parser.cpp
+++ b/aegisub/libaegisub/ass/dialogue_parser.cpp
@ -18,11 +18,12 @@

 #include "libaegisub/ass/dialogue_parser.h"

-#include "libaegisub/scoped_ptr.h"
 #include "libaegisub/spellchecker.h"

 #include "iconv.h"

+#include <boost/locale/boundary.hpp>
+
 namespace {

 typedef std::vector<agi::ass::DialogueToken> TokenVec;
@ -94,70 +95,28 @@ public:
 class WordSplitter {
 	std::string const& text;
 	std::vector<DialogueToken> &tokens;
-	agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
 	size_t pos;

-	bool IsWordSep(int chr) {
-		static const int delims[] = {
-			0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
-			0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
-			0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
-			0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2,
-			0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab,
-			0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f,
-			0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d,
-			0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031,
-			0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9,
-			0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422,
-			0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001,
-			0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e,
-			0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018,
-			0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a,
-			0xff5b, 0xff5d, 0xff5e
-		};
-
-		return std::binary_search(std::begin(delims), std::end(delims), chr);
-	}
-
-	int NextChar(int pos, int len, int& char_len) {
-		int chr = 0;
-		char *inptr = const_cast<char *>(&text[pos]);
-		size_t inlen = len;
-		char *outptr = (char *)&chr;
-		size_t outlen = sizeof chr;
-
-		iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
-		if (outlen != 0)
-			return 0;
-
-		char_len = len - inlen;
-		return chr;
-	}
-
 	void SwitchTo(size_t &i, int type, int len) {
-		if (tokens[i].type == type) return;
+		auto old = tokens[i];
+		tokens[i].type = type;
+		tokens[i].length = len;

-		if (tokens[i].length == (size_t)len)
-			tokens[i].type = type;
-		else {
-			tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len));
-			tokens[i].length -= len;
+		if (old.length != (size_t)len) {
+			tokens.insert(tokens.begin() + i + 1, DialogueToken(old.type, old.length - len));
 			++i;
 		}
 	}

 	void SplitText(size_t &i) {
-		int chrlen = 0;
-		int len = tokens[i].length;
-		int tpos = pos;
-		for (; len > 0; tpos += chrlen, len -= chrlen) {
-			int chr = NextChar(tpos, len, chrlen);
-			if (!chr) return;
-
-			if (IsWordSep(chr))
-				SwitchTo(i, dt::TEXT, len);
-			else
+		using namespace boost::locale::boundary;
+		ssegment_index map(word, text.begin() + pos, text.begin() + pos + tokens[i].length);
+		for (auto const& segment : map) {
+			int len = distance(begin(segment), end(segment));
+			if (segment.rule() & word_letters)
 				SwitchTo(i, dt::WORD, len);
+			else
+				SwitchTo(i, dt::TEXT, len);
 		}
 	}

@ -165,7 +124,6 @@ public:
 	WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
 	: text(text)
 	, tokens(tokens)
-	, utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close)
 	, pos(0)
 	{ }

@ -175,7 +133,7 @@ public:
 		for (size_t i = 0; i < tokens.size(); ++i) {
 			size_t len = tokens[i].length;
 			if (tokens[i].type == dt::TEXT)
-					SplitText(i);
+				SplitText(i);
 			pos += len;
 		}
 	}
--- a/aegisub/tests/tests/word_split.cpp
+++ b/aegisub/tests/tests/word_split.cpp
@ -126,10 +126,11 @@ TEST(lagi_word_split, unclosed_ovr) {
 	};

 	SplitWords(text, tokens);
-	ASSERT_EQ(3u, tokens.size());
+	ASSERT_EQ(4u, tokens.size());
 	EXPECT_EQ(dt::WORD, tokens[0].type);
 	EXPECT_EQ(dt::TEXT, tokens[1].type);
-	EXPECT_EQ(dt::WORD, tokens[2].type);
+	EXPECT_EQ(dt::TEXT, tokens[2].type);
+	EXPECT_EQ(dt::WORD, tokens[3].type);

 	text = "{";
 	tokens.clear();
@ -140,3 +141,22 @@ TEST(lagi_word_split, unclosed_ovr) {
 	EXPECT_EQ(dt::TEXT, tokens[0].type);
 }

+TEST(lagi_word_split, several_words) {
+	std::string text = "a bb ccc dd e";
+	std::vector<DialogueToken> tokens = {
+		{dt::TEXT, 13},
+	};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(9u, tokens.size());
+	EXPECT_EQ(1, tokens[0].length);
+	EXPECT_EQ(1, tokens[1].length);
+	EXPECT_EQ(2, tokens[2].length);
+	EXPECT_EQ(1, tokens[3].length);
+	EXPECT_EQ(3, tokens[4].length);
+	EXPECT_EQ(1, tokens[5].length);
+	EXPECT_EQ(2, tokens[6].length);
+	EXPECT_EQ(1, tokens[7].length);
+	EXPECT_EQ(1, tokens[8].length);
+}
+