2012-10-29 17:33:16 -07:00
|
|
|
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
|
|
// copyright notice and this permission notice appear in all copies.
|
|
|
|
//
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
//
|
|
|
|
// Aegisub Project http://www.aegisub.org/
|
|
|
|
|
|
|
|
#include "../config.h"
|
|
|
|
|
|
|
|
#include "libaegisub/ass/dialogue_parser.h"
|
|
|
|
|
|
|
|
#include "libaegisub/scoped_ptr.h"
|
|
|
|
#include "libaegisub/spellchecker.h"
|
|
|
|
|
|
|
|
#include "iconv.h"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
typedef std::vector<agi::ass::DialogueToken> TokenVec;
|
2012-11-06 16:26:00 -08:00
|
|
|
using namespace agi::ass;
|
|
|
|
namespace dt = DialogueTokenType;
|
|
|
|
namespace ss = SyntaxStyle;
|
2012-10-29 17:33:16 -07:00
|
|
|
|
|
|
|
class SyntaxHighlighter {
|
|
|
|
TokenVec ranges;
|
|
|
|
std::string const& text;
|
|
|
|
agi::SpellChecker *spellchecker;
|
|
|
|
|
|
|
|
void SetStyling(int len, int type) {
|
|
|
|
if (ranges.size() && ranges.back().type == type)
|
|
|
|
ranges.back().length += len;
|
|
|
|
else
|
2012-11-06 16:26:00 -08:00
|
|
|
ranges.push_back(DialogueToken(type, len));
|
2012-10-29 17:33:16 -07:00
|
|
|
}
|
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
public:
|
|
|
|
SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
|
|
|
|
: text(text)
|
|
|
|
, spellchecker(spellchecker)
|
|
|
|
{ }
|
2012-10-29 17:33:16 -07:00
|
|
|
|
2012-11-11 06:54:27 -08:00
|
|
|
TokenVec Highlight(TokenVec const& tokens) {
|
2012-11-06 16:26:00 -08:00
|
|
|
if (tokens.empty()) return ranges;
|
2012-10-29 17:33:16 -07:00
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
size_t pos = 0;
|
2012-10-29 17:33:16 -07:00
|
|
|
|
2012-11-11 06:54:27 -08:00
|
|
|
for (auto tok : tokens) {
|
|
|
|
switch (tok.type) {
|
|
|
|
case dt::KARAOKE_TEMPLATE: SetStyling(tok.length, ss::KARAOKE_TEMPLATE); break;
|
|
|
|
case dt::KARAOKE_VARIABLE: SetStyling(tok.length, ss::KARAOKE_VARIABLE); break;
|
|
|
|
case dt::LINE_BREAK: SetStyling(tok.length, ss::LINE_BREAK); break;
|
|
|
|
case dt::ERROR: SetStyling(tok.length, ss::ERROR); break;
|
|
|
|
case dt::ARG: SetStyling(tok.length, ss::PARAMETER); break;
|
|
|
|
case dt::COMMENT: SetStyling(tok.length, ss::COMMENT); break;
|
|
|
|
case dt::DRAWING: SetStyling(tok.length, ss::DRAWING); break;
|
|
|
|
case dt::TEXT: SetStyling(tok.length, ss::NORMAL); break;
|
|
|
|
case dt::TAG_NAME: SetStyling(tok.length, ss::TAG); break;
|
2012-11-06 16:26:00 -08:00
|
|
|
case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
|
2012-11-11 06:54:27 -08:00
|
|
|
SetStyling(tok.length, ss::PUNCTUATION);
|
2012-11-06 16:26:00 -08:00
|
|
|
break;
|
|
|
|
case dt::OVR_BEGIN: case dt::OVR_END:
|
2012-11-11 06:54:27 -08:00
|
|
|
SetStyling(tok.length, ss::OVERRIDE);
|
2012-11-06 16:26:00 -08:00
|
|
|
break;
|
2012-11-11 06:54:21 -08:00
|
|
|
case dt::WHITESPACE:
|
|
|
|
if (ranges.size() && ranges.back().type == ss::PARAMETER)
|
2012-11-11 06:54:27 -08:00
|
|
|
SetStyling(tok.length, ss::PARAMETER);
|
2012-11-11 06:54:21 -08:00
|
|
|
else
|
2012-11-11 06:54:27 -08:00
|
|
|
SetStyling(tok.length, ss::NORMAL);
|
2012-11-11 06:54:21 -08:00
|
|
|
break;
|
2012-11-06 16:26:00 -08:00
|
|
|
case dt::WORD:
|
2012-11-11 06:54:27 -08:00
|
|
|
if (spellchecker && !spellchecker->CheckWord(text.substr(pos, tok.length)))
|
|
|
|
SetStyling(tok.length, ss::SPELLING);
|
2012-11-06 16:26:00 -08:00
|
|
|
else
|
2012-11-11 06:54:27 -08:00
|
|
|
SetStyling(tok.length, ss::NORMAL);
|
2012-11-06 16:26:00 -08:00
|
|
|
break;
|
|
|
|
}
|
2012-10-29 17:33:16 -07:00
|
|
|
|
2012-11-11 06:54:27 -08:00
|
|
|
pos += tok.length;
|
2012-11-06 16:26:00 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return ranges;
|
2012-10-29 17:33:16 -07:00
|
|
|
}
|
2012-11-06 16:26:00 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
class WordSplitter {
|
|
|
|
std::string const& text;
|
|
|
|
std::vector<DialogueToken> &tokens;
|
|
|
|
agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
|
|
|
|
size_t pos;
|
2012-10-29 17:33:16 -07:00
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
bool IsWordSep(int chr) {
|
|
|
|
static const int delims[] = {
|
2012-10-29 17:33:16 -07:00
|
|
|
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
|
|
|
|
0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
|
|
|
|
0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
|
|
|
|
0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2,
|
|
|
|
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab,
|
|
|
|
0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f,
|
|
|
|
0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d,
|
|
|
|
0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031,
|
|
|
|
0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9,
|
|
|
|
0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422,
|
|
|
|
0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001,
|
|
|
|
0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e,
|
|
|
|
0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018,
|
|
|
|
0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a,
|
|
|
|
0xff5b, 0xff5d, 0xff5e
|
|
|
|
};
|
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
return std::binary_search(std::begin(delims), std::end(delims), chr);
|
|
|
|
}
|
|
|
|
|
|
|
|
int NextChar(int pos, int len, int& char_len) {
|
|
|
|
int chr = 0;
|
|
|
|
char *inptr = const_cast<char *>(&text[pos]);
|
|
|
|
size_t inlen = len;
|
|
|
|
char *outptr = (char *)&chr;
|
|
|
|
size_t outlen = sizeof chr;
|
|
|
|
|
|
|
|
iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
|
|
|
|
if (outlen != 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
char_len = len - inlen;
|
|
|
|
return chr;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SwitchTo(size_t &i, int type, int len) {
|
|
|
|
if (tokens[i].type == type) return;
|
|
|
|
|
|
|
|
if (tokens[i].length == (size_t)len)
|
|
|
|
tokens[i].type = type;
|
|
|
|
else {
|
|
|
|
tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len));
|
|
|
|
tokens[i].length -= len;
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SplitText(size_t &i) {
|
2012-10-29 17:33:16 -07:00
|
|
|
int chrlen = 0;
|
2012-11-06 16:26:00 -08:00
|
|
|
int len = tokens[i].length;
|
|
|
|
int tpos = pos;
|
|
|
|
for (; len > 0; tpos += chrlen, len -= chrlen) {
|
|
|
|
int chr = NextChar(tpos, len, chrlen);
|
2012-10-29 17:33:16 -07:00
|
|
|
if (!chr) return;
|
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
if (IsWordSep(chr))
|
|
|
|
SwitchTo(i, dt::TEXT, len);
|
|
|
|
else
|
|
|
|
SwitchTo(i, dt::WORD, len);
|
2012-10-29 17:33:16 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
2012-11-06 16:26:00 -08:00
|
|
|
WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
|
2012-10-29 17:33:16 -07:00
|
|
|
: text(text)
|
2012-11-06 16:26:00 -08:00
|
|
|
, tokens(tokens)
|
2012-10-31 19:49:29 -07:00
|
|
|
, utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close)
|
2012-11-06 16:26:00 -08:00
|
|
|
, pos(0)
|
2012-10-29 17:33:16 -07:00
|
|
|
{ }
|
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
void SplitWords() {
|
|
|
|
if (tokens.empty()) return;
|
2012-10-29 17:33:16 -07:00
|
|
|
|
|
|
|
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
|
|
size_t len = tokens[i].length;
|
2012-12-30 08:27:03 -08:00
|
|
|
if (tokens[i].type == dt::TEXT)
|
|
|
|
SplitText(i);
|
2012-10-29 17:33:16 -07:00
|
|
|
pos += len;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace agi {
|
|
|
|
namespace ass {
|
|
|
|
|
2012-11-11 06:54:27 -08:00
|
|
|
std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<DialogueToken> const& tokens, SpellChecker *spellchecker) {
|
|
|
|
return SyntaxHighlighter(text, spellchecker).Highlight(tokens);
|
2012-10-29 17:33:16 -07:00
|
|
|
}
|
|
|
|
|
2012-12-30 08:27:03 -08:00
|
|
|
void MarkDrawings(std::string const& str, std::vector<DialogueToken> &tokens) {
|
|
|
|
if (tokens.empty()) return;
|
|
|
|
|
|
|
|
size_t last_ovr_end = 0;
|
|
|
|
for (size_t i = tokens.size(); i > 0; --i) {
|
|
|
|
if (tokens[i - 1].type == dt::OVR_END) {
|
|
|
|
last_ovr_end = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t pos = 0;
|
|
|
|
bool in_drawing = false;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < last_ovr_end; ++i) {
|
|
|
|
size_t len = tokens[i].length;
|
|
|
|
switch (tokens[i].type) {
|
|
|
|
case dt::TEXT:
|
|
|
|
if (in_drawing)
|
|
|
|
tokens[i].type = dt::DRAWING;
|
|
|
|
break;
|
|
|
|
case dt::TAG_NAME:
|
|
|
|
if (len != 1 || i + 1 >= tokens.size() || str[pos] != 'p')
|
|
|
|
break;
|
|
|
|
|
|
|
|
in_drawing = false;
|
|
|
|
|
|
|
|
if (i + 1 == last_ovr_end || tokens[i + 1].type != dt::ARG)
|
|
|
|
break;
|
|
|
|
|
|
|
|
for (size_t j = pos + len; j < pos + len + tokens[i + 1].length; ++j) {
|
|
|
|
char c = str[j];
|
|
|
|
// I have no idea why one would use leading zeros for
|
|
|
|
// the scale, but vsfilter allows it
|
|
|
|
if (c >= '1' && c <= '9')
|
|
|
|
in_drawing = true;
|
|
|
|
else if (c != '0')
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
|
|
|
|
pos += len;
|
|
|
|
}
|
|
|
|
|
|
|
|
// VSFilter treats unclosed override blocks as plain text, so merge all
|
|
|
|
// the tokens after the last override block into a single TEXT (or DRAWING)
|
|
|
|
// token
|
|
|
|
for (size_t i = last_ovr_end; i < tokens.size(); ++i) {
|
|
|
|
switch (tokens[i].type) {
|
|
|
|
case dt::KARAOKE_TEMPLATE: break;
|
|
|
|
case dt::KARAOKE_VARIABLE: break;
|
|
|
|
case dt::LINE_BREAK: break;
|
|
|
|
default:
|
|
|
|
tokens[i].type = in_drawing ? dt::DRAWING : dt::TEXT;
|
|
|
|
if (i > 0 && tokens[i - 1].type == tokens[i].type) {
|
|
|
|
tokens[i - 1].length += tokens[i].length;
|
|
|
|
tokens.erase(tokens.begin() + i);
|
|
|
|
--i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-06 16:26:00 -08:00
|
|
|
void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens) {
|
2012-12-30 08:27:03 -08:00
|
|
|
MarkDrawings(str, tokens);
|
2012-11-06 16:26:00 -08:00
|
|
|
WordSplitter(str, tokens).SplitWords();
|
|
|
|
}
|
|
|
|
|
2012-10-29 17:33:16 -07:00
|
|
|
}
|
|
|
|
}
|