2012-10-30 01:33:16 +01:00
|
|
|
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
|
|
// copyright notice and this permission notice appear in all copies.
|
|
|
|
//
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
//
|
|
|
|
// Aegisub Project http://www.aegisub.org/
|
|
|
|
|
|
|
|
#include "libaegisub/ass/dialogue_parser.h"
|
|
|
|
|
|
|
|
#include "libaegisub/spellchecker.h"
|
|
|
|
|
2014-05-23 00:40:16 +02:00
|
|
|
#include <boost/locale/boundary/index.hpp>
|
|
|
|
#include <boost/locale/boundary/segment.hpp>
|
|
|
|
#include <boost/locale/boundary/types.hpp>
|
2013-02-01 18:29:34 +01:00
|
|
|
|
2012-10-30 01:33:16 +01:00
|
|
|
namespace {
|
|
|
|
|
|
|
|
typedef std::vector<agi::ass::DialogueToken> TokenVec;
|
2012-11-07 01:26:00 +01:00
|
|
|
using namespace agi::ass;
|
|
|
|
namespace dt = DialogueTokenType;
|
|
|
|
namespace ss = SyntaxStyle;
|
2012-10-30 01:33:16 +01:00
|
|
|
|
|
|
|
class SyntaxHighlighter {
|
|
|
|
TokenVec ranges;
|
|
|
|
std::string const& text;
|
|
|
|
agi::SpellChecker *spellchecker;
|
|
|
|
|
2014-05-20 16:05:07 +02:00
|
|
|
void SetStyling(size_t len, int type) {
|
2012-10-30 01:33:16 +01:00
|
|
|
if (ranges.size() && ranges.back().type == type)
|
|
|
|
ranges.back().length += len;
|
|
|
|
else
|
2014-04-25 19:01:07 +02:00
|
|
|
ranges.push_back(DialogueToken{type, len});
|
2012-10-30 01:33:16 +01:00
|
|
|
}
|
|
|
|
|
2012-11-07 01:26:00 +01:00
|
|
|
public:
|
|
|
|
SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
|
|
|
|
: text(text)
|
|
|
|
, spellchecker(spellchecker)
|
|
|
|
{ }
|
2012-10-30 01:33:16 +01:00
|
|
|
|
2012-11-11 15:54:27 +01:00
|
|
|
TokenVec Highlight(TokenVec const& tokens) {
|
2012-11-07 01:26:00 +01:00
|
|
|
if (tokens.empty()) return ranges;
|
2012-10-30 01:33:16 +01:00
|
|
|
|
2012-11-07 01:26:00 +01:00
|
|
|
size_t pos = 0;
|
2012-10-30 01:33:16 +01:00
|
|
|
|
2012-11-11 15:54:27 +01:00
|
|
|
for (auto tok : tokens) {
|
|
|
|
switch (tok.type) {
|
|
|
|
case dt::KARAOKE_TEMPLATE: SetStyling(tok.length, ss::KARAOKE_TEMPLATE); break;
|
|
|
|
case dt::KARAOKE_VARIABLE: SetStyling(tok.length, ss::KARAOKE_VARIABLE); break;
|
|
|
|
case dt::LINE_BREAK: SetStyling(tok.length, ss::LINE_BREAK); break;
|
|
|
|
case dt::ERROR: SetStyling(tok.length, ss::ERROR); break;
|
|
|
|
case dt::ARG: SetStyling(tok.length, ss::PARAMETER); break;
|
|
|
|
case dt::COMMENT: SetStyling(tok.length, ss::COMMENT); break;
|
2022-11-02 19:24:42 +01:00
|
|
|
case dt::DRAWING_CMD:SetStyling(tok.length, ss::DRAWING_CMD);break;
|
|
|
|
case dt::DRAWING_X: SetStyling(tok.length, ss::DRAWING_X); break;
|
|
|
|
case dt::DRAWING_Y: SetStyling(tok.length, ss::DRAWING_Y); break;
|
|
|
|
case dt::DRAWING_ENDPOINT_X: SetStyling(tok.length, ss::DRAWING_ENDPOINT_X); break;
|
|
|
|
case dt::DRAWING_ENDPOINT_Y: SetStyling(tok.length, ss::DRAWING_ENDPOINT_Y); break;
|
2012-11-11 15:54:27 +01:00
|
|
|
case dt::TEXT: SetStyling(tok.length, ss::NORMAL); break;
|
|
|
|
case dt::TAG_NAME: SetStyling(tok.length, ss::TAG); break;
|
2012-11-07 01:26:00 +01:00
|
|
|
case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
|
2012-11-11 15:54:27 +01:00
|
|
|
SetStyling(tok.length, ss::PUNCTUATION);
|
2012-11-07 01:26:00 +01:00
|
|
|
break;
|
|
|
|
case dt::OVR_BEGIN: case dt::OVR_END:
|
2012-11-11 15:54:27 +01:00
|
|
|
SetStyling(tok.length, ss::OVERRIDE);
|
2012-11-07 01:26:00 +01:00
|
|
|
break;
|
2012-11-11 15:54:21 +01:00
|
|
|
case dt::WHITESPACE:
|
|
|
|
if (ranges.size() && ranges.back().type == ss::PARAMETER)
|
2012-11-11 15:54:27 +01:00
|
|
|
SetStyling(tok.length, ss::PARAMETER);
|
2022-11-02 19:24:42 +01:00
|
|
|
else if (ranges.size() && ranges.back().type == ss::DRAWING_ENDPOINT_X)
|
|
|
|
SetStyling(tok.length, ss::DRAWING_ENDPOINT_X); // connect the underline between x and y of endpoints
|
2012-11-11 15:54:21 +01:00
|
|
|
else
|
2012-11-11 15:54:27 +01:00
|
|
|
SetStyling(tok.length, ss::NORMAL);
|
2012-11-11 15:54:21 +01:00
|
|
|
break;
|
2012-11-07 01:26:00 +01:00
|
|
|
case dt::WORD:
|
2012-11-11 15:54:27 +01:00
|
|
|
if (spellchecker && !spellchecker->CheckWord(text.substr(pos, tok.length)))
|
|
|
|
SetStyling(tok.length, ss::SPELLING);
|
2012-11-07 01:26:00 +01:00
|
|
|
else
|
2012-11-11 15:54:27 +01:00
|
|
|
SetStyling(tok.length, ss::NORMAL);
|
2012-11-07 01:26:00 +01:00
|
|
|
break;
|
|
|
|
}
|
2012-10-30 01:33:16 +01:00
|
|
|
|
2012-11-11 15:54:27 +01:00
|
|
|
pos += tok.length;
|
2012-11-07 01:26:00 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return ranges;
|
2012-10-30 01:33:16 +01:00
|
|
|
}
|
2012-11-07 01:26:00 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
class WordSplitter {
|
|
|
|
std::string const& text;
|
|
|
|
std::vector<DialogueToken> &tokens;
|
2015-07-28 20:55:05 +02:00
|
|
|
size_t pos = 0;
|
2012-10-30 01:33:16 +01:00
|
|
|
|
2015-07-28 20:55:05 +02:00
|
|
|
void SwitchTo(size_t &i, int type, size_t len) {
|
2013-02-01 18:29:34 +01:00
|
|
|
auto old = tokens[i];
|
|
|
|
tokens[i].type = type;
|
|
|
|
tokens[i].length = len;
|
2012-11-07 01:26:00 +01:00
|
|
|
|
2013-02-01 18:29:34 +01:00
|
|
|
if (old.length != (size_t)len) {
|
2014-04-25 19:01:07 +02:00
|
|
|
tokens.insert(tokens.begin() + i + 1, DialogueToken{old.type, old.length - len});
|
2012-11-07 01:26:00 +01:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SplitText(size_t &i) {
|
2013-02-01 18:29:34 +01:00
|
|
|
using namespace boost::locale::boundary;
|
|
|
|
ssegment_index map(word, text.begin() + pos, text.begin() + pos + tokens[i].length);
|
|
|
|
for (auto const& segment : map) {
|
2015-07-28 20:55:05 +02:00
|
|
|
auto len = static_cast<size_t>(distance(begin(segment), end(segment)));
|
2013-02-01 18:29:34 +01:00
|
|
|
if (segment.rule() & word_letters)
|
2012-11-07 01:26:00 +01:00
|
|
|
SwitchTo(i, dt::WORD, len);
|
2013-02-01 18:29:34 +01:00
|
|
|
else
|
|
|
|
SwitchTo(i, dt::TEXT, len);
|
2012-10-30 01:33:16 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-02 19:24:42 +01:00
|
|
|
void SplitDrawing(size_t &i) {
|
|
|
|
size_t starti = i;
|
|
|
|
|
|
|
|
// First, split into words
|
|
|
|
size_t dpos = pos;
|
|
|
|
size_t tlen = 0;
|
|
|
|
bool tokentype = text[pos] == ' ' || text[pos] == '\t';
|
|
|
|
while (tlen < tokens[i].length) {
|
|
|
|
bool newtype = text[dpos] == ' ' || text[dpos] == '\t';
|
|
|
|
if (newtype != tokentype) {
|
|
|
|
tokentype = newtype;
|
|
|
|
SwitchTo(i, tokentype ? dt::DRAWING_FULL : dt::WHITESPACE, tlen);
|
|
|
|
tokens[i].type = tokentype ? dt::WHITESPACE : dt::DRAWING_FULL;
|
|
|
|
tlen = 0;
|
|
|
|
}
|
|
|
|
++tlen;
|
|
|
|
++dpos;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Then, label all the tokens
|
|
|
|
dpos = pos;
|
|
|
|
int num_coord = 0;
|
|
|
|
char lastcmd = ' ';
|
|
|
|
|
|
|
|
for (size_t j = starti; j <= i; j++) {
|
|
|
|
char c = text[dpos];
|
|
|
|
if (tokens[j].type == dt::WHITESPACE) {
|
|
|
|
} else if (c == 'm' || c == 'n' || c == 'l' || c == 's' || c == 'b' || c == 'p' || c == 'c') {
|
|
|
|
tokens[j].type = dt::DRAWING_CMD;
|
|
|
|
|
|
|
|
if (tokens[j].length != 1)
|
|
|
|
tokens[j].type = dt::ERROR;
|
|
|
|
if (num_coord % 2 != 0)
|
|
|
|
tokens[j].type = dt::ERROR;
|
|
|
|
|
|
|
|
lastcmd = c;
|
|
|
|
num_coord = 0;
|
|
|
|
} else {
|
|
|
|
bool valid = true;
|
|
|
|
for (size_t k = 0; k < tokens[j].length; k++) {
|
|
|
|
char c = text[dpos + k];
|
2023-01-02 02:30:25 +01:00
|
|
|
if (!((c >= '0' && c <= '9') || c == '.' || c == '+' || c == '-' || c == 'e' || c == 'E')) {
|
2022-11-02 19:24:42 +01:00
|
|
|
valid = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!valid)
|
|
|
|
tokens[j].type = dt::ERROR;
|
|
|
|
else if (lastcmd == 'b' && num_coord % 6 >= 4)
|
|
|
|
tokens[j].type = num_coord % 2 == 0 ? dt::DRAWING_ENDPOINT_X : dt::DRAWING_ENDPOINT_Y;
|
|
|
|
else
|
|
|
|
tokens[j].type = num_coord % 2 == 0 ? dt::DRAWING_X : dt::DRAWING_Y;
|
|
|
|
++num_coord;
|
|
|
|
}
|
|
|
|
|
|
|
|
dpos += tokens[j].length;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-10-30 01:33:16 +01:00
|
|
|
public:
|
2012-11-07 01:26:00 +01:00
|
|
|
WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
|
2012-10-30 01:33:16 +01:00
|
|
|
: text(text)
|
2012-11-07 01:26:00 +01:00
|
|
|
, tokens(tokens)
|
2012-10-30 01:33:16 +01:00
|
|
|
{ }
|
|
|
|
|
2012-11-07 01:26:00 +01:00
|
|
|
void SplitWords() {
|
|
|
|
if (tokens.empty()) return;
|
2012-10-30 01:33:16 +01:00
|
|
|
|
|
|
|
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
|
|
size_t len = tokens[i].length;
|
2012-12-30 17:27:03 +01:00
|
|
|
if (tokens[i].type == dt::TEXT)
|
2013-02-01 18:29:34 +01:00
|
|
|
SplitText(i);
|
2022-11-02 19:24:42 +01:00
|
|
|
else if (tokens[i].type == dt::DRAWING_FULL) {
|
|
|
|
SplitDrawing(i);
|
|
|
|
}
|
2012-10-30 01:33:16 +01:00
|
|
|
pos += len;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace agi {
|
|
|
|
namespace ass {
|
|
|
|
|
2012-11-11 15:54:27 +01:00
|
|
|
std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<DialogueToken> const& tokens, SpellChecker *spellchecker) {
|
|
|
|
return SyntaxHighlighter(text, spellchecker).Highlight(tokens);
|
2012-10-30 01:33:16 +01:00
|
|
|
}
|
|
|
|
|
2012-12-30 17:27:03 +01:00
|
|
|
void MarkDrawings(std::string const& str, std::vector<DialogueToken> &tokens) {
|
|
|
|
if (tokens.empty()) return;
|
|
|
|
|
|
|
|
size_t last_ovr_end = 0;
|
|
|
|
for (size_t i = tokens.size(); i > 0; --i) {
|
|
|
|
if (tokens[i - 1].type == dt::OVR_END) {
|
|
|
|
last_ovr_end = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t pos = 0;
|
|
|
|
bool in_drawing = false;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < last_ovr_end; ++i) {
|
|
|
|
size_t len = tokens[i].length;
|
|
|
|
switch (tokens[i].type) {
|
|
|
|
case dt::TEXT:
|
|
|
|
if (in_drawing)
|
2022-11-02 19:24:42 +01:00
|
|
|
tokens[i].type = dt::DRAWING_FULL;
|
2012-12-30 17:27:03 +01:00
|
|
|
break;
|
|
|
|
case dt::TAG_NAME:
|
2022-11-02 19:24:42 +01:00
|
|
|
if (i + 3 < tokens.size() && (len == 4 || len == 5) && !strncmp(str.c_str() + pos + len - 4, "clip", 4)) {
|
|
|
|
if (tokens[i + 1].type != dt::OPEN_PAREN)
|
|
|
|
goto tag_p;
|
|
|
|
|
|
|
|
size_t drawing_start = 0;
|
|
|
|
size_t drawing_end = 0;
|
|
|
|
|
|
|
|
// Try to find a vector clip
|
|
|
|
for (size_t j = i + 2; j < tokens.size(); j++) {
|
|
|
|
if (tokens[j].type == dt::ARG_SEP) {
|
|
|
|
if (drawing_start) {
|
|
|
|
break; // More than two arguents - this is a rectangular clip
|
|
|
|
}
|
|
|
|
drawing_start = j + 1;
|
|
|
|
} else if (tokens[j].type == dt::CLOSE_PAREN) {
|
|
|
|
drawing_end = j;
|
|
|
|
break;
|
|
|
|
} else if (tokens[j].type != dt::WHITESPACE && tokens[j].type != dt::ARG) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!drawing_end)
|
|
|
|
goto tag_p;
|
|
|
|
if (!drawing_start)
|
|
|
|
drawing_start = i + 2;
|
2022-11-09 09:36:32 +01:00
|
|
|
if (drawing_end == drawing_start)
|
2022-11-02 19:24:42 +01:00
|
|
|
goto tag_p;
|
|
|
|
|
|
|
|
// We found a clip between drawing_start and drawing_end. Now, join
|
|
|
|
// all the tokens into one and label it as a drawing.
|
|
|
|
size_t tokenlen = 0;
|
|
|
|
for (size_t j = drawing_start; j < drawing_end; j++) {
|
|
|
|
tokenlen += tokens[j].length;
|
|
|
|
}
|
|
|
|
|
|
|
|
tokens[drawing_start].length = tokenlen;
|
|
|
|
tokens[drawing_start].type = dt::DRAWING_FULL;
|
|
|
|
tokens.erase(tokens.begin() + drawing_start + 1, tokens.begin() + drawing_end);
|
|
|
|
last_ovr_end -= drawing_end - drawing_start - 1;
|
|
|
|
}
|
|
|
|
tag_p:
|
2012-12-30 17:27:03 +01:00
|
|
|
if (len != 1 || i + 1 >= tokens.size() || str[pos] != 'p')
|
|
|
|
break;
|
|
|
|
|
|
|
|
in_drawing = false;
|
|
|
|
|
|
|
|
if (i + 1 == last_ovr_end || tokens[i + 1].type != dt::ARG)
|
|
|
|
break;
|
|
|
|
|
|
|
|
for (size_t j = pos + len; j < pos + len + tokens[i + 1].length; ++j) {
|
|
|
|
char c = str[j];
|
|
|
|
// I have no idea why one would use leading zeros for
|
|
|
|
// the scale, but vsfilter allows it
|
|
|
|
if (c >= '1' && c <= '9')
|
|
|
|
in_drawing = true;
|
|
|
|
else if (c != '0')
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
|
|
|
|
pos += len;
|
|
|
|
}
|
|
|
|
|
|
|
|
// VSFilter treats unclosed override blocks as plain text, so merge all
|
|
|
|
// the tokens after the last override block into a single TEXT (or DRAWING)
|
|
|
|
// token
|
|
|
|
for (size_t i = last_ovr_end; i < tokens.size(); ++i) {
|
|
|
|
switch (tokens[i].type) {
|
|
|
|
case dt::KARAOKE_TEMPLATE: break;
|
|
|
|
case dt::KARAOKE_VARIABLE: break;
|
|
|
|
case dt::LINE_BREAK: break;
|
|
|
|
default:
|
2022-11-02 19:24:42 +01:00
|
|
|
tokens[i].type = in_drawing ? dt::DRAWING_FULL : dt::TEXT;
|
2012-12-30 17:27:03 +01:00
|
|
|
if (i > 0 && tokens[i - 1].type == tokens[i].type) {
|
|
|
|
tokens[i - 1].length += tokens[i].length;
|
|
|
|
tokens.erase(tokens.begin() + i);
|
|
|
|
--i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-07 01:26:00 +01:00
|
|
|
void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens) {
|
2012-12-30 17:27:03 +01:00
|
|
|
MarkDrawings(str, tokens);
|
2012-11-07 01:26:00 +01:00
|
|
|
WordSplitter(str, tokens).SplitWords();
|
|
|
|
}
|
|
|
|
|
2012-10-30 01:33:16 +01:00
|
|
|
}
|
|
|
|
}
|