80c9f67ce8
Operate on characters rather than bytes in the dialog so that it actually works with Kanji. Rewrite the auto-matcher to handle more cases and add unit tests for it.
209 lines
7.1 KiB
C++
209 lines
7.1 KiB
C++
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
|
//
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
// copyright notice and this permission notice appear in all copies.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
//
|
|
// Aegisub Project http://www.aegisub.org/
|
|
|
|
#include "../config.h"
|
|
|
|
#include "libaegisub/karaoke_matcher.h"
|
|
|
|
#include "libaegisub/kana_table.h"
|
|
#include "libaegisub/util.h"
|
|
|
|
#include <boost/algorithm/string/case_conv.hpp>
|
|
#include <boost/algorithm/string/predicate.hpp>
|
|
#include <boost/locale/boundary.hpp>
|
|
#include <boost/locale/collator.hpp>
|
|
#include <boost/range/algorithm/copy.hpp>
|
|
#include <unicode/uchar.h>
|
|
#include <unicode/utf8.h>
|
|
|
|
namespace {
|
|
int32_t next_codepoint(const char *str, size_t *i) {
|
|
UChar32 c;
|
|
U8_NEXT_UNSAFE(str, *i, c);
|
|
return c;
|
|
}
|
|
|
|
bool is_whitespace(int32_t c) {
|
|
return !!u_isUWhiteSpace(c);
|
|
}
|
|
|
|
bool is_whitespace(std::string const& str) {
|
|
size_t i = 0;
|
|
while (auto c = next_codepoint(str.c_str(), &i)) {
|
|
if (!u_isUWhiteSpace(c))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// strcmp but ignoring case and accents
|
|
int compare(std::string const& a, std::string const& b) {
|
|
using namespace boost::locale;
|
|
return std::use_facet<collator<char>>(std::locale()).compare(collator_base::primary, a, b);
|
|
}
|
|
|
|
}
|
|
|
|
namespace agi {
|
|
|
|
karaoke_match_result auto_match_karaoke(std::vector<std::string> const& source_strings, std::string const& dest_string) {
|
|
karaoke_match_result result = { 0, 0 };
|
|
if (source_strings.empty()) return result;
|
|
|
|
using namespace boost::locale::boundary;
|
|
using boost::starts_with;
|
|
|
|
result.source_length = 1;
|
|
ssegment_index destination_characters(character, begin(dest_string), end(dest_string));
|
|
auto src = boost::to_lower_copy(source_strings[0]);
|
|
auto dst = destination_characters.begin();
|
|
auto dst_end = destination_characters.end();
|
|
|
|
// Eat all the whitespace at the beginning of the source and destination
|
|
// syllables and exit if either ran out.
|
|
auto eat_whitespace = [&]() -> bool {
|
|
size_t i = 0, first_non_whitespace = 0;
|
|
while (is_whitespace(next_codepoint(src.c_str(), &i)))
|
|
first_non_whitespace = i;
|
|
if (first_non_whitespace)
|
|
src = src.substr(first_non_whitespace);
|
|
|
|
while (dst != dst_end && is_whitespace(dst->str())) {
|
|
++dst;
|
|
++result.destination_length;
|
|
}
|
|
|
|
// If we ran out of dest then this needs to match the rest of the
|
|
// source syllables (this probably means the user did something wrong)
|
|
if (dst == dst_end) {
|
|
result.source_length = source_strings.size();
|
|
return true;
|
|
}
|
|
|
|
return src.empty();
|
|
};
|
|
|
|
if (eat_whitespace()) return result;
|
|
|
|
// We now have a non-whitespace character at the beginning of both source
|
|
// and destination. Check if the source starts with a romanized kana, and
|
|
// if it does then check if the destination also has the appropriate
|
|
// character. If it does, match them and repeat.
|
|
while (!src.empty()) {
|
|
// First check for a basic match of the first character of the source and dest
|
|
auto first_src_char = ssegment_index(character, begin(src), end(src)).begin()->str();
|
|
if (compare(first_src_char, dst->str()) == 0) {
|
|
++dst;
|
|
++result.destination_length;
|
|
src.erase(0, first_src_char.size());
|
|
if (eat_whitespace()) return result;
|
|
continue;
|
|
}
|
|
|
|
auto check = [&](kana_pair const& kp) -> bool {
|
|
if (!starts_with(&*dst->begin(), kp.kana)) return false;
|
|
|
|
src = src.substr(strlen(kp.romaji));
|
|
for (size_t i = 0; kp.kana[i]; ) {
|
|
i += dst->length();
|
|
++result.destination_length;
|
|
++dst;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
bool matched = false;
|
|
for (auto const& match : romaji_to_kana(src)) {
|
|
if (check(match)) {
|
|
if (eat_whitespace()) return result;
|
|
matched = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!matched) break;
|
|
}
|
|
|
|
// Source and dest are now non-empty and start with non-whitespace.
|
|
// If there's only one character left in the dest, it obviously needs to
|
|
// match all of the source syllables left.
|
|
if (std::distance(dst, dst_end) == 1) {
|
|
result.source_length = source_strings.size();
|
|
++result.destination_length;
|
|
return result;
|
|
}
|
|
|
|
// We couldn't match the current character, but if we can match the *next*
|
|
// syllable then we know that everything in between must belong to the
|
|
// current syllable. Do this by looking up to KANA_SEARCH_DISTANCE
|
|
// characters ahead in destination and seeing if we can match them against
|
|
// the beginning of a syllable after this syllable.
|
|
// If a match is found, make a guess at how much source and destination
|
|
// should be selected based on the distances it was found at.
|
|
|
|
// The longest kanji are 'uketamawa.ru' and 'kokorozashi', each with a
|
|
// reading consisting of five kana. This means each each character from
|
|
// the destination can match at most five syllables from the source.
|
|
static const int max_character_length = 5;
|
|
|
|
// Arbitrarily chosen limit on the number of dest characters to try
|
|
// skipping. Higher numbers probably increase false-positives.
|
|
static const int dst_lookahead_max = 3;
|
|
|
|
for (size_t lookahead = 0; lookahead < dst_lookahead_max; ++lookahead) {
|
|
if (++dst == dst_end) break;
|
|
|
|
// Transliterate this character if it's a known hiragana or katakana character
|
|
std::vector<const char *> translit;
|
|
auto next = std::next(dst);
|
|
if (next != dst_end)
|
|
boost::copy(kana_to_romaji(dst->str() + next->str()), back_inserter(translit));
|
|
boost::copy(kana_to_romaji(dst->str()), back_inserter(translit));
|
|
|
|
// Search for it and the transliterated version in the source
|
|
int src_lookahead_max = (lookahead + 1) * max_character_length;
|
|
int src_lookahead_pos = 0;
|
|
for (auto const& syl : source_strings) {
|
|
// Don't count blank syllables in the max search distance
|
|
if (is_whitespace(syl)) continue;
|
|
if (++src_lookahead_pos == 1) continue;
|
|
if (src_lookahead_pos > src_lookahead_max) break;
|
|
|
|
std::string lsyl = boost::to_lower_copy(syl);
|
|
if (!(starts_with(syl, dst->str()) || util::any_of(translit, [&](const char *str) { return starts_with(lsyl, str); })))
|
|
continue;
|
|
|
|
// The syllable immediately after the current one matched, so
|
|
// everything up to the match must go with the current syllable.
|
|
if (src_lookahead_pos == 2) {
|
|
result.destination_length += lookahead + 1;
|
|
return result;
|
|
}
|
|
|
|
// The match was multiple syllables ahead, so just divide the
|
|
// destination characters evenly between the source syllables
|
|
result.destination_length += 1;
|
|
result.source_length = static_cast<size_t>((src_lookahead_pos - 1.0) / (lookahead + 1.0) + .5);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
// We wouldn't have gotten here if the dest was empty, so make sure at
|
|
// least one character is selected
|
|
result.destination_length = std::max<size_t>(result.destination_length, 1u);
|
|
|
|
return result;
|
|
}
|
|
}
|