From 80c9f67ce8ea725c4fbc44719c39ba98bf6125ac Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Sat, 13 Jul 2013 08:26:09 -0700 Subject: [PATCH] Rewrite the auto-matcher for the karaoke timing copier Operate on characters rather than bytes in the dialog so that it actually works with Kanji. Rewrite the auto-matcher to handle more cases and add unit tests for it. --- aegisub/build/Aegisub/Aegisub.vcxproj | 2 - aegisub/build/Aegisub/Aegisub.vcxproj.filters | 6 - aegisub/build/libaegisub/libaegisub.vcxproj | 7 +- .../libaegisub/libaegisub.vcxproj.filters | 12 + aegisub/libaegisub/Makefile | 2 + aegisub/libaegisub/common/kana_table.cpp | 622 ++++++++++++++++++ aegisub/libaegisub/common/karaoke_matcher.cpp | 209 ++++++ .../include/libaegisub/kana_table.h | 30 + .../include/libaegisub/karaoke_matcher.h | 30 + aegisub/libaegisub/include/libaegisub/util.h | 6 + aegisub/src/Makefile | 1 - aegisub/src/dialog_kara_timing_copy.cpp | 239 ++----- aegisub/src/kana_table.cpp | 262 -------- aegisub/src/kana_table.h | 53 -- aegisub/src/utils.cpp | 9 - aegisub/src/utils.h | 6 - aegisub/tests/Makefile | 1 + aegisub/tests/tests/karaoke_matcher.cpp | 197 ++++++ 18 files changed, 1165 insertions(+), 529 deletions(-) create mode 100644 aegisub/libaegisub/common/kana_table.cpp create mode 100644 aegisub/libaegisub/common/karaoke_matcher.cpp create mode 100644 aegisub/libaegisub/include/libaegisub/kana_table.h create mode 100644 aegisub/libaegisub/include/libaegisub/karaoke_matcher.h delete mode 100644 aegisub/src/kana_table.cpp delete mode 100644 aegisub/src/kana_table.h create mode 100644 aegisub/tests/tests/karaoke_matcher.cpp diff --git a/aegisub/build/Aegisub/Aegisub.vcxproj b/aegisub/build/Aegisub/Aegisub.vcxproj index a698646fb..92a653fd8 100644 --- a/aegisub/build/Aegisub/Aegisub.vcxproj +++ b/aegisub/build/Aegisub/Aegisub.vcxproj @@ -198,7 +198,6 @@ - @@ -393,7 +392,6 @@ - NotUsing diff --git a/aegisub/build/Aegisub/Aegisub.vcxproj.filters 
b/aegisub/build/Aegisub/Aegisub.vcxproj.filters index 5a7274c26..cb309864e 100644 --- a/aegisub/build/Aegisub/Aegisub.vcxproj.filters +++ b/aegisub/build/Aegisub/Aegisub.vcxproj.filters @@ -339,9 +339,6 @@ Video\UI - - Features\Karaoke copier - Subtitle formats @@ -992,9 +989,6 @@ Video\UI - - Features\Karaoke copier - Features\Spell checker diff --git a/aegisub/build/libaegisub/libaegisub.vcxproj b/aegisub/build/libaegisub/libaegisub.vcxproj index 1911fbde8..9242f69ea 100644 --- a/aegisub/build/libaegisub/libaegisub.vcxproj +++ b/aegisub/build/libaegisub/libaegisub.vcxproj @@ -4,7 +4,6 @@ {BB3FED86-DB7A-4DC7-964A-260FB86CDE61} libaegisub - lib @@ -13,7 +12,6 @@ - @@ -33,7 +31,6 @@ lagi_pre.h - @@ -60,6 +57,8 @@ + + @@ -101,6 +100,8 @@ + + diff --git a/aegisub/build/libaegisub/libaegisub.vcxproj.filters b/aegisub/build/libaegisub/libaegisub.vcxproj.filters index e7d275f76..1c487e2cc 100644 --- a/aegisub/build/libaegisub/libaegisub.vcxproj.filters +++ b/aegisub/build/libaegisub/libaegisub.vcxproj.filters @@ -155,6 +155,12 @@ ASS + + Header Files + + + Header Files + @@ -256,6 +262,12 @@ ASS + + Source Files\Common + + + Source Files\Common + diff --git a/aegisub/libaegisub/Makefile b/aegisub/libaegisub/Makefile index 633219cd1..6812c9fd7 100644 --- a/aegisub/libaegisub/Makefile +++ b/aegisub/libaegisub/Makefile @@ -28,6 +28,8 @@ SRC += \ common/hotkey.cpp \ common/io.cpp \ common/json.cpp \ + common/kana_table.cpp \ + common/karaoke_matcher.cpp \ common/keyframe.cpp \ common/log.cpp \ common/mru.cpp \ diff --git a/aegisub/libaegisub/common/kana_table.cpp b/aegisub/libaegisub/common/kana_table.cpp new file mode 100644 index 000000000..e265d285d --- /dev/null +++ b/aegisub/libaegisub/common/kana_table.cpp @@ -0,0 +1,622 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in 
all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// Aegisub Project http://www.aegisub.org/ + +#include "../config.h" + +#include "libaegisub/kana_table.h" + +#include + +namespace { +agi::kana_pair kana_to_romaji[] = { + {"\xE3\x81\x81", "a"}, // ぁ + {"\xE3\x81\x82", "a"}, // あ + {"\xE3\x81\x83", "i"}, // ぃ + {"\xE3\x81\x84", "i"}, // い + {"\xE3\x81\x85", "u"}, // ぅ + {"\xE3\x81\x86", "u"}, // う + {"\xE3\x81\x87", "e"}, // ぇ + {"\xE3\x81\x88", "e"}, // え + {"\xE3\x81\x89", "o"}, // ぉ + {"\xE3\x81\x8A", "o"}, // お + {"\xE3\x81\x8B", "ka"}, // か + {"\xE3\x81\x8C", "ga"}, // が + {"\xE3\x81\x8D", "ki"}, // き + {"\xE3\x81\x8D\xE3\x82\x83", "kya"}, // きゃ + {"\xE3\x81\x8D\xE3\x82\x85", "kyu"}, // きゅ + {"\xE3\x81\x8D\xE3\x82\x87", "kyo"}, // きょ + {"\xE3\x81\x8E", "gi"}, // ぎ + {"\xE3\x81\x8E\xE3\x82\x83", "gya"}, // ぎゃ + {"\xE3\x81\x8E\xE3\x82\x85", "gyu"}, // ぎゅ + {"\xE3\x81\x8E\xE3\x82\x87", "gyo"}, // ぎょ + {"\xE3\x81\x8F", "ku"}, // く + {"\xE3\x81\x90", "gu"}, // ぐ + {"\xE3\x81\x91", "ke"}, // け + {"\xE3\x81\x92", "ge"}, // げ + {"\xE3\x81\x93", "ko"}, // こ + {"\xE3\x81\x94", "go"}, // ご + {"\xE3\x81\x95", "sa"}, // さ + {"\xE3\x81\x96", "za"}, // ざ + {"\xE3\x81\x97", "shi"}, // し + {"\xE3\x81\x97\xE3\x82\x83", "sha"}, // しゃ + {"\xE3\x81\x97\xE3\x82\x85", "shu"}, // しゅ + {"\xE3\x81\x97\xE3\x82\x87", "sho"}, // しょ + {"\xE3\x81\x98", "ji"}, // じ + {"\xE3\x81\x98\xE3\x82\x83", "ja"}, // じゃ + {"\xE3\x81\x98\xE3\x82\x85", "ju"}, // じゅ + {"\xE3\x81\x98\xE3\x82\x87", "jo"}, // じょ + {"\xE3\x81\x99", 
"su"}, // す + {"\xE3\x81\x9A", "zu"}, // ず + {"\xE3\x81\x9B", "se"}, // せ + {"\xE3\x81\x9C", "ze"}, // ぜ + {"\xE3\x81\x9D", "so"}, // そ + {"\xE3\x81\x9E", "zo"}, // ぞ + {"\xE3\x81\x9F", "ta"}, // た + {"\xE3\x81\xA0", "da"}, // だ + {"\xE3\x81\xA1", "chi"}, // ち + {"\xE3\x81\xA1\xE3\x82\x83", "cha"}, // ちゃ + {"\xE3\x81\xA1\xE3\x82\x85", "chu"}, // ちゅ + {"\xE3\x81\xA1\xE3\x82\x87", "cho"}, // ちょ + {"\xE3\x81\xA2", "ji"}, // ぢ + {"\xE3\x81\xA2\xE3\x82\x83", "ja"}, // ぢゃ + {"\xE3\x81\xA2\xE3\x82\x85", "ju"}, // ぢゅ + {"\xE3\x81\xA2\xE3\x82\x87", "jo"}, // ぢょ + {"\xE3\x81\xA3", "c"}, // っ + {"\xE3\x81\xA3", "k"}, // っ + {"\xE3\x81\xA3", "p"}, // っ + {"\xE3\x81\xA3", "s"}, // っ + {"\xE3\x81\xA3", "t"}, // っ + {"\xE3\x81\xA4", "tsu"}, // つ + {"\xE3\x81\xA5", "zu"}, // づ + {"\xE3\x81\xA6", "te"}, // て + {"\xE3\x81\xA7", "de"}, // で + {"\xE3\x81\xA8", "to"}, // と + {"\xE3\x81\xA9", "do"}, // ど + {"\xE3\x81\xAA", "na"}, // な + {"\xE3\x81\xAB", "ni"}, // に + {"\xE3\x81\xAB\xE3\x82\x83", "nya"}, // にゃ + {"\xE3\x81\xAB\xE3\x82\x85", "nyu"}, // にゅ + {"\xE3\x81\xAB\xE3\x82\x87", "nyo"}, // にょ + {"\xE3\x81\xAC", "nu"}, // ぬ + {"\xE3\x81\xAD", "ne"}, // ね + {"\xE3\x81\xAE", "no"}, // の + {"\xE3\x81\xAF", "ha"}, // は + {"\xE3\x81\xAF", "wa"}, // は + {"\xE3\x81\xB0", "ba"}, // ば + {"\xE3\x81\xB1", "pa"}, // ぱ + {"\xE3\x81\xB2", "hi"}, // ひ + {"\xE3\x81\xB2\xE3\x82\x83", "hya"}, // ひゃ + {"\xE3\x81\xB2\xE3\x82\x85", "hyu"}, // ひゅ + {"\xE3\x81\xB2\xE3\x82\x87", "hyo"}, // ひょ + {"\xE3\x81\xB3", "bi"}, // び + {"\xE3\x81\xB3\xE3\x82\x83", "bya"}, // びゃ + {"\xE3\x81\xB3\xE3\x82\x85", "byu"}, // びゅ + {"\xE3\x81\xB3\xE3\x82\x87", "byo"}, // びょ + {"\xE3\x81\xB4", "pi"}, // ぴ + {"\xE3\x81\xB4\xE3\x82\x83", "pya"}, // ぴゃ + {"\xE3\x81\xB4\xE3\x82\x85", "pyu"}, // ぴゅ + {"\xE3\x81\xB4\xE3\x82\x87", "pyo"}, // ぴょ + {"\xE3\x81\xB5", "fu"}, // ふ + {"\xE3\x81\xB6", "bu"}, // ぶ + {"\xE3\x81\xB7", "pu"}, // ぷ + {"\xE3\x81\xB8", "he"}, // へ + {"\xE3\x81\xB8", "e"}, // へ + {"\xE3\x81\xB9", "be"}, // べ + 
{"\xE3\x81\xBA", "pe"}, // ぺ + {"\xE3\x81\xBB", "ho"}, // ほ + {"\xE3\x81\xBC", "bo"}, // ぼ + {"\xE3\x81\xBD", "po"}, // ぽ + {"\xE3\x81\xBE", "ma"}, // ま + {"\xE3\x81\xBF", "mi"}, // み + {"\xE3\x81\xBF\xE3\x82\x83", "mya"}, // みゃ + {"\xE3\x81\xBF\xE3\x82\x85", "myu"}, // みゅ + {"\xE3\x81\xBF\xE3\x82\x87", "myo"}, // みょ + {"\xE3\x82\x80", "mu"}, // む + {"\xE3\x82\x81", "me"}, // め + {"\xE3\x82\x82", "mo"}, // も + {"\xE3\x82\x84", "ya"}, // や + {"\xE3\x82\x86", "yu"}, // ゆ + {"\xE3\x82\x88", "yo"}, // よ + {"\xE3\x82\x89", "ra"}, // ら + {"\xE3\x82\x8A", "ri"}, // り + {"\xE3\x82\x8A\xE3\x82\x83", "rya"}, // りゃ + {"\xE3\x82\x8A\xE3\x82\x85", "ryu"}, // りゅ + {"\xE3\x82\x8A\xE3\x82\x87", "ryo"}, // りょ + {"\xE3\x82\x8B", "ru"}, // る + {"\xE3\x82\x8C", "re"}, // れ + {"\xE3\x82\x8D", "ro"}, // ろ + {"\xE3\x82\x8F", "wa"}, // わ + {"\xE3\x82\x90", "wi"}, // ゐ + {"\xE3\x82\x91", "we"}, // ゑ + {"\xE3\x82\x92", "wo"}, // を + {"\xE3\x82\x93", "m"}, // ん + {"\xE3\x82\x93", "n"}, // ん + {"\xE3\x82\xA1", "a"}, // ァ + {"\xE3\x82\xA2", "a"}, // ア + {"\xE3\x82\xA3", "i"}, // ィ + {"\xE3\x82\xA4", "i"}, // イ + {"\xE3\x82\xA4\xE3\x82\xA7", "ye"}, // イェ + {"\xE3\x82\xA5", "u"}, // ゥ + {"\xE3\x82\xA6", "u"}, // ウ + {"\xE3\x82\xA6\xE3\x82\xA3", "wi"}, // ウィ + {"\xE3\x82\xA6\xE3\x82\xA7", "we"}, // ウェ + {"\xE3\x82\xA6\xE3\x82\xA9", "wo"}, // ウォ + {"\xE3\x82\xA7", "e"}, // ェ + {"\xE3\x82\xA8", "e"}, // エ + {"\xE3\x82\xA9", "o"}, // ォ + {"\xE3\x82\xAA", "o"}, // オ + {"\xE3\x82\xAB", "ka"}, // カ + {"\xE3\x82\xAC", "ga"}, // ガ + {"\xE3\x82\xAD", "ki"}, // キ + {"\xE3\x82\xAD\xE3\x83\xA3", "kya"}, // キャ + {"\xE3\x82\xAD\xE3\x83\xA5", "kyu"}, // キュ + {"\xE3\x82\xAD\xE3\x83\xA7", "kyo"}, // キョ + {"\xE3\x82\xAE", "gi"}, // ギ + {"\xE3\x82\xAE\xE3\x83\xA3", "gya"}, // ギャ + {"\xE3\x82\xAE\xE3\x83\xA5", "gyu"}, // ギュ + {"\xE3\x82\xAE\xE3\x83\xA7", "gyo"}, // ギョ + {"\xE3\x82\xAF", "ku"}, // ク + {"\xE3\x82\xB0", "gu"}, // グ + {"\xE3\x82\xB1", "ke"}, // ケ + {"\xE3\x82\xB2", "ge"}, // ゲ + {"\xE3\x82\xB3", "ko"}, 
// コ + {"\xE3\x82\xB4", "go"}, // ゴ + {"\xE3\x82\xB5", "sa"}, // サ + {"\xE3\x82\xB6", "za"}, // ザ + {"\xE3\x82\xB7", "shi"}, // シ + {"\xE3\x82\xB7\xE3\x82\xA7", "she"}, // シェ + {"\xE3\x82\xB7\xE3\x83\xA3", "sha"}, // シャ + {"\xE3\x82\xB7\xE3\x83\xA5", "shu"}, // シュ + {"\xE3\x82\xB7\xE3\x83\xA7", "sho"}, // ショ + {"\xE3\x82\xB8", "ji"}, // ジ + {"\xE3\x82\xB8\xE3\x82\xA7", "je"}, // ジェ + {"\xE3\x82\xB8\xE3\x83\xA3", "ja"}, // ジャ + {"\xE3\x82\xB8\xE3\x83\xA5", "ju"}, // ジュ + {"\xE3\x82\xB8\xE3\x83\xA7", "jo"}, // ジョ + {"\xE3\x82\xB9", "su"}, // ス + {"\xE3\x82\xBA", "zu"}, // ズ + {"\xE3\x82\xBB", "se"}, // セ + {"\xE3\x82\xBC", "ze"}, // ゼ + {"\xE3\x82\xBD", "so"}, // ソ + {"\xE3\x82\xBE", "zo"}, // ゾ + {"\xE3\x82\xBF", "ta"}, // タ + {"\xE3\x83\x80", "da"}, // ダ + {"\xE3\x83\x81", "chi"}, // チ + {"\xE3\x83\x81\xE3\x82\xA7", "che"}, // チェ + {"\xE3\x83\x81\xE3\x83\xA3", "cha"}, // チャ + {"\xE3\x83\x81\xE3\x83\xA5", "chu"}, // チュ + {"\xE3\x83\x81\xE3\x83\xA7", "cho"}, // チョ + {"\xE3\x83\x82", "ji"}, // ヂ + {"\xE3\x83\x82\xE3\x83\xA3", "ja"}, // ヂャ + {"\xE3\x83\x82\xE3\x83\xA5", "ju"}, // ヂュ + {"\xE3\x83\x82\xE3\x83\xA7", "jo"}, // ヂョ + {"\xE3\x83\x83", "c"}, // ッ + {"\xE3\x83\x83", "k"}, // ッ + {"\xE3\x83\x83", "p"}, // ッ + {"\xE3\x83\x83", "s"}, // ッ + {"\xE3\x83\x83", "t"}, // ッ + {"\xE3\x83\x84", "tsu"}, // ツ + {"\xE3\x83\x84\xE3\x82\xA1", "tsa"}, // ツァ + {"\xE3\x83\x84\xE3\x82\xA3", "tsi"}, // ツィ + {"\xE3\x83\x84\xE3\x82\xA7", "tse"}, // ツェ + {"\xE3\x83\x84\xE3\x82\xA9", "tso"}, // ツォ + {"\xE3\x83\x85", "zu"}, // ヅ + {"\xE3\x83\x86", "te"}, // テ + {"\xE3\x83\x86\xE3\x82\xA3", "ti"}, // ティ + {"\xE3\x83\x86\xE3\x82\xA5", "tu"}, // テゥ + {"\xE3\x83\x86\xE3\x83\xA5", "tyu"}, // テュ + {"\xE3\x83\x87", "de"}, // デ + {"\xE3\x83\x87\xE3\x82\xA3", "di"}, // ディ + {"\xE3\x83\x87\xE3\x82\xA5", "du"}, // デゥ + {"\xE3\x83\x87\xE3\x83\xA5", "dyu"}, // デュ + {"\xE3\x83\x88", "to"}, // ト + {"\xE3\x83\x89", "do"}, // ド + {"\xE3\x83\x8A", "na"}, // ナ + {"\xE3\x83\x8B", "ni"}, // ニ + 
{"\xE3\x83\x8B\xE3\x83\xA3", "nya"}, // ニャ + {"\xE3\x83\x8B\xE3\x83\xA5", "nyu"}, // ニュ + {"\xE3\x83\x8B\xE3\x83\xA7", "nyo"}, // ニョ + {"\xE3\x83\x8C", "nu"}, // ヌ + {"\xE3\x83\x8D", "ne"}, // ネ + {"\xE3\x83\x8E", "no"}, // ノ + {"\xE3\x83\x8F", "ha"}, // ハ + {"\xE3\x83\x90", "ba"}, // バ + {"\xE3\x83\x91", "pa"}, // パ + {"\xE3\x83\x92", "hi"}, // ヒ + {"\xE3\x83\x92\xE3\x83\xA3", "hya"}, // ヒャ + {"\xE3\x83\x92\xE3\x83\xA5", "hyu"}, // ヒュ + {"\xE3\x83\x92\xE3\x83\xA7", "hyo"}, // ヒョ + {"\xE3\x83\x93", "bi"}, // ビ + {"\xE3\x83\x93\xE3\x83\xA3", "bya"}, // ビャ + {"\xE3\x83\x93\xE3\x83\xA5", "byu"}, // ビュ + {"\xE3\x83\x93\xE3\x83\xA7", "byo"}, // ビョ + {"\xE3\x83\x94", "pi"}, // ピ + {"\xE3\x83\x94\xE3\x83\xA3", "pya"}, // ピャ + {"\xE3\x83\x94\xE3\x83\xA5", "pyu"}, // ピュ + {"\xE3\x83\x94\xE3\x83\xA7", "pyo"}, // ピョ + {"\xE3\x83\x95", "fu"}, // フ + {"\xE3\x83\x95\xE3\x82\xA1", "fa"}, // ファ + {"\xE3\x83\x95\xE3\x82\xA3", "fi"}, // フィ + {"\xE3\x83\x95\xE3\x82\xA7", "fe"}, // フェ + {"\xE3\x83\x95\xE3\x82\xA9", "fo"}, // フォ + {"\xE3\x83\x95\xE3\x83\xA5", "fyu"}, // フュ + {"\xE3\x83\x96", "bu"}, // ブ + {"\xE3\x83\x97", "pu"}, // プ + {"\xE3\x83\x98", "he"}, // ヘ + {"\xE3\x83\x99", "be"}, // ベ + {"\xE3\x83\x9A", "pe"}, // ペ + {"\xE3\x83\x9B", "ho"}, // ホ + {"\xE3\x83\x9C", "bo"}, // ボ + {"\xE3\x83\x9D", "po"}, // ポ + {"\xE3\x83\x9E", "ma"}, // マ + {"\xE3\x83\x9F", "mi"}, // ミ + {"\xE3\x83\x9F\xE3\x83\xA3", "mya"}, // ミャ + {"\xE3\x83\x9F\xE3\x83\xA5", "myu"}, // ミュ + {"\xE3\x83\x9F\xE3\x83\xA7", "myo"}, // ミョ + {"\xE3\x83\xA0", "mu"}, // ム + {"\xE3\x83\xA1", "me"}, // メ + {"\xE3\x83\xA2", "mo"}, // モ + {"\xE3\x83\xA4", "ya"}, // ヤ + {"\xE3\x83\xA6", "yu"}, // ユ + {"\xE3\x83\xA8", "yo"}, // ヨ + {"\xE3\x83\xA9", "ra"}, // ラ + {"\xE3\x83\xAA", "ri"}, // リ + {"\xE3\x83\xAA\xE3\x83\xA3", "rya"}, // リャ + {"\xE3\x83\xAA\xE3\x83\xA5", "ryu"}, // リュ + {"\xE3\x83\xAA\xE3\x83\xA7", "ryo"}, // リョ + {"\xE3\x83\xAB", "ru"}, // ル + {"\xE3\x83\xAC", "re"}, // レ + {"\xE3\x83\xAD", "ro"}, // ロ + 
{"\xE3\x83\xAF", "wa"}, // ワ + {"\xE3\x83\xB0", "wi"}, // ヰ + {"\xE3\x83\xB1", "we"}, // ヱ + {"\xE3\x83\xB2", "wo"}, // ヲ + {"\xE3\x83\xB3", "m"}, // ン + {"\xE3\x83\xB3", "n"}, // ン + {"\xE3\x83\xB4", "vu"}, // ヴ + {"\xE3\x83\xB4\xE3\x82\xA1", "va"}, // ヴァ + {"\xE3\x83\xB4\xE3\x82\xA3", "vi"}, // ヴィ + {"\xE3\x83\xB4\xE3\x82\xA7", "ve"}, // ヴェ + {"\xE3\x83\xB4\xE3\x82\xA9", "vo"}, // ヴォ + {"\xE3\x83\xB4\xE3\x83\xA3", "vya"}, // ヴャ + {"\xE3\x83\xB4\xE3\x83\xA5", "vyu"}, // ヴュ + {"\xE3\x83\xB4\xE3\x83\xA7", "vyo"}, // ヴョ + {"\xE3\x83\xBC", "a"}, // ー + {"\xE3\x83\xBC", "e"}, // ー + {"\xE3\x83\xBC", "i"}, // ー + {"\xE3\x83\xBC", "o"}, // ー + {"\xE3\x83\xBC", "u"}, // ー +}; + +agi::kana_pair romaji_to_kana[] = { + {"\xE3\x81\x81", "a"}, // ぁ + {"\xE3\x81\x82", "a"}, // あ + {"\xE3\x82\xA1", "a"}, // ァ + {"\xE3\x82\xA2", "a"}, // ア + {"\xE3\x83\xBC", "a"}, // ー + {"\xE3\x81\xB0", "ba"}, // ば + {"\xE3\x83\x90", "ba"}, // バ + {"\xE3\x81\xB9", "be"}, // べ + {"\xE3\x83\x99", "be"}, // ベ + {"\xE3\x81\xB3", "bi"}, // び + {"\xE3\x83\x93", "bi"}, // ビ + {"\xE3\x81\xBC", "bo"}, // ぼ + {"\xE3\x83\x9C", "bo"}, // ボ + {"\xE3\x81\xB6", "bu"}, // ぶ + {"\xE3\x83\x96", "bu"}, // ブ + {"\xE3\x81\xB3\xE3\x82\x83", "bya"}, // びゃ + {"\xE3\x83\x93\xE3\x83\xA3", "bya"}, // ビャ + {"\xE3\x81\xB3\xE3\x82\x87", "byo"}, // びょ + {"\xE3\x83\x93\xE3\x83\xA7", "byo"}, // ビョ + {"\xE3\x81\xB3\xE3\x82\x85", "byu"}, // びゅ + {"\xE3\x83\x93\xE3\x83\xA5", "byu"}, // ビュ + {"\xE3\x81\xA3", "c"}, // っ + {"\xE3\x83\x83", "c"}, // ッ + {"\xE3\x81\xA1\xE3\x82\x83", "cha"}, // ちゃ + {"\xE3\x83\x81\xE3\x83\xA3", "cha"}, // チャ + {"\xE3\x83\x81\xE3\x82\xA7", "che"}, // チェ + {"\xE3\x81\xA1", "chi"}, // ち + {"\xE3\x83\x81", "chi"}, // チ + {"\xE3\x81\xA1\xE3\x82\x87", "cho"}, // ちょ + {"\xE3\x83\x81\xE3\x83\xA7", "cho"}, // チョ + {"\xE3\x81\xA1\xE3\x82\x85", "chu"}, // ちゅ + {"\xE3\x83\x81\xE3\x83\xA5", "chu"}, // チュ + {"\xE3\x81\xA0", "da"}, // だ + {"\xE3\x83\x80", "da"}, // ダ + {"\xE3\x81\xA7", "de"}, // で + {"\xE3\x83\x87", 
"de"}, // デ + {"\xE3\x83\x87\xE3\x82\xA3", "di"}, // ディ + {"\xE3\x81\xA9", "do"}, // ど + {"\xE3\x83\x89", "do"}, // ド + {"\xE3\x83\x87\xE3\x82\xA5", "du"}, // デゥ + {"\xE3\x83\x87\xE3\x83\xA5", "dyu"}, // デュ + {"\xE3\x81\x87", "e"}, // ぇ + {"\xE3\x81\x88", "e"}, // え + {"\xE3\x82\xA7", "e"}, // ェ + {"\xE3\x82\xA8", "e"}, // エ + {"\xE3\x83\xBC", "e"}, // ー + {"\xE3\x83\x95\xE3\x82\xA1", "fa"}, // ファ + {"\xE3\x83\x95\xE3\x82\xA7", "fe"}, // フェ + {"\xE3\x83\x95\xE3\x82\xA3", "fi"}, // フィ + {"\xE3\x83\x95\xE3\x82\xA9", "fo"}, // フォ + {"\xE3\x81\xB5", "fu"}, // ふ + {"\xE3\x83\x95", "fu"}, // フ + {"\xE3\x83\x95\xE3\x83\xA5", "fyu"}, // フュ + {"\xE3\x81\x8C", "ga"}, // が + {"\xE3\x82\xAC", "ga"}, // ガ + {"\xE3\x81\x92", "ge"}, // げ + {"\xE3\x82\xB2", "ge"}, // ゲ + {"\xE3\x81\x8E", "gi"}, // ぎ + {"\xE3\x82\xAE", "gi"}, // ギ + {"\xE3\x81\x94", "go"}, // ご + {"\xE3\x82\xB4", "go"}, // ゴ + {"\xE3\x81\x90", "gu"}, // ぐ + {"\xE3\x82\xB0", "gu"}, // グ + {"\xE3\x81\x8E\xE3\x82\x83", "gya"}, // ぎゃ + {"\xE3\x82\xAE\xE3\x83\xA3", "gya"}, // ギャ + {"\xE3\x81\x8E\xE3\x82\x87", "gyo"}, // ぎょ + {"\xE3\x82\xAE\xE3\x83\xA7", "gyo"}, // ギョ + {"\xE3\x81\x8E\xE3\x82\x85", "gyu"}, // ぎゅ + {"\xE3\x82\xAE\xE3\x83\xA5", "gyu"}, // ギュ + {"\xE3\x81\xAF", "ha"}, // は + {"\xE3\x83\x8F", "ha"}, // ハ + {"\xE3\x81\xB8", "he"}, // へ + {"\xE3\x83\x98", "he"}, // ヘ + {"\xE3\x81\xB2", "hi"}, // ひ + {"\xE3\x83\x92", "hi"}, // ヒ + {"\xE3\x81\xBB", "ho"}, // ほ + {"\xE3\x83\x9B", "ho"}, // ホ + {"\xE3\x81\xB2\xE3\x82\x83", "hya"}, // ひゃ + {"\xE3\x83\x92\xE3\x83\xA3", "hya"}, // ヒャ + {"\xE3\x81\xB2\xE3\x82\x87", "hyo"}, // ひょ + {"\xE3\x83\x92\xE3\x83\xA7", "hyo"}, // ヒョ + {"\xE3\x81\xB2\xE3\x82\x85", "hyu"}, // ひゅ + {"\xE3\x83\x92\xE3\x83\xA5", "hyu"}, // ヒュ + {"\xE3\x81\x83", "i"}, // ぃ + {"\xE3\x81\x84", "i"}, // い + {"\xE3\x82\xA3", "i"}, // ィ + {"\xE3\x82\xA4", "i"}, // イ + {"\xE3\x83\xBC", "i"}, // ー + {"\xE3\x81\x98\xE3\x82\x83", "ja"}, // じゃ + {"\xE3\x81\xA2\xE3\x82\x83", "ja"}, // ぢゃ + 
{"\xE3\x82\xB8\xE3\x83\xA3", "ja"}, // ジャ + {"\xE3\x83\x82\xE3\x83\xA3", "ja"}, // ヂャ + {"\xE3\x82\xB8\xE3\x82\xA7", "je"}, // ジェ + {"\xE3\x81\x98", "ji"}, // じ + {"\xE3\x81\xA2", "ji"}, // ぢ + {"\xE3\x82\xB8", "ji"}, // ジ + {"\xE3\x83\x82", "ji"}, // ヂ + {"\xE3\x81\x98\xE3\x82\x87", "jo"}, // じょ + {"\xE3\x81\xA2\xE3\x82\x87", "jo"}, // ぢょ + {"\xE3\x82\xB8\xE3\x83\xA7", "jo"}, // ジョ + {"\xE3\x83\x82\xE3\x83\xA7", "jo"}, // ヂョ + {"\xE3\x81\x98\xE3\x82\x85", "ju"}, // じゅ + {"\xE3\x81\xA2\xE3\x82\x85", "ju"}, // ぢゅ + {"\xE3\x82\xB8\xE3\x83\xA5", "ju"}, // ジュ + {"\xE3\x83\x82\xE3\x83\xA5", "ju"}, // ヂュ + {"\xE3\x81\xA3", "k"}, // っ + {"\xE3\x83\x83", "k"}, // ッ + {"\xE3\x81\x8B", "ka"}, // か + {"\xE3\x82\xAB", "ka"}, // カ + {"\xE3\x81\x91", "ke"}, // け + {"\xE3\x82\xB1", "ke"}, // ケ + {"\xE3\x81\x8D", "ki"}, // き + {"\xE3\x82\xAD", "ki"}, // キ + {"\xE3\x81\x93", "ko"}, // こ + {"\xE3\x82\xB3", "ko"}, // コ + {"\xE3\x81\x8F", "ku"}, // く + {"\xE3\x82\xAF", "ku"}, // ク + {"\xE3\x81\x8D\xE3\x82\x83", "kya"}, // きゃ + {"\xE3\x82\xAD\xE3\x83\xA3", "kya"}, // キャ + {"\xE3\x81\x8D\xE3\x82\x87", "kyo"}, // きょ + {"\xE3\x82\xAD\xE3\x83\xA7", "kyo"}, // キョ + {"\xE3\x81\x8D\xE3\x82\x85", "kyu"}, // きゅ + {"\xE3\x82\xAD\xE3\x83\xA5", "kyu"}, // キュ + {"\xE3\x82\x93", "m"}, // ん + {"\xE3\x83\xB3", "m"}, // ン + {"\xE3\x81\xBE", "ma"}, // ま + {"\xE3\x83\x9E", "ma"}, // マ + {"\xE3\x82\x81", "me"}, // め + {"\xE3\x83\xA1", "me"}, // メ + {"\xE3\x81\xBF", "mi"}, // み + {"\xE3\x83\x9F", "mi"}, // ミ + {"\xE3\x82\x82", "mo"}, // も + {"\xE3\x83\xA2", "mo"}, // モ + {"\xE3\x82\x80", "mu"}, // む + {"\xE3\x83\xA0", "mu"}, // ム + {"\xE3\x81\xBF\xE3\x82\x83", "mya"}, // みゃ + {"\xE3\x83\x9F\xE3\x83\xA3", "mya"}, // ミャ + {"\xE3\x81\xBF\xE3\x82\x87", "myo"}, // みょ + {"\xE3\x83\x9F\xE3\x83\xA7", "myo"}, // ミョ + {"\xE3\x81\xBF\xE3\x82\x85", "myu"}, // みゅ + {"\xE3\x83\x9F\xE3\x83\xA5", "myu"}, // ミュ + {"\xE3\x82\x93", "n"}, // ん + {"\xE3\x83\xB3", "n"}, // ン + {"\xE3\x81\xAA", "na"}, // な + {"\xE3\x83\x8A", 
"na"}, // ナ + {"\xE3\x81\xAD", "ne"}, // ね + {"\xE3\x83\x8D", "ne"}, // ネ + {"\xE3\x81\xAB", "ni"}, // に + {"\xE3\x83\x8B", "ni"}, // ニ + {"\xE3\x81\xAE", "no"}, // の + {"\xE3\x83\x8E", "no"}, // ノ + {"\xE3\x81\xAC", "nu"}, // ぬ + {"\xE3\x83\x8C", "nu"}, // ヌ + {"\xE3\x81\xAB\xE3\x82\x83", "nya"}, // にゃ + {"\xE3\x83\x8B\xE3\x83\xA3", "nya"}, // ニャ + {"\xE3\x81\xAB\xE3\x82\x87", "nyo"}, // にょ + {"\xE3\x83\x8B\xE3\x83\xA7", "nyo"}, // ニョ + {"\xE3\x81\xAB\xE3\x82\x85", "nyu"}, // にゅ + {"\xE3\x83\x8B\xE3\x83\xA5", "nyu"}, // ニュ + {"\xE3\x81\x89", "o"}, // ぉ + {"\xE3\x81\x8A", "o"}, // お + {"\xE3\x82\xA9", "o"}, // ォ + {"\xE3\x82\xAA", "o"}, // オ + {"\xE3\x83\xBC", "o"}, // ー + {"\xE3\x81\xA3", "p"}, // っ + {"\xE3\x83\x83", "p"}, // ッ + {"\xE3\x81\xB1", "pa"}, // ぱ + {"\xE3\x83\x91", "pa"}, // パ + {"\xE3\x81\xBA", "pe"}, // ぺ + {"\xE3\x83\x9A", "pe"}, // ペ + {"\xE3\x81\xB4", "pi"}, // ぴ + {"\xE3\x83\x94", "pi"}, // ピ + {"\xE3\x81\xBD", "po"}, // ぽ + {"\xE3\x83\x9D", "po"}, // ポ + {"\xE3\x81\xB7", "pu"}, // ぷ + {"\xE3\x83\x97", "pu"}, // プ + {"\xE3\x81\xB4\xE3\x82\x83", "pya"}, // ぴゃ + {"\xE3\x83\x94\xE3\x83\xA3", "pya"}, // ピャ + {"\xE3\x81\xB4\xE3\x82\x87", "pyo"}, // ぴょ + {"\xE3\x83\x94\xE3\x83\xA7", "pyo"}, // ピョ + {"\xE3\x81\xB4\xE3\x82\x85", "pyu"}, // ぴゅ + {"\xE3\x83\x94\xE3\x83\xA5", "pyu"}, // ピュ + {"\xE3\x82\x89", "ra"}, // ら + {"\xE3\x83\xA9", "ra"}, // ラ + {"\xE3\x82\x8C", "re"}, // れ + {"\xE3\x83\xAC", "re"}, // レ + {"\xE3\x82\x8A", "ri"}, // り + {"\xE3\x83\xAA", "ri"}, // リ + {"\xE3\x82\x8D", "ro"}, // ろ + {"\xE3\x83\xAD", "ro"}, // ロ + {"\xE3\x82\x8B", "ru"}, // る + {"\xE3\x83\xAB", "ru"}, // ル + {"\xE3\x82\x8A\xE3\x82\x83", "rya"}, // りゃ + {"\xE3\x83\xAA\xE3\x83\xA3", "rya"}, // リャ + {"\xE3\x82\x8A\xE3\x82\x87", "ryo"}, // りょ + {"\xE3\x83\xAA\xE3\x83\xA7", "ryo"}, // リョ + {"\xE3\x82\x8A\xE3\x82\x85", "ryu"}, // りゅ + {"\xE3\x83\xAA\xE3\x83\xA5", "ryu"}, // リュ + {"\xE3\x81\xA3", "s"}, // っ + {"\xE3\x83\x83", "s"}, // ッ + {"\xE3\x81\x95", "sa"}, // さ + 
{"\xE3\x82\xB5", "sa"}, // サ + {"\xE3\x81\x9B", "se"}, // せ + {"\xE3\x82\xBB", "se"}, // セ + {"\xE3\x81\x97\xE3\x82\x83", "sha"}, // しゃ + {"\xE3\x82\xB7\xE3\x83\xA3", "sha"}, // シャ + {"\xE3\x82\xB7\xE3\x82\xA7", "she"}, // シェ + {"\xE3\x81\x97", "shi"}, // し + {"\xE3\x82\xB7", "shi"}, // シ + {"\xE3\x81\x97\xE3\x82\x87", "sho"}, // しょ + {"\xE3\x82\xB7\xE3\x83\xA7", "sho"}, // ショ + {"\xE3\x81\x97\xE3\x82\x85", "shu"}, // しゅ + {"\xE3\x82\xB7\xE3\x83\xA5", "shu"}, // シュ + {"\xE3\x81\x9D", "so"}, // そ + {"\xE3\x82\xBD", "so"}, // ソ + {"\xE3\x81\x99", "su"}, // す + {"\xE3\x82\xB9", "su"}, // ス + {"\xE3\x81\xA3", "t"}, // っ + {"\xE3\x83\x83", "t"}, // ッ + {"\xE3\x81\x9F", "ta"}, // た + {"\xE3\x82\xBF", "ta"}, // タ + {"\xE3\x81\xA6", "te"}, // て + {"\xE3\x83\x86", "te"}, // テ + {"\xE3\x83\x86\xE3\x82\xA3", "ti"}, // ティ + {"\xE3\x81\xA8", "to"}, // と + {"\xE3\x83\x88", "to"}, // ト + {"\xE3\x83\x84\xE3\x82\xA1", "tsa"}, // ツァ + {"\xE3\x83\x84\xE3\x82\xA7", "tse"}, // ツェ + {"\xE3\x83\x84\xE3\x82\xA3", "tsi"}, // ツィ + {"\xE3\x83\x84\xE3\x82\xA9", "tso"}, // ツォ + {"\xE3\x81\xA4", "tsu"}, // つ + {"\xE3\x83\x84", "tsu"}, // ツ + {"\xE3\x83\x86\xE3\x82\xA5", "tu"}, // テゥ + {"\xE3\x83\x86\xE3\x83\xA5", "tyu"}, // テュ + {"\xE3\x81\x85", "u"}, // ぅ + {"\xE3\x81\x86", "u"}, // う + {"\xE3\x82\xA5", "u"}, // ゥ + {"\xE3\x82\xA6", "u"}, // ウ + {"\xE3\x83\xBC", "u"}, // ー + {"\xE3\x83\xB4\xE3\x82\xA1", "va"}, // ヴァ + {"\xE3\x83\xB4\xE3\x82\xA7", "ve"}, // ヴェ + {"\xE3\x83\xB4\xE3\x82\xA3", "vi"}, // ヴィ + {"\xE3\x83\xB4\xE3\x82\xA9", "vo"}, // ヴォ + {"\xE3\x83\xB4", "vu"}, // ヴ + {"\xE3\x83\xB4\xE3\x83\xA3", "vya"}, // ヴャ + {"\xE3\x83\xB4\xE3\x83\xA7", "vyo"}, // ヴョ + {"\xE3\x83\xB4\xE3\x83\xA5", "vyu"}, // ヴュ + {"\xE3\x81\xAF", "wa"}, // は + {"\xE3\x82\x8F", "wa"}, // わ + {"\xE3\x83\xAF", "wa"}, // ワ + {"\xE3\x82\x91", "we"}, // ゑ + {"\xE3\x82\xA6\xE3\x82\xA7", "we"}, // ウェ + {"\xE3\x83\xB1", "we"}, // ヱ + {"\xE3\x82\x90", "wi"}, // ゐ + {"\xE3\x82\xA6\xE3\x82\xA3", "wi"}, // ウィ + 
{"\xE3\x83\xB0", "wi"}, // ヰ + {"\xE3\x82\x92", "wo"}, // を + {"\xE3\x82\xA6\xE3\x82\xA9", "wo"}, // ウォ + {"\xE3\x83\xB2", "wo"}, // ヲ + {"\xE3\x82\x84", "ya"}, // や + {"\xE3\x83\xA4", "ya"}, // ヤ + {"\xE3\x82\xA4\xE3\x82\xA7", "ye"}, // イェ + {"\xE3\x82\x88", "yo"}, // よ + {"\xE3\x83\xA8", "yo"}, // ヨ + {"\xE3\x82\x86", "yu"}, // ゆ + {"\xE3\x83\xA6", "yu"}, // ユ + {"\xE3\x81\x96", "za"}, // ざ + {"\xE3\x82\xB6", "za"}, // ザ + {"\xE3\x81\x9C", "ze"}, // ぜ + {"\xE3\x82\xBC", "ze"}, // ゼ + {"\xE3\x81\x9E", "zo"}, // ぞ + {"\xE3\x82\xBE", "zo"}, // ゾ + {"\xE3\x81\x9A", "zu"}, // ず + {"\xE3\x81\xA5", "zu"}, // づ + {"\xE3\x82\xBA", "zu"}, // ズ + {"\xE3\x83\x85", "zu"}, // ヅ +}; + +bool cmp_kana(agi::kana_pair const& kp, std::string const& kana) { + return strcmp(kp.kana, kana.c_str()) < 0; +} + +struct cmp_romaji { + bool operator()(agi::kana_pair const& kp, std::string const& romaji) const { + return strcmp(kp.romaji, romaji.c_str()) < 0; + } + bool operator()(std::string const& romaji, agi::kana_pair const& kp) const { + return strcmp(kp.romaji, romaji.c_str()) > 0; + } + +#ifdef _MSC_VER // debug iterator stuff needs this overload + bool operator()(agi::kana_pair const& a, agi::kana_pair const& b) const { + return strcmp(a.romaji, b.romaji) < 0; + } +#endif +}; + +} + +namespace agi { +std::vector kana_to_romaji(std::string const& kana) { + std::vector ret; + for (auto pair = boost::lower_bound(::kana_to_romaji, kana, cmp_kana); + pair != std::end(::kana_to_romaji) && !strcmp(pair->kana, kana.c_str()); + ++pair) + ret.push_back(pair->romaji); + return ret; +} + +boost::iterator_range romaji_to_kana(std::string const& romaji) { + for (size_t len = std::min(3, romaji.size()); len > 0; --len) { + auto pair = boost::equal_range(::romaji_to_kana, romaji.substr(0, len).c_str(), cmp_romaji()); + if (pair.first != pair.second) + return boost::make_iterator_range(pair.first, pair.second); + } + return boost::make_iterator_range(::romaji_to_kana, ::romaji_to_kana); +} +} diff 
--git a/aegisub/libaegisub/common/karaoke_matcher.cpp b/aegisub/libaegisub/common/karaoke_matcher.cpp new file mode 100644 index 000000000..68f2f3524 --- /dev/null +++ b/aegisub/libaegisub/common/karaoke_matcher.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// Aegisub Project http://www.aegisub.org/ + +#include "../config.h" + +#include "libaegisub/karaoke_matcher.h" + +#include "libaegisub/kana_table.h" +#include "libaegisub/util.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { +int32_t next_codepoint(const char *str, size_t *i) { + UChar32 c; + U8_NEXT_UNSAFE(str, *i, c); + return c; +} + +bool is_whitespace(int32_t c) { + return !!u_isUWhiteSpace(c); +} + +bool is_whitespace(std::string const& str) { + size_t i = 0; + while (auto c = next_codepoint(str.c_str(), &i)) { + if (!u_isUWhiteSpace(c)) + return false; + } + return true; +} + +// strcmp but ignoring case and accents +int compare(std::string const& a, std::string const& b) { + using namespace boost::locale; + return std::use_facet>(std::locale()).compare(collator_base::primary, a, b); +} + +} + +namespace agi { + +karaoke_match_result auto_match_karaoke(std::vector const& source_strings, std::string const& dest_string) { + karaoke_match_result result = { 0, 0 }; + if (source_strings.empty()) return result; + + using namespace boost::locale::boundary; + using boost::starts_with; + + result.source_length = 1; + ssegment_index destination_characters(character, begin(dest_string), end(dest_string)); + auto src = boost::to_lower_copy(source_strings[0]); + auto dst = destination_characters.begin(); + auto dst_end = destination_characters.end(); + + // Eat all the whitespace at the beginning of the source and destination + // syllables and exit if either ran out. 
+ auto eat_whitespace = [&]() -> bool { + size_t i = 0, first_non_whitespace = 0; + while (is_whitespace(next_codepoint(src.c_str(), &i))) + first_non_whitespace = i; + if (first_non_whitespace) + src = src.substr(first_non_whitespace); + + while (dst != dst_end && is_whitespace(dst->str())) { + ++dst; + ++result.destination_length; + } + + // If we ran out of dest then this needs to match the rest of the + // source syllables (this probably means the user did something wrong) + if (dst == dst_end) { + result.source_length = source_strings.size(); + return true; + } + + return src.empty(); + }; + + if (eat_whitespace()) return result; + + // We now have a non-whitespace character at the beginning of both source + // and destination. Check if the source starts with a romanized kana, and + // if it does then check if the destination also has the appropriate + // character. If it does, match them and repeat. + while (!src.empty()) { + // First check for a basic match of the first character of the source and dest + auto first_src_char = ssegment_index(character, begin(src), end(src)).begin()->str(); + if (compare(first_src_char, dst->str()) == 0) { + ++dst; + ++result.destination_length; + src.erase(0, first_src_char.size()); + if (eat_whitespace()) return result; + continue; + } + + auto check = [&](kana_pair const& kp) -> bool { + if (!starts_with(&*dst->begin(), kp.kana)) return false; + + src = src.substr(strlen(kp.romaji)); + for (size_t i = 0; kp.kana[i]; ) { + i += dst->length(); + ++result.destination_length; + ++dst; + } + return true; + }; + + bool matched = false; + for (auto const& match : romaji_to_kana(src)) { + if (check(match)) { + if (eat_whitespace()) return result; + matched = true; + break; + } + } + if (!matched) break; + } + + // Source and dest are now non-empty and start with non-whitespace. + // If there's only one character left in the dest, it obviously needs to + // match all of the source syllables left. 
+ if (std::distance(dst, dst_end) == 1) { + result.source_length = source_strings.size(); + ++result.destination_length; + return result; + } + + // We couldn't match the current character, but if we can match the *next* + // syllable then we know that everything in between must belong to the + // current syllable. Do this by looking up to dst_lookahead_max + // characters ahead in destination and seeing if we can match them against + // the beginning of a syllable after this syllable. + // If a match is found, make a guess at how much source and destination + // should be selected based on the distances it was found at. + + // The longest kanji are 'uketamawa.ru' and 'kokorozashi', each with a + // reading consisting of five kana. This means each character from + // the destination can match at most five syllables from the source. + static const int max_character_length = 5; + + // Arbitrarily chosen limit on the number of dest characters to try + // skipping. Higher numbers probably increase false-positives. 
+ static const int dst_lookahead_max = 3; + + for (size_t lookahead = 0; lookahead < dst_lookahead_max; ++lookahead) { + if (++dst == dst_end) break; + + // Transliterate this character if it's a known hiragana or katakana character + std::vector translit; + auto next = std::next(dst); + if (next != dst_end) + boost::copy(kana_to_romaji(dst->str() + next->str()), back_inserter(translit)); + boost::copy(kana_to_romaji(dst->str()), back_inserter(translit)); + + // Search for it and the transliterated version in the source + int src_lookahead_max = (lookahead + 1) * max_character_length; + int src_lookahead_pos = 0; + for (auto const& syl : source_strings) { + // Don't count blank syllables in the max search distance + if (is_whitespace(syl)) continue; + if (++src_lookahead_pos == 1) continue; + if (src_lookahead_pos > src_lookahead_max) break; + + std::string lsyl = boost::to_lower_copy(syl); + if (!(starts_with(syl, dst->str()) || util::any_of(translit, [&](const char *str) { return starts_with(lsyl, str); }))) + continue; + + // The syllable immediately after the current one matched, so + // everything up to the match must go with the current syllable. 
+ if (src_lookahead_pos == 2) { + result.destination_length += lookahead + 1; + return result; + } + + // The match was multiple syllables ahead, so just divide the + // destination characters evenly between the source syllables + result.destination_length += 1; + result.source_length = static_cast((src_lookahead_pos - 1.0) / (lookahead + 1.0) + .5); + return result; + } + } + + // We wouldn't have gotten here if the dest was empty, so make sure at + // least one character is selected + result.destination_length = std::max(result.destination_length, 1u); + + return result; +} +} diff --git a/aegisub/libaegisub/include/libaegisub/kana_table.h b/aegisub/libaegisub/include/libaegisub/kana_table.h new file mode 100644 index 000000000..a826819be --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/kana_table.h @@ -0,0 +1,30 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// Aegisub Project http://www.aegisub.org/ + +#include +#include + +namespace agi { + struct kana_pair { + const char *kana; + const char *romaji; + }; + + /// Transliterated romaji for the given kana (presumably an empty vector if not applicable -- TODO confirm) + std::vector kana_to_romaji(std::string const& kana); + + boost::iterator_range romaji_to_kana(std::string const& romaji); +} diff --git a/aegisub/libaegisub/include/libaegisub/karaoke_matcher.h b/aegisub/libaegisub/include/libaegisub/karaoke_matcher.h new file mode 100644 index 000000000..527cac18f --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/karaoke_matcher.h @@ -0,0 +1,30 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// Aegisub Project http://www.aegisub.org/ + +#include +#include + +namespace agi { + struct karaoke_match_result { + /// The number of strings in the source matched + size_t source_length; + /// The number of characters in the destination string matched + size_t destination_length; + }; + + /// Try to automatically select the portion of dst which corresponds to the first string in src + karaoke_match_result auto_match_karaoke(std::vector const& src, std::string const& dst); +} diff --git a/aegisub/libaegisub/include/libaegisub/util.h b/aegisub/libaegisub/include/libaegisub/util.h index 26dd2970d..32b6a56ba 100644 --- a/aegisub/libaegisub/include/libaegisub/util.h +++ b/aegisub/libaegisub/include/libaegisub/util.h @@ -75,5 +75,11 @@ namespace agi { /// elsewhere (because libstcc++ 4.7 is missing it). void sleep_for(int ms); + // boost.range doesn't have wrappers for the C++11 stuff + template + bool any_of(Range&& r, Predicate&& p) { + return std::any_of(std::begin(r), std::end(r), std::forward(p)); + } + } // namespace util } // namespace agi diff --git a/aegisub/src/Makefile b/aegisub/src/Makefile index 6782a83c4..19c8d128e 100644 --- a/aegisub/src/Makefile +++ b/aegisub/src/Makefile @@ -194,7 +194,6 @@ SRC += \ hotkey.cpp \ hotkey_data_view_model.cpp \ initial_line_state.cpp \ - kana_table.cpp \ lpeg.c \ main.cpp \ menu.cpp \ diff --git a/aegisub/src/dialog_kara_timing_copy.cpp b/aegisub/src/dialog_kara_timing_copy.cpp index 71b53b8f2..877e6bc40 100644 --- a/aegisub/src/dialog_kara_timing_copy.cpp +++ b/aegisub/src/dialog_kara_timing_copy.cpp @@ -43,12 +43,17 @@ #include "compat.h" #include "help_button.h" #include "include/aegisub/context.h" -#include "kana_table.h" #include "libresrc/libresrc.h" #include "options.h" #include "selection_controller.h" #include "utils.h" +#include + +#include +#include +#include +#include #include #include @@ -76,12 +81,13 @@ class KaraokeLineMatchDisplay : public wxControl { std::vector matched_groups; std::deque 
unmatched_source; - std::string unmatched_destination; + std::string destination_str; + boost::locale::boundary::ssegment_index destination; + boost::locale::boundary::ssegment_index::iterator match_begin, match_end; int last_total_matchgroup_render_width; size_t source_sel_length; - size_t destination_sel_length; void OnPaint(wxPaintEvent &event); @@ -96,7 +102,7 @@ public: /// Number of syllables not yet matched from source size_t GetRemainingSource() const { return unmatched_source.size(); } /// Number of characters not yet matched from destination - size_t GetRemainingDestination() const { return unmatched_destination.size(); } + size_t GetRemainingDestination() const { return distance(match_end, destination.end()); } // Adjust source and destination match lengths void IncreaseSourceMatch(); @@ -147,7 +153,7 @@ wxSize KaraokeLineMatchDisplay::GetBestSize() const return wxSize(min_width * 2, h_src + h_dst + 7); } -int DrawBoxedText(wxDC &dc, const std::string &txt, int x, int y) +int DrawBoxedText(wxDC &dc, wxString const& txt, int x, int y) { int tw, th; // Assume the pen, brush and font properties have already been set in the DC. @@ -164,10 +170,9 @@ int DrawBoxedText(wxDC &dc, const std::string &txt, int x, int y) } else { - wxString wxtxt(to_wx(txt)); - dc.GetTextExtent(wxtxt, &tw, &th); + dc.GetTextExtent(txt, &tw, &th); dc.DrawRectangle(x, y-2, tw+4, th+4); - dc.DrawText(wxtxt, x+2, y); + dc.DrawText(txt, x+2, y); return tw+3; } } @@ -256,11 +261,11 @@ void KaraokeLineMatchDisplay::OnPaint(wxPaintEvent &) // Matched source syllables int syl_x = next_x; for (auto const& syl : grp.src) - syl_x += DrawBoxedText(dc, syl.text, syl_x, y_line1); + syl_x += DrawBoxedText(dc, to_wx(syl.text), syl_x, y_line1); // Matched destination text { - const int adv = DrawBoxedText(dc, grp.dst, next_x, y_line2); + const int adv = DrawBoxedText(dc, to_wx(grp.dst), next_x, y_line2); // Adjust next_x here while we have the text_w next_x = syl_x > next_x + adv ? 
syl_x : next_x + adv; @@ -292,24 +297,30 @@ void KaraokeLineMatchDisplay::OnPaint(wxPaintEvent &) dc.SetBrush(wxBrush(inner_back)); } - syl_x += DrawBoxedText(dc, unmatched_source[j].text, syl_x, y_line1); + syl_x += DrawBoxedText(dc, to_wx(unmatched_source[j].text), syl_x, y_line1); } // Remaining destination - if (!unmatched_destination.empty()) + if (match_begin != match_end) { dc.SetTextBackground(sel_back); dc.SetTextForeground(sel_text); dc.SetBrush(wxBrush(sel_back)); - next_x += DrawBoxedText(dc, unmatched_destination.substr(0, destination_sel_length), next_x, y_line2); + wxString str; + for (auto it = match_begin; it != match_end; ++it) + str += to_wx(it->str()); + next_x += DrawBoxedText(dc, str, next_x, y_line2); } - if (destination_sel_length < unmatched_destination.size()) + if (match_end != destination.end()) { dc.SetTextBackground(inner_back); dc.SetTextForeground(inner_text); dc.SetBrush(wxBrush(inner_back)); - DrawBoxedText(dc, unmatched_destination.substr(destination_sel_length), next_x, y_line2); + wxString str; + for (auto it = match_end; it != destination.end(); ++it) + str += to_wx(it->str()); + DrawBoxedText(dc, str, next_x, y_line2); } } @@ -328,8 +339,12 @@ void KaraokeLineMatchDisplay::SetInputData(AssDialogue *src, AssDialogue *dst) source_sel_length = 1; } - unmatched_destination = dst ? dst->GetStrippedText() : ""; - destination_sel_length = std::max(1, unmatched_destination.size()); + destination_str = dst ? 
dst->GetStrippedText() : ""; + using namespace boost::locale::boundary; + destination = ssegment_index(character, begin(destination_str), end(destination_str)); + match_begin = match_end = destination.begin(); + if (!destination_str.empty()) + ++match_end; Refresh(true); } @@ -363,182 +378,34 @@ void KaraokeLineMatchDisplay::DecreaseSourceMatch() void KaraokeLineMatchDisplay::IncreseDestinationMatch() { - destination_sel_length = std::min(destination_sel_length + 1, GetRemainingDestination()); - Refresh(true); + if (match_end != destination.end()) { + ++match_end; + Refresh(true); + } } void KaraokeLineMatchDisplay::DecreaseDestinationMatch() { - destination_sel_length = std::max(destination_sel_length, 1) - 1; - Refresh(true); + if (match_end != match_begin) { + --match_end; + Refresh(true); + } } -/// Kana interpolation, in characters, unset to disable -#define KANA_SEARCH_DISTANCE 3 - void KaraokeLineMatchDisplay::AutoMatchJapanese() { - if (unmatched_source.size() < 1) return; - - // Quick escape: If there's no destination left, take all remaining source. - // (Usually this means the user made a mistake.) - if (unmatched_destination.empty()) - { - source_sel_length = unmatched_source.size(); - destination_sel_length = 0; - return; - } - - // We'll first see if we can do something with the first unmatched source syllable - wxString src(to_wx(unmatched_source[0].text).Lower()); - wxString dst(to_wx(unmatched_destination)); - source_sel_length = 1; // we're working on the first, assume it was matched - destination_sel_length = 0; - - // Quick escape: If the source syllable is empty, return with first source syllable and empty destination - if (src.empty()) return; - - // Try to match the next source syllable against the destination. Do it - // "inverted": try all kana from the table and prefix-match them against - // the destination, then if it matches a prefix, try to match the hepburn - // for it agast the source; eat if it matches. 
Keep trying to match as - // long as there's text left in the source syllable or matching fails. - while (src.size() > 0) - { - wxString dst_hira_rest, dst_kata_rest, src_rest; - bool matched = false; - for (const KanaEntry *ke = KanaTable; ke->hiragana; ++ke) - { - if (src.StartsWith(ke->hepburn, &src_rest)) - { - bool hira_matches = dst.StartsWith(ke->hiragana, &dst_hira_rest) && *ke->hiragana; - bool kata_matches = dst.StartsWith(ke->katakana, &dst_kata_rest); - - if (hira_matches || kata_matches) - { - matched = true; - src = src_rest; - dst = hira_matches ? dst_hira_rest : dst_kata_rest; - destination_sel_length += wcslen(hira_matches ? ke->hiragana : ke->katakana); - break; - } - } - } - if (!matched) break; - } - - // The source might be empty now: That's good! - // That means we managed to match it all against destination text - if (src.empty()) return; - // destination_sel_length already has the appropriate value - // and source_sel_length was already 1 - - // Now the source syllable might consist of just whitespace. - // Eat all whitespace at the start of the destination. - if (StringEmptyOrWhitespace(src)) - { - wxString str(to_wx(unmatched_destination.substr(destination_sel_length))); - destination_sel_length += std::distance(str.begin(), std::find_if_not(str.begin(), str.end(), IsWhitespace)); - // Now we've eaten all spaces in the destination as well - // so the selection lengths should be good - return; - } - - // If there's just one character left in the destination at this point, - // (and the source doesn't begin with space syllables, see test above) - // assume it's safe to take all remaining source to match the single - // remaining destination. - if (unmatched_destination.size() == 1) - { - source_sel_length = unmatched_source.size(); - destination_sel_length = 1; - return; - } - -#ifdef KANA_SEARCH_DISTANCE - // Try to look up to KANA_SEARCH_DISTANCE characters ahead in destination, - // see if any of those are recognised kana. 
If there are any within the - // range, see if it matches a following syllable, at most 5 source - // syllables per character in source we're ahead. - // The number 5 comes from the kanji with the longest readings: - // 'uketamawa.ru' and 'kokorozashi' which each have a reading consisting of - // five kana. - // Only match the found kana in destination against the beginning of source - // syllables, not the middle of them. - // If a match is found, make a guess at how much source and destination - // should be selected based on the distances it was found at. - dst = to_wx(unmatched_destination); - for (size_t lookahead = 0; lookahead < KANA_SEARCH_DISTANCE; ++lookahead) - { - // Eat dst at the beginning, don't test for the first character being kana - dst = dst.Mid(1); - // Find a position where hiragana or katakana matches - wxString matched_roma; - wxString matched_kana; - for (const KanaEntry *ke = KanaTable; ke->hiragana; ++ke) - { - if (*ke->hiragana && dst.StartsWith(ke->hiragana)) - { - matched_roma = ke->hepburn; - matched_kana = ke->hiragana; - break; - } - if (*ke->katakana && dst.StartsWith(ke->katakana)) - { - matched_roma = ke->hepburn; - matched_kana = ke->katakana; - break; - } - } - // If we didn't match any kana against dst, move to next char in dst - if (!matched_kana) - continue; - // Otherwise look for a match for the romaji - // For the magic number 5, see big comment block above - int src_lookahead_max = (lookahead+1)*5; - int src_lookahead_pos = 0; - for (auto const& syl : unmatched_source) - { - // Check if we've gone too far ahead in the source - if (src_lookahead_pos++ >= src_lookahead_max) break; - // Otherwise look for a match - if (to_wx(syl.text).StartsWith(matched_roma)) - { - // Yay! Time to interpolate. - // Special case: If the last source syllable before the matching one is - // empty or contains just whitespace, don't include that one. 
- if (src_lookahead_pos > 1 && StringEmptyOrWhitespace(to_wx(unmatched_source[src_lookahead_pos-2].text))) - src_lookahead_pos -= 1; - // Special case: Just one source syllable matching, pick all destination found - if (src_lookahead_pos == 2) - { - source_sel_length = 1; - destination_sel_length = lookahead+1; - return; - } - // Otherwise try to split the eaten source syllables evenly between the eaten - // destination characters, and do a regular rounding. - float src_per_dst = (float)(src_lookahead_pos-1)/(float)(lookahead+1); - source_sel_length = (int)(src_per_dst + 0.5); - destination_sel_length = 1; - return; - } - } - } -#endif - - // Okay so we didn't match anything. Aww. - // Just fail... - // We know from earlier that we do have both some source and some destination. - source_sel_length = 1; - destination_sel_length = 1; - return; + std::vector source; + for (auto const& syl : unmatched_source) + source.emplace_back(syl.text); + auto result = agi::auto_match_karaoke(source, match_begin == destination.end() ? "" : &*match_begin->begin()); + source_sel_length = result.source_length; + match_end = std::next(match_begin, result.destination_length); } bool KaraokeLineMatchDisplay::AcceptMatch() { // Completely empty match - if (source_sel_length == 0 && destination_sel_length == 0) return false; + if (source_sel_length == 0 && match_begin == match_end) return false; MatchGroup match; @@ -547,10 +414,8 @@ bool KaraokeLineMatchDisplay::AcceptMatch() unmatched_source.erase(unmatched_source.begin(), unmatched_source.begin() + source_sel_length); source_sel_length = 0; - assert(destination_sel_length <= unmatched_destination.size()); - match.dst = unmatched_destination.substr(0, destination_sel_length); - unmatched_destination.erase(0, destination_sel_length); - destination_sel_length = 0; + match.dst = std::string(match_begin->begin(), match_end == destination.end() ? 
destination_str.end() : match_end->begin()); + match_begin = match_end; matched_groups.emplace_back(std::move(match)); @@ -569,12 +434,12 @@ bool KaraokeLineMatchDisplay::UndoMatch() MatchGroup &group = matched_groups.back(); source_sel_length = group.src.size(); - destination_sel_length = group.dst.size(); - copy(group.src.rbegin(), group.src.rend(), front_inserter(unmatched_source)); group.src.clear(); - unmatched_destination = group.dst + unmatched_destination; + match_end = match_begin; + for (size_t size = group.dst.size(); size > 0; size -= match_begin->length()) + --match_begin; matched_groups.pop_back(); diff --git a/aegisub/src/kana_table.cpp b/aegisub/src/kana_table.cpp deleted file mode 100644 index a05df5a0c..000000000 --- a/aegisub/src/kana_table.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of the Aegisub Group nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Aegisub Project http://www.aegisub.org/ - -/// @file kana_table.cpp -/// @brief Data about the Japanese kana syllabary used by kanji karaoke timing copying -/// @ingroup kara_timing_copy -/// - - -#include "config.h" - -#include "kana_table.h" - -const KanaEntry KanaTable[] = -{ - // Regular kana usage and combinations - { L"\u3042", L"\u30a2", L"a" }, - { L"\u3044", L"\u30a4", L"i" }, - { L"\u3046", L"\u30a6", L"u" }, - { L"\u3048", L"\u30a8", L"e" }, - { L"\u304a", L"\u30aa", L"o" }, - - { L"\u304b", L"\u30ab", L"ka" }, - { L"\u304d", L"\u30ad", L"ki" }, - { L"\u304f", L"\u30af", L"ku" }, - { L"\u3051", L"\u30b1", L"ke" }, - { L"\u3053", L"\u30b3", L"ko" }, - - { L"\u3055", L"\u30b5", L"sa" }, - { L"\u3057", L"\u30b7", L"shi" }, - { L"\u3059", L"\u30b9", L"su" }, - { L"\u305b", L"\u30bb", L"se" }, - { L"\u305d", L"\u30bd", L"so" }, - - { L"\u305f", L"\u30bf", L"ta" }, - { L"\u3061", L"\u30c1", L"chi" }, - { L"\u3064", L"\u30c4", L"tsu" }, - { L"\u3066", L"\u30c6", L"te" }, - { L"\u3068", L"\u30c8", L"to" }, - - { L"\u306a", L"\u30ca", L"na" }, - { L"\u306b", L"\u30cb", L"ni" }, - { L"\u306c", L"\u30cc", L"nu" }, - { L"\u306d", L"\u30cd", L"ne" }, - { L"\u306e", L"\u30ce", L"no" }, - - { L"\u306f", L"\u30cf", L"ha" }, - { L"\u3072", L"\u30d2", L"hi" }, - { L"\u3075", L"\u30d5", L"fu" }, - { L"\u3078", L"\u30d8", L"he" }, - { L"\u307b", L"\u30db", L"ho" }, - - { L"\u307e", L"\u30de", L"ma" }, - { L"\u307f", L"\u30df", L"mi" 
}, - { L"\u3080", L"\u30e0", L"mu" }, - { L"\u3081", L"\u30e1", L"me" }, - { L"\u3082", L"\u30e2", L"mo" }, - - { L"\u3084", L"\u30e4", L"ya" }, - { L"\u3086", L"\u30e6", L"yu" }, - { L"\u3088", L"\u30e8", L"yo" }, - - { L"\u3089", L"\u30e9", L"ra" }, - { L"\u308a", L"\u30ea", L"ri" }, - { L"\u308b", L"\u30eb", L"ru" }, - { L"\u308c", L"\u30ec", L"re" }, - { L"\u308d", L"\u30ed", L"ro" }, - - { L"\u308f", L"\u30ef", L"wa" }, - { L"\u3090", L"\u30f0", L"wi" }, - { L"\u3091", L"\u30f1", L"we" }, - { L"\u3092", L"\u30f2", L"wo" }, - - { L"\u304c", L"\u30ac", L"ga" }, - { L"\u304e", L"\u30ae", L"gi" }, - { L"\u3050", L"\u30b0", L"gu" }, - { L"\u3052", L"\u30b2", L"ge" }, - { L"\u3054", L"\u30b4", L"go" }, - - { L"\u3056", L"\u30b6", L"za" }, - { L"\u3058", L"\u30b8", L"ji" }, - { L"\u305a", L"\u30ba", L"zu" }, - { L"\u305c", L"\u30bc", L"ze" }, - { L"\u305e", L"\u30be", L"zo" }, - - { L"\u3060", L"\u30c0", L"da" }, - { L"\u3062", L"\u30c2", L"ji" }, - { L"\u3065", L"\u30c5", L"zu" }, - { L"\u3067", L"\u30c7", L"de" }, - { L"\u3069", L"\u30c9", L"do" }, - - { L"\u3070", L"\u30d0", L"ba" }, - { L"\u3073", L"\u30d3", L"bi" }, - { L"\u3076", L"\u30d6", L"bu" }, - { L"\u3079", L"\u30d9", L"be" }, - { L"\u307c", L"\u30dc", L"bo" }, - - { L"\u3071", L"\u30d1", L"pa" }, - { L"\u3074", L"\u30d4", L"pi" }, - { L"\u3077", L"\u30d7", L"pu" }, - { L"\u307a", L"\u30da", L"pe" }, - { L"\u307d", L"\u30dd", L"po" }, - - { L"\u304d\u3083", L"\u30ad\u30e3", L"kya" }, - { L"\u304d\u3085", L"\u30ad\u30e5", L"kyu" }, - { L"\u304d\u3087", L"\u30ad\u30e7", L"kyo" }, - - { L"\u3057\u3083", L"\u30b7\u30e3", L"sha" }, - { L"\u3057\u3085", L"\u30b7\u30e5", L"shu" }, - { L"\u3057\u3087", L"\u30b7\u30e7", L"sho" }, - - { L"\u3061\u3083", L"\u30c1\u30e3", L"cha" }, - { L"\u3061\u3085", L"\u30c1\u30e5", L"chu" }, - { L"\u3061\u3087", L"\u30c1\u30e7", L"cho" }, - - { L"\u306b\u3083", L"\u30cb\u30e3", L"nya" }, - { L"\u306b\u3085", L"\u30cb\u30e5", L"nyu" }, - { L"\u306b\u3087", L"\u30cb\u30e7", L"nyo" 
}, - - { L"\u3072\u3083", L"\u30d2\u30e3", L"hya" }, - { L"\u3072\u3085", L"\u30d2\u30e5", L"hyu" }, - { L"\u3072\u3087", L"\u30d2\u30e7", L"hyo" }, - - { L"\u307f\u3083", L"\u30df\u30e3", L"mya" }, - { L"\u307f\u3085", L"\u30df\u30e5", L"myu" }, - { L"\u307f\u3087", L"\u30df\u30e7", L"myo" }, - - { L"\u308a\u3083", L"\u30ea\u30e3", L"rya" }, - { L"\u308a\u3085", L"\u30ea\u30e5", L"ryu" }, - { L"\u308a\u3087", L"\u30ea\u30e7", L"ryo" }, - - { L"\u304e\u3083", L"\u30ae\u30e3", L"gya" }, - { L"\u304e\u3085", L"\u30ae\u30e5", L"gyu" }, - { L"\u304e\u3087", L"\u30ae\u30e7", L"gyo" }, - - { L"\u3058\u3083", L"\u30b8\u30e3", L"ja" }, - { L"\u3058\u3085", L"\u30b8\u30e5", L"ju" }, - { L"\u3058\u3087", L"\u30b8\u30e7", L"jo" }, - - { L"\u3062\u3083", L"\u30c2\u30e3", L"ja" }, - { L"\u3062\u3085", L"\u30c2\u30e5", L"ju" }, - { L"\u3062\u3087", L"\u30c2\u30e7", L"jo" }, - - { L"\u3073\u3083", L"\u30d3\u30e3", L"bya" }, - { L"\u3073\u3085", L"\u30d3\u30e5", L"byu" }, - { L"\u3073\u3087", L"\u30d3\u30e7", L"byo" }, - - { L"\u3074\u3083", L"\u30d4\u30e3", L"pya" }, - { L"\u3074\u3085", L"\u30d4\u30e5", L"pyu" }, - { L"\u3074\u3087", L"\u30d4\u30e7", L"pyo" }, - - - // Specialty katakana usage for loan words - - // Katakana fu + small vowel - { L"", L"\u30d5\u30a1", L"fa" }, - { L"", L"\u30d5\u30a3", L"fi" }, - { L"", L"\u30d5\u30a7", L"fe" }, - { L"", L"\u30d5\u30a9", L"fo" }, - - // Katakana vu + small vowel - { L"", L"\u30f4\u30a1", L"va" }, - { L"", L"\u30f4\u30a3", L"vi" }, - { L"", L"\u30f4", L"vu" }, - { L"", L"\u30f4\u30a7", L"ve" }, - { L"", L"\u30f4\u30a9", L"vo" }, - - // Katakana fu + small yu - { L"", L"\u30d5\u30e5", L"fyu" }, - - // Katakana i + little e - { L"", L"\u30a4\u30a7", L"ye" }, - - // Katakana u + little vowels - { L"", L"\u30a6\u30a3", L"wi" }, - { L"", L"\u30a6\u30a7", L"we" }, - { L"", L"\u30a6\u30a9", L"wo" }, - - // Katakana vu + small ya-yu-yo - { L"", L"\u30f4\u30e3", L"vya" }, - { L"", L"\u30f4\u30e5", L"vyu" }, - { L"", L"\u30f4\u30e7", L"vyo" 
}, - - // Katakana shi-ji-chi + small e - { L"", L"\u30b7\u30a7", L"she" }, - { L"", L"\u30b8\u30a7", L"je" }, - { L"", L"\u30c1\u30a7", L"che" }, - - // Katakana de + small i-u-yu - { L"", L"\u30c6\u30a3", L"ti" }, - { L"", L"\u30c6\u30a5", L"tu" }, - { L"", L"\u30c6\u30e5", L"tyu" }, - - // Katakana de + small i-u-yu - { L"", L"\u30c7\u30a3", L"di" }, - { L"", L"\u30c7\u30a5", L"du" }, - { L"", L"\u30c7\u30a5", L"dyu" }, - - // Katakana tsu + small vowels - { L"", L"\u30c4\u30a1", L"tsa" }, - { L"", L"\u30c4\u30a3", L"tsi" }, - { L"", L"\u30c4\u30a7", L"tse" }, - { L"", L"\u30c4\u30a9", L"tso" }, - - - // Syllablic consonants - - // Small tsu - { L"\u3063", L"\u30c3", L"t" }, - { L"\u3063", L"\u30c3", L"c" }, - { L"\u3063", L"\u30c3", L"s" }, - { L"\u3063", L"\u30c3", L"k" }, - { L"\u3063", L"\u30c3", L"p" }, - - // Syllabic n - { L"\u3093", L"\u30f3", L"n" }, - { L"\u3093", L"\u30f3", L"m" }, - - - // Other special usage - - // Small vowels - { L"\u3041", L"\u30a1", L"a" }, - { L"\u3043", L"\u30a3", L"i" }, - { L"\u3045", L"\u30a5", L"u" }, - { L"\u3047", L"\u30a7", L"e" }, - { L"\u3049", L"\u30a9", L"o" }, - - // Long vowel mark (dash) - { L"", L"\u30fc", L"a" }, - { L"", L"\u30fc", L"i" }, - { L"", L"\u30fc", L"u" }, - { L"", L"\u30fc", L"e" }, - { L"", L"\u30fc", L"o" }, - { 0, 0, 0 } -}; diff --git a/aegisub/src/kana_table.h b/aegisub/src/kana_table.h deleted file mode 100644 index 20a3b3483..000000000 --- a/aegisub/src/kana_table.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of the Aegisub Group nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Aegisub Project http://www.aegisub.org/ - -/// @file kana_table.h -/// @see kana_table.cpp -/// @ingroup kara_timing_copy -/// - -#include - -#include - -/// @class KanaEntry -/// @brief Base class for Kana + Romaji tuples. -struct KanaEntry { - /// Hiragana - const wchar_t *hiragana; - - /// Katakana - const wchar_t *katakana; - - /// Hepburn romaji. - const wchar_t *hepburn; -}; - -/// Table of Hiragana, Katakana and Hepburn romaji tuples. 
-extern const KanaEntry KanaTable[]; diff --git a/aegisub/src/utils.cpp b/aegisub/src/utils.cpp index f72593d57..90e4a312d 100644 --- a/aegisub/src/utils.cpp +++ b/aegisub/src/utils.cpp @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -109,14 +108,6 @@ int SmallestPowerOf2(int x) { return x; } -bool IsWhitespace(wxUniChar c) { - return !!u_isUWhiteSpace(c.GetValue()); -} - -bool StringEmptyOrWhitespace(const wxString &str) { - return std::all_of(str.begin(), str.end(), IsWhitespace); -} - void RestartAegisub() { config::opt->Flush(); diff --git a/aegisub/src/utils.h b/aegisub/src/utils.h index 404949e03..4bdc943d0 100644 --- a/aegisub/src/utils.h +++ b/aegisub/src/utils.h @@ -61,12 +61,6 @@ void StatusTimeout(wxString const& msg, int ms = 10000); /// Algorithm from http://bob.allegronetwork.com/prog/tricks.html int SmallestPowerOf2(int x); -/// Check if wchar 'c' is a whitespace character -bool IsWhitespace(wxUniChar c); - -/// Check if every character in str is whitespace -bool StringEmptyOrWhitespace(const wxString &str); - /// Get the length in characters of the longest line in the given text size_t MaxLineLength(std::string const& text); diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile index 9841207d9..c986ad797 100644 --- a/aegisub/tests/Makefile +++ b/aegisub/tests/Makefile @@ -25,6 +25,7 @@ SRC = \ tests/hotkey.cpp \ tests/iconv.cpp \ tests/ifind.cpp \ + tests/karaoke_matcher.cpp \ tests/keyframe.cpp \ tests/line_iterator.cpp \ tests/line_wrap.cpp \ diff --git a/aegisub/tests/tests/karaoke_matcher.cpp b/aegisub/tests/tests/karaoke_matcher.cpp new file mode 100644 index 000000000..adc30b740 --- /dev/null +++ b/aegisub/tests/tests/karaoke_matcher.cpp @@ -0,0 +1,197 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all 
copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#include + +#include "main.h" +#include "util.h" + +class lagi_karaoke_matcher : public libagi { }; + +namespace agi { +bool operator==(karaoke_match_result const& a, karaoke_match_result const& b) { + return a.source_length == b.source_length && a.destination_length == b.destination_length; +} +::std::ostream& operator<<(::std::ostream& os, karaoke_match_result const& r) { + return os << "karaoke_match_result{" << r.source_length << ", " << r.destination_length << "}"; +} +} + +using agi::auto_match_karaoke; +using agi::karaoke_match_result; + +TEST(lagi_karaoke_matcher, empty_src_gives_zero_src_length) { + EXPECT_EQ(0, auto_match_karaoke(std::vector(), "").source_length); + EXPECT_EQ(0, auto_match_karaoke(std::vector(), "a").source_length); +} + +TEST(lagi_karaoke_matcher, empty_dest_gives_zero_dest_length) { + EXPECT_EQ(0, auto_match_karaoke(std::vector(), "").destination_length); +} + +TEST(lagi_karaoke_matcher, empty_dest_with_source_selects_all_source) { + EXPECT_EQ(2, auto_match_karaoke({"a", "b"}, "").source_length); +} + +TEST(lagi_karaoke_matcher, empty_but_present_src_syllable_matches_no_dest) { + EXPECT_EQ((karaoke_match_result{1, 0}), + auto_match_karaoke({"", "b"}, "cc")); +} + +TEST(lagi_karaoke_matcher, dest_with_non_match_selects_first_character) { + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"a", "b"}, "cc")); +} + +TEST(lagi_karaoke_matcher, dest_with_identical_match_selects_match) { + 
EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"abc", "de"}, "abcde")); +} + +TEST(lagi_karaoke_matcher, match_is_case_insensitive) { + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"abc", "de"}, "ABCDE")); + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"ABC", "DE"}, "abcde")); +} + +TEST(lagi_karaoke_matcher, leading_whitespace_in_source_is_ignored) { + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({" abc", "de"}, "abcde")); +} + +TEST(lagi_karaoke_matcher, trailing_whitespace_in_source_is_ignored) { + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"abc ", "de"}, "abcde")); +} + +TEST(lagi_karaoke_matcher, whitespace_in_dest_is_consumed) { + EXPECT_EQ((karaoke_match_result{1, 4}), + auto_match_karaoke({"abc ", "de"}, " abcde")); + EXPECT_EQ((karaoke_match_result{1, 4}), + auto_match_karaoke({"abc ", "de"}, "abc de")); + EXPECT_EQ((karaoke_match_result{1, 5}), + auto_match_karaoke({"abc ", "de"}, "ab c de")); +} + +TEST(lagi_karaoke_matcher, dest_match_is_in_characters) { + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"∫∫", "de"}, "∫∫a")); +} + +TEST(lagi_karaoke_matcher, decomposed_characters_are_handled_atomically) { + // TODO: use genuinely decomposed characters; the input below is identical to dest_match_is_in_characters + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"∫∫", "de"}, "∫∫a")); +} + +TEST(lagi_karaoke_matcher, single_hiragana_is_matched) { + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"ro" "de"}, "ろ")); +} + +TEST(lagi_karaoke_matcher, single_katakana_is_matched) { + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"ro" "de"}, "ロ")); +} + +TEST(lagi_karaoke_matcher, multiple_characters_matched) { + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"romaji" "de"}, "ろまじ")); +} +TEST(lagi_karaoke_matcher, multiple_character_kana) { + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"kya", "e"}, "きゃe")); + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"kya"}, "きゃ")); +} 
+ +TEST(lagi_karaoke_matcher, whitespace_between_characters_in_source_ignored) { /* NOTE(review): the two adjacent string literals have no comma between them, so C++ concatenates them into a single source syllable - confirm that is intended. */ + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"ro ma ji" "de"}, "ろまじ")); +} + +TEST(lagi_karaoke_matcher, whitespace_inside_characters_in_source_breaks_match) { /* Compared with the previous test the spaces fall inside ro, ma and ji, and the expected destination length drops from 3 to 1. NOTE(review): the literals here are also concatenated (no comma). */ + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"r om aj i" "de"}, "ろまじ")); +} + +TEST(lagi_karaoke_matcher, single_dest_character_consumes_all_source) { /* All three source syllables are expected to be assigned to the one destination character, even though none of them matches it. */ + EXPECT_EQ((karaoke_match_result{3, 1}), + auto_match_karaoke({"a", "b", "c"}, "ろ")); +} + +TEST(lagi_karaoke_matcher, fullwidth_letters_are_matched_to_ascii) { /* NOTE(review): the destination literal here is plain ASCII abc, so as written this does not exercise fullwidth letters (e.g. ａｂｃ); confirm the literal was not normalized somewhere along the way. */ + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"ab", "cd"}, "abc")); +} + +TEST(lagi_karaoke_matcher, simple_lookahead) { /* ab has no counterpart in the destination; it is expected to take the two characters that precede ろ, which the next syllable ro matches. */ + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"ab", "ro"}, "eeろ")); +} + +TEST(lagi_karaoke_matcher, lookahead_ignores_empty_syllables) { /* Same expectation as simple_lookahead with one and then two empty syllables placed before ro. */ + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"ab", "", "ro"}, "eeろ")); + EXPECT_EQ((karaoke_match_result{1, 2}), + auto_match_karaoke({"ab", "", "", "ro"}, "eeろ")); +} + +TEST(lagi_karaoke_matcher, lookahead_only_looks_at_three_characters_of_dst) { /* With three unmatched characters before ろ the expected destination length is 3; with four it falls back to 1. */ + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"abc", "", "ro"}, "eeeろ")); + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"abcd", "", "ro"}, "eeeeろ")); +} + +TEST(lagi_karaoke_matcher, lookahead_two_syllables) { /* Varies how many unmatched syllables precede ro. The last case differs from the one before it only by a leading space in the destination, which raises the expected destination length from 1 to 2. */ + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"a", "b", "ro"}, "eeろ")); + EXPECT_EQ((karaoke_match_result{2, 1}), + auto_match_karaoke({"a", "b", "c", "ro"}, "eeろ")); + EXPECT_EQ((karaoke_match_result{2, 1}), + auto_match_karaoke({"a", "b", "c", "d", "ro"}, "eeろ")); + EXPECT_EQ((karaoke_match_result{3, 1}), + auto_match_karaoke({"a", "b", "c", "d", "f", "ro"}, "eeろ")); + EXPECT_EQ((karaoke_match_result{3, 2}), + auto_match_karaoke({"a", "b", "c", "d", "f", "ro"}, " eeろ")); +} + +TEST(lagi_karaoke_matcher, lookahead_multicharacter_kana) { /* Like simple_lookahead, but the following syllable kya corresponds to the two-character kana きゃ. */ + EXPECT_EQ((karaoke_match_result{1,
2}), + auto_match_karaoke({"aa", "kya"}, "eeきゃ")); +} + +TEST(lagi_karaoke_matcher, ha_is_wa) { /* Bo and ku are expected to be assigned together to the single character 僕 (result {2, 1}), which relies on wa being recognised against は. */ + EXPECT_EQ((karaoke_match_result{2, 1}), + auto_match_karaoke({"Bo", "ku", "wa"}, "僕は")); +} + +TEST(lagi_karaoke_matcher, he_is_e) { /* Same shape as ha_is_wa, with e expected to be recognised against へ. */ + EXPECT_EQ((karaoke_match_result{2, 1}), + auto_match_karaoke({"Bo", "ku", "e"}, "僕へ")); +} + +TEST(lagi_karaoke_matcher, shitta) { /* Exercises romaji tta against った (small tsu followed by ta), both after a kanji and at the start of the destination. */ + EXPECT_EQ((karaoke_match_result{1, 1}), + auto_match_karaoke({"shi", "tta", ""}, "知った")); + EXPECT_EQ((karaoke_match_result{2, 2}), + auto_match_karaoke({"tta", ""}, "った")); +} + +TEST(lagi_karaoke_matcher, lookahead_is_case_insensitive) { /* The capitalised syllable Nan must still be found as なん so that the first syllable is matched to the three destination characters before it. */ + EXPECT_EQ((karaoke_match_result{1, 3}), + auto_match_karaoke({"Oh... ", "Nan", "ka ", "ta", "ri", "nai"}, "Oh…なんか足りない")); +}