From 47c36c9033dfc03ffdbc595a5f8953520cb28ec2 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Wed, 30 Jan 2013 07:26:16 -0800 Subject: [PATCH] Use ICU/boost.locale for case-insensitive searching Do proper unicode case-folding for case-insensitive searching rather than converting only ascii characters to lowercase. The Turkish 'i' is still not handled correctly (since it's the only place where case-folding is locale-dependent), but that's probably not worth caring about as long as we don't have a Turkish UI translation. This affects both the find/replace dialog and the select lines dialog. Closes #1342. --- aegisub/Makefile.inc.in | 2 +- aegisub/build/tests/tests.vcxproj | 1 + aegisub/build/tests/tests.vcxproj.filters | 3 + aegisub/configure.ac | 1 + aegisub/libaegisub/common/util.cpp | 99 ++++++++++++++++++++ aegisub/libaegisub/include/libaegisub/util.h | 10 ++ aegisub/src/dialog_search_replace.cpp | 1 + aegisub/src/search_replace_engine.cpp | 37 +++++--- aegisub/tests/Makefile | 1 + aegisub/tests/support/main.cpp | 3 + aegisub/tests/tests/ifind.cpp | 75 +++++++++++++++ 11 files changed, 218 insertions(+), 15 deletions(-) create mode 100644 aegisub/tests/tests/ifind.cpp diff --git a/aegisub/Makefile.inc.in b/aegisub/Makefile.inc.in index 68b5fa254..a5d90b69f 100644 --- a/aegisub/Makefile.inc.in +++ b/aegisub/Makefile.inc.in @@ -76,7 +76,7 @@ CPPFLAGS_WX = @WX_CPPFLAGS@ LIBS_WX = @WX_LIBS@ -lz CPPFLAGS_BOOST = @BOOST_CPPFLAGS@ -LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_REGEX_LDFLAGS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@ +LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_LOCALE_LIBS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LIBS@ CFLAGS_FFMS2 = @FFMS2_CFLAGS@ CFLAGS_FFTW3 = @FFTW3_CFLAGS@ diff --git a/aegisub/build/tests/tests.vcxproj b/aegisub/build/tests/tests.vcxproj index 71d734c5f..ff4b5fdff 100644 --- a/aegisub/build/tests/tests.vcxproj +++ b/aegisub/build/tests/tests.vcxproj @@ -44,6 +44,7 @@ + diff --git a/aegisub/build/tests/tests.vcxproj.filters b/aegisub/build/tests/tests.vcxproj.filters index 75aa7e626..ad542727b 100644 --- a/aegisub/build/tests/tests.vcxproj.filters +++ b/aegisub/build/tests/tests.vcxproj.filters @@ -35,6 +35,9 @@ Tests + + Tests + Tests diff --git a/aegisub/configure.ac b/aegisub/configure.ac index 1ee33c2c1..95531541b 100644 --- a/aegisub/configure.ac +++ b/aegisub/configure.ac @@ -248,6 +248,7 @@ PKG_CHECK_MODULES(FONTCONFIG, fontconfig >= fontconfig_required_version, libext=a BOOST_REQUIRE([boost_required_version]) BOOST_FILESYSTEM +BOOST_LOCALE BOOST_REGEX ######## diff --git a/aegisub/libaegisub/common/util.cpp b/aegisub/libaegisub/common/util.cpp index b1b738e23..7c2191f6b 100644 --- a/aegisub/libaegisub/common/util.cpp +++ b/aegisub/libaegisub/common/util.cpp @@ -18,8 +18,45 @@ #include "libaegisub/util.h" +#include "libaegisub/exception.h" + +#include +#include +#include #include +namespace { +const size_t bad_pos = (size_t)-1; +const std::pair bad_match(bad_pos, bad_pos); + +template +size_t advance_both(Iterator& folded, Iterator& raw) { + size_t len; + if (*folded == *raw) { + len = folded->length(); + ++folded; + } + else { + // This character was changed by case folding, so refold it and eat the + // appropriate number of characters from folded + len = boost::locale::fold_case(raw->str()).size(); + for (size_t folded_consumed = 0; folded_consumed < len; ++folded) + folded_consumed += folded->length(); + } + + ++raw; + return len; +} + +std::pair find_range(std::string const& haystack, std::string const& needle, size_t start = 0) { + const size_t match_start = haystack.find(needle, start); + if (match_start == std::string::npos) + return bad_match; + return std::make_pair(match_start, match_start + needle.size()); +} + +} + namespace agi { namespace util { std::string strftime(const char *fmt, const tm *tmptr) { @@ -33,4 +70,66 @@ std::string strftime(const char *fmt, const tm *tmptr) { return buff; } +std::pair ifind(std::string const& haystack, std::string const& needle) { + const auto folded_hs = boost::locale::fold_case(haystack); + const auto folded_n = boost::locale::fold_case(needle); + auto match = find_range(folded_hs, folded_n); + if (match == bad_match || folded_hs == haystack) + return match; + + // We have a match, but the position is an index into the folded string + // and we want an index into the unfolded string. + + using namespace boost::locale::boundary; + const ssegment_index haystack_characters(character, begin(haystack), end(haystack)); + const ssegment_index folded_characters(character, begin(folded_hs), end(folded_hs)); + const size_t haystack_char_count = boost::distance(haystack_characters); + const size_t folded_char_count = boost::distance(folded_characters); + + // As of Unicode 6.2, case folding can never reduce the number of + // characters, and can only reduce the number of bytes with UTF-8 when + // increasing the number of characters. As a result, iff the bytes and + // characters are unchanged, no folds changed the size of any characters + // and our indices are correct. + if (haystack.size() == folded_hs.size() && haystack_char_count == folded_char_count) + return match; + + const auto map_folded_to_raw = [&]() -> std::pair { + size_t start = -1; + + // Iterate over each pair of characters and refold each character which was + // changed by folding, so that we can find the corresponding positions in + // the unfolded string + auto folded_it = begin(folded_characters); + auto haystack_it = begin(haystack_characters); + size_t folded_pos = 0; + + while (folded_pos < match.first) + folded_pos += advance_both(folded_it, haystack_it); + // If we overshot the start then the match started in the middle of a + // character which was folded to multiple characters + if (folded_pos > match.first) + return bad_match; + + start = distance(begin(haystack), begin(*haystack_it)); + + while (folded_pos < match.second) + folded_pos += advance_both(folded_it, haystack_it); + if (folded_pos > match.second) + return bad_match; + + return std::make_pair(start, distance(begin(haystack), begin(*haystack_it))); + }; + + auto ret = map_folded_to_raw(); + while (ret == bad_match) { + // Found something, but it was an invalid match so retry from the next character + match = find_range(folded_hs, folded_n, match.first + 1); + if (match == bad_match) return match; + ret = map_folded_to_raw(); + } + + return ret; +} + } } diff --git a/aegisub/libaegisub/include/libaegisub/util.h b/aegisub/libaegisub/include/libaegisub/util.h index 0a637c01f..95cbac266 100644 --- a/aegisub/libaegisub/include/libaegisub/util.h +++ b/aegisub/libaegisub/include/libaegisub/util.h @@ -41,6 +41,16 @@ namespace agi { /// @return The strftime-formatted string std::string strftime(const char *fmt, const tm *tmptr = nullptr); + /// Case-insensitive find with proper case folding + /// @param haystack String to search + /// @param needle String to look for + /// @return make_pair(-1,-1) if `needle` could not be found, or a range equivalent to `needle` in `haystack` if it could + /// + /// `needle` and `haystack` must both be in Normalization Form D. The size + /// of the match might be different from the size of `needle`, since it's + /// based on the unfolded length. + std::pair ifind(std::string const& haystack, std::string const& needle); + struct delete_ptr { template void operator()(T* ptr) const { diff --git a/aegisub/src/dialog_search_replace.cpp b/aegisub/src/dialog_search_replace.cpp index a8d932f9c..4573af8c7 100644 --- a/aegisub/src/dialog_search_replace.cpp +++ b/aegisub/src/dialog_search_replace.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/aegisub/src/search_replace_engine.cpp b/aegisub/src/search_replace_engine.cpp index 105f14ef0..aa9f4c1de 100644 --- a/aegisub/src/search_replace_engine.cpp +++ b/aegisub/src/search_replace_engine.cpp @@ -25,8 +25,9 @@ #include "text_selection_controller.h" #include +#include -#include +#include #include @@ -43,6 +44,14 @@ auto get_dialogue_field(SearchReplaceSettings::Field field) -> decltype(&AssDial throw agi::InternalError("Bad field for search", 0); } +std::string const& get_normalized(const AssDialogue *diag, decltype(&AssDialogue::Text) field) { + auto& value = const_cast(diag)->*field; + auto normalized = boost::locale::normalize(value.get()); + if (normalized != value) + value = normalized; + return value.get(); +} + typedef std::function matcher; class noop_accessor { @@ -54,7 +63,7 @@ public: std::string get(const AssDialogue *d, size_t s) { start = s; - return (d->*field).get().substr(s); + return get_normalized(d, field).substr(s); } MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) { @@ -87,7 +96,7 @@ public: skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { } std::string get(const AssDialogue *d, size_t s) { - auto const& str = (d->*field).get(); + auto const& str = get_normalized(d, field); parse_str(str); std::string out; @@ -156,25 +165,25 @@ matcher get_matcher(SearchReplaceSettings const& settings, Accessor&& a) { }; } - bool full_match_only = settings.exact_match; - bool match_case = settings.match_case; + const bool full_match_only = settings.exact_match; + const bool match_case = settings.match_case; std::string look_for = settings.find; + if (!settings.match_case) - boost::to_lower(look_for); + look_for = boost::locale::fold_case(look_for); return [=](const AssDialogue *diag, size_t start) mutable -> MatchState { - auto str = a.get(diag, start); + const auto str = a.get(diag, start); if (full_match_only && str.size() != look_for.size()) return MatchState(); - if (!match_case) - boost::to_lower(str); + if (match_case) { + const auto pos = str.find(look_for); + return pos == std::string::npos ? MatchState() : a.make_match_state(pos, pos + look_for.size()); + } - size_t pos = str.find(look_for); - if (pos == std::string::npos) - return MatchState(); - - return a.make_match_state(pos, pos + look_for.size()); + const auto pos = agi::util::ifind(str, look_for); + return pos.first == bad_pos ? MatchState() : a.make_match_state(pos.first, pos.second); }; } diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile index 076a21d09..8d82b08c8 100644 --- a/aegisub/tests/Makefile +++ b/aegisub/tests/Makefile @@ -23,6 +23,7 @@ SRC = \ tests/dialogue_lexer.cpp \ tests/hotkey.cpp \ tests/iconv.cpp \ + tests/ifind.cpp \ tests/keyframe.cpp \ tests/line_iterator.cpp \ tests/line_wrap.cpp \ diff --git a/aegisub/tests/support/main.cpp b/aegisub/tests/support/main.cpp index 87c32428f..3f0808246 100644 --- a/aegisub/tests/support/main.cpp +++ b/aegisub/tests/support/main.cpp @@ -18,8 +18,11 @@ #include #include +#include + int main(int argc, char **argv) { agi::dispatch::Init([](agi::dispatch::Thunk f) { }); + std::locale::global(boost::locale::generator().generate("")); int retval; agi::log::log = new agi::log::LogSink; diff --git a/aegisub/tests/tests/ifind.cpp b/aegisub/tests/tests/ifind.cpp new file mode 100644 index 000000000..baad59913 --- /dev/null +++ b/aegisub/tests/tests/ifind.cpp @@ -0,0 +1,75 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// Aegisub Project http://www.aegisub.org/ + +#include + +#include "main.h" + +#define IFIND(haystack, needle) \ + std::pair pos; \ + ASSERT_NO_THROW(pos = agi::util::ifind(haystack, needle)) + +#define EXPECT_IFIND(haystack, needle, s, e) \ + do { \ + IFIND(haystack, needle); \ + EXPECT_EQ((size_t)s, pos.first); \ + EXPECT_EQ((size_t)e, pos.second); \ + } while(false) + +#define EXPECT_NO_MATCH(haystack, needle) \ + do { \ + IFIND(haystack, needle); \ + EXPECT_EQ((size_t)-1, pos.first); \ + EXPECT_EQ((size_t)-1, pos.second); \ + } while(false) + +TEST(lagi_ifind, basic_match) { + EXPECT_IFIND(" a ", "a", 1, 2); + EXPECT_IFIND(" a ", "A", 1, 2); + EXPECT_NO_MATCH(" a ", "b"); +} + +TEST(lagi_ifind, sharp_s_matches_ss) { + // lowercase + EXPECT_IFIND(" \xC3\x9F ", "ss", 1, 3); + EXPECT_IFIND(" ss ", "\xC3\x9F", 1, 3); + + // uppercase + EXPECT_IFIND(" \xE1\xBA\x9E ", "ss", 1, 4); + EXPECT_IFIND(" ss ", "\xE1\xBA\x9E", 1, 3); +} + +TEST(lagi_ifind, no_partial_match_on_decomposed_character) { + EXPECT_NO_MATCH("s\xEF\xAC\x86", "ss"); // LATIN SMALL LIGATURE ST + EXPECT_NO_MATCH("\xEF\xAC\x86t", "tt"); + EXPECT_NO_MATCH(" \xE1\xBA\x9E ", "s"); + EXPECT_NO_MATCH("\xE1\xBA\x9E", "s"); + EXPECT_IFIND(" \xE1\xBA\x9E s ", "s", 5, 6); + EXPECT_IFIND("s\xE1\xBA\x9E", "ss", 1, 4); + EXPECT_IFIND("s\xE1\xBA\x9E", "sss", 0, 4); + EXPECT_IFIND("\xE1\xBA\x9Es", "sss", 0, 4); + EXPECT_IFIND("\xEF\xAC\x86", "st", 0, 3); +} + +TEST(lagi_ifind, correct_index_with_expanded_character_before_match) { + // U+0587 turns into U+0565 U+0582, all of which are two bytes in UTF-8 + EXPECT_IFIND(" \xD6\x87 a ", "a", 4, 5); +} + +TEST(lagi_ifind, correct_index_with_shrunk_character_before_match) { + // U+FB00 turns into "ff", which is one byte shorter in UTF-8 + EXPECT_IFIND(" \xEF\xAC\x80 a ", "a", 5, 6); +}