Use ICU/boost.locale for case-insensitive searching

Do proper Unicode case-folding for case-insensitive searching rather
than converting only ASCII characters to lowercase. The Turkish 'i' is
still not handled correctly (since it's the only place where
case-folding is locale-dependent), but that's probably not worth caring
about as long as we don't have a Turkish UI translation.

This affects both the find/replace dialog and the select lines dialog.

Closes #1342.
This commit is contained in:
Thomas Goyne 2013-01-30 07:26:16 -08:00
parent 1cc52611de
commit 47c36c9033
11 changed files with 218 additions and 15 deletions

View file

@ -76,7 +76,7 @@ CPPFLAGS_WX = @WX_CPPFLAGS@
LIBS_WX = @WX_LIBS@ -lz
CPPFLAGS_BOOST = @BOOST_CPPFLAGS@
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_REGEX_LDFLAGS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_LOCALE_LIBS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LIBS@
CFLAGS_FFMS2 = @FFMS2_CFLAGS@
CFLAGS_FFTW3 = @FFTW3_CFLAGS@

View file

@ -44,6 +44,7 @@
<ClCompile Include="$(SrcDir)tests\dialogue_lexer.cpp" />
<ClCompile Include="$(SrcDir)tests\hotkey.cpp" />
<ClCompile Include="$(SrcDir)tests\iconv.cpp" />
<ClCompile Include="$(SrcDir)tests\ifind.cpp" />
<ClCompile Include="$(SrcDir)tests\keyframe.cpp" />
<ClCompile Include="$(SrcDir)tests\line_iterator.cpp" />
<ClCompile Include="$(SrcDir)tests\line_wrap.cpp" />

View file

@ -35,6 +35,9 @@
<ClCompile Include="$(SrcDir)tests\iconv.cpp">
<Filter>Tests</Filter>
</ClCompile>
<ClCompile Include="$(SrcDir)tests\ifind.cpp">
<Filter>Tests</Filter>
</ClCompile>
<ClCompile Include="$(SrcDir)tests\keyframe.cpp">
<Filter>Tests</Filter>
</ClCompile>

View file

@ -248,6 +248,7 @@ PKG_CHECK_MODULES(FONTCONFIG, fontconfig >= fontconfig_required_version,
libext=a
BOOST_REQUIRE([boost_required_version])
BOOST_FILESYSTEM
BOOST_LOCALE
BOOST_REGEX
########

View file

@ -18,8 +18,45 @@
#include "libaegisub/util.h"
#include "libaegisub/exception.h"
#include <boost/locale/boundary.hpp>
#include <boost/locale/conversion.hpp>
#include <boost/range/algorithm_ext.hpp>
#include <ctime>
namespace {
const size_t bad_pos = (size_t)-1;
const std::pair<size_t, size_t> bad_match(bad_pos, bad_pos);
template<typename Iterator>
size_t advance_both(Iterator& folded, Iterator& raw) {
size_t len;
if (*folded == *raw) {
len = folded->length();
++folded;
}
else {
// This character was changed by case folding, so refold it and eat the
// appropriate number of characters from folded
len = boost::locale::fold_case(raw->str()).size();
for (size_t folded_consumed = 0; folded_consumed < len; ++folded)
folded_consumed += folded->length();
}
++raw;
return len;
}
std::pair<size_t, size_t> find_range(std::string const& haystack, std::string const& needle, size_t start = 0) {
const size_t match_start = haystack.find(needle, start);
if (match_start == std::string::npos)
return bad_match;
return std::make_pair(match_start, match_start + needle.size());
}
}
namespace agi { namespace util {
std::string strftime(const char *fmt, const tm *tmptr) {
@ -33,4 +70,66 @@ std::string strftime(const char *fmt, const tm *tmptr) {
return buff;
}
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle) {
const auto folded_hs = boost::locale::fold_case(haystack);
const auto folded_n = boost::locale::fold_case(needle);
auto match = find_range(folded_hs, folded_n);
if (match == bad_match || folded_hs == haystack)
return match;
// We have a match, but the position is an index into the folded string
// and we want an index into the unfolded string.
using namespace boost::locale::boundary;
const ssegment_index haystack_characters(character, begin(haystack), end(haystack));
const ssegment_index folded_characters(character, begin(folded_hs), end(folded_hs));
const size_t haystack_char_count = boost::distance(haystack_characters);
const size_t folded_char_count = boost::distance(folded_characters);
// As of Unicode 6.2, case folding can never reduce the number of
// characters, and can only reduce the number of bytes with UTF-8 when
// increasing the number of characters. As a result, iff the bytes and
// characters are unchanged, no folds changed the size of any characters
// and our indices are correct.
if (haystack.size() == folded_hs.size() && haystack_char_count == folded_char_count)
return match;
const auto map_folded_to_raw = [&]() -> std::pair<size_t, size_t> {
size_t start = -1;
// Iterate over each pair of characters and refold each character which was
// changed by folding, so that we can find the corresponding positions in
// the unfolded string
auto folded_it = begin(folded_characters);
auto haystack_it = begin(haystack_characters);
size_t folded_pos = 0;
while (folded_pos < match.first)
folded_pos += advance_both(folded_it, haystack_it);
// If we overshot the start then the match started in the middle of a
// character which was folded to multiple characters
if (folded_pos > match.first)
return bad_match;
start = distance(begin(haystack), begin(*haystack_it));
while (folded_pos < match.second)
folded_pos += advance_both(folded_it, haystack_it);
if (folded_pos > match.second)
return bad_match;
return std::make_pair(start, distance(begin(haystack), begin(*haystack_it)));
};
auto ret = map_folded_to_raw();
while (ret == bad_match) {
// Found something, but it was an invalid match so retry from the next character
match = find_range(folded_hs, folded_n, match.first + 1);
if (match == bad_match) return match;
ret = map_folded_to_raw();
}
return ret;
}
} }

View file

@ -41,6 +41,16 @@ namespace agi {
/// @return The strftime-formatted string
std::string strftime(const char *fmt, const tm *tmptr = nullptr);
/// Case-insensitive find with proper (Unicode) case folding
/// @param haystack String to search
/// @param needle String to look for
/// @return make_pair(-1,-1) if `needle` could not be found, or a range equivalent to `needle` in `haystack` if it could
///
/// The returned range is a half-open pair of byte offsets [first, second)
/// into the original, unfolded `haystack`.
///
/// `needle` and `haystack` must both be in Normalization Form D. The size
/// of the match might be different from the size of `needle`, since it's
/// based on the unfolded length.
///
/// NOTE(review): the search/replace caller normalizes with
/// boost::locale::normalize() using its default form; confirm that this is
/// the form required here, or relax the wording to "the same normalization
/// form".
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle);
struct delete_ptr {
template<class T>
void operator()(T* ptr) const {

View file

@ -36,6 +36,7 @@
#include <wx/checkbox.h>
#include <wx/combobox.h>
#include <wx/radiobox.h>
#include <wx/msgdlg.h>
#include <wx/sizer.h>
#include <wx/stattext.h>
#include <wx/textctrl.h>

View file

@ -25,8 +25,9 @@
#include "text_selection_controller.h"
#include <libaegisub/of_type_adaptor.h>
#include <libaegisub/util.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <boost/locale.hpp>
#include <wx/msgdlg.h>
@ -43,6 +44,14 @@ auto get_dialogue_field(SearchReplaceSettings::Field field) -> decltype(&AssDial
throw agi::InternalError("Bad field for search", 0);
}
/// Fetch a dialogue field as a Unicode-normalized string.
///
/// The field is normalized with boost::locale::normalize() (default form).
/// When that changes the text, the normalized text is written back into the
/// dialogue line, which is why constness is cast away, so the returned
/// reference always refers to the line's own, now normalized, storage.
/// @param diag  Dialogue line to read (and possibly update in place)
/// @param field Pointer-to-member selecting which field of the line to use
/// @return Reference to the normalized contents of the selected field
std::string const& get_normalized(const AssDialogue *diag, decltype(&AssDialogue::Text) field) {
    auto& stored = const_cast<AssDialogue*>(diag)->*field;
    auto canonical = boost::locale::normalize(stored.get());
    // Only assign when something actually changed
    if (canonical != stored)
        stored = canonical;
    return stored.get();
}
typedef std::function<MatchState (const AssDialogue*, size_t)> matcher;
class noop_accessor {
@ -54,7 +63,7 @@ public:
std::string get(const AssDialogue *d, size_t s) {
start = s;
return (d->*field).get().substr(s);
return get_normalized(d, field).substr(s);
}
MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) {
@ -87,7 +96,7 @@ public:
skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { }
std::string get(const AssDialogue *d, size_t s) {
auto const& str = (d->*field).get();
auto const& str = get_normalized(d, field);
parse_str(str);
std::string out;
@ -156,25 +165,25 @@ matcher get_matcher(SearchReplaceSettings const& settings, Accessor&& a) {
};
}
bool full_match_only = settings.exact_match;
bool match_case = settings.match_case;
const bool full_match_only = settings.exact_match;
const bool match_case = settings.match_case;
std::string look_for = settings.find;
if (!settings.match_case)
boost::to_lower(look_for);
look_for = boost::locale::fold_case(look_for);
return [=](const AssDialogue *diag, size_t start) mutable -> MatchState {
auto str = a.get(diag, start);
const auto str = a.get(diag, start);
if (full_match_only && str.size() != look_for.size())
return MatchState();
if (!match_case)
boost::to_lower(str);
if (match_case) {
const auto pos = str.find(look_for);
return pos == std::string::npos ? MatchState() : a.make_match_state(pos, pos + look_for.size());
}
size_t pos = str.find(look_for);
if (pos == std::string::npos)
return MatchState();
return a.make_match_state(pos, pos + look_for.size());
const auto pos = agi::util::ifind(str, look_for);
return pos.first == bad_pos ? MatchState() : a.make_match_state(pos.first, pos.second);
};
}

View file

@ -23,6 +23,7 @@ SRC = \
tests/dialogue_lexer.cpp \
tests/hotkey.cpp \
tests/iconv.cpp \
tests/ifind.cpp \
tests/keyframe.cpp \
tests/line_iterator.cpp \
tests/line_wrap.cpp \

View file

@ -18,8 +18,11 @@
#include <libaegisub/fs.h>
#include <libaegisub/log.h>
#include <boost/locale/generator.hpp>
int main(int argc, char **argv) {
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
std::locale::global(boost::locale::generator().generate(""));
int retval;
agi::log::log = new agi::log::LogSink;

View file

@ -0,0 +1,75 @@
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// Aegisub Project http://www.aegisub.org/
#include <libaegisub/util.h>
#include "main.h"
// Declares `pos` in the enclosing scope and stores the result of
// agi::util::ifind(haystack, needle) in it, failing the test fatally if the
// call throws. Only meant to be used through the wrappers below, which give
// `pos` its own scope.
#define IFIND(haystack, needle) \
    std::pair<size_t, size_t> pos; \
    ASSERT_NO_THROW(pos = agi::util::ifind((haystack), (needle)))

// Expects `needle` to be found in `haystack` at the byte range [s, e).
// The bounds are parenthesized before conversion so that compound
// expressions are converted as a whole.
#define EXPECT_IFIND(haystack, needle, s, e) \
    do { \
        IFIND(haystack, needle); \
        EXPECT_EQ(static_cast<size_t>(s), pos.first); \
        EXPECT_EQ(static_cast<size_t>(e), pos.second); \
    } while(false)

// Expects `needle` not to be found in `haystack`, i.e. both ends of the
// returned range are the (size_t)-1 sentinel.
#define EXPECT_NO_MATCH(haystack, needle) \
    do { \
        IFIND(haystack, needle); \
        EXPECT_EQ(static_cast<size_t>(-1), pos.first); \
        EXPECT_EQ(static_cast<size_t>(-1), pos.second); \
    } while(false)
// Plain ASCII: the needle is found regardless of its case, and an unrelated
// needle is reported as not found.
TEST(lagi_ifind, basic_match) {
EXPECT_IFIND(" a ", "a", 1, 2);
EXPECT_IFIND(" a ", "A", 1, 2);
EXPECT_NO_MATCH(" a ", "b");
}
// Sharp s and "ss" must match each other in both directions. The expected
// range always covers the text as it appears in the haystack, so its length
// may differ from the needle's length.
TEST(lagi_ifind, sharp_s_matches_ss) {
// lowercase sharp s, U+00DF (two bytes in UTF-8)
EXPECT_IFIND(" \xC3\x9F ", "ss", 1, 3);
EXPECT_IFIND(" ss ", "\xC3\x9F", 1, 3);
// uppercase sharp s, U+1E9E (three bytes in UTF-8)
EXPECT_IFIND(" \xE1\xBA\x9E ", "ss", 1, 4);
EXPECT_IFIND(" ss ", "\xE1\xBA\x9E", 1, 3);
}
// A match may not begin or end inside a single haystack character that was
// expanded to several characters; such candidates are rejected and the search
// continues with later positions.
TEST(lagi_ifind, no_partial_match_on_decomposed_character) {
// Expected to be rejected: the only candidates cover just part of a character
EXPECT_NO_MATCH("s\xEF\xAC\x86", "ss"); // LATIN SMALL LIGATURE ST
EXPECT_NO_MATCH("\xEF\xAC\x86t", "tt");
EXPECT_NO_MATCH(" \xE1\xBA\x9E ", "s");
EXPECT_NO_MATCH("\xE1\xBA\x9E", "s");
// The standalone "s" after U+1E9E is still found, at its raw byte offset
EXPECT_IFIND(" \xE1\xBA\x9E s ", "s", 5, 6);
// Expected ranges here always end on whole haystack characters
EXPECT_IFIND("s\xE1\xBA\x9E", "ss", 1, 4);
EXPECT_IFIND("s\xE1\xBA\x9E", "sss", 0, 4);
EXPECT_IFIND("\xE1\xBA\x9Es", "sss", 0, 4);
EXPECT_IFIND("\xEF\xAC\x86", "st", 0, 3);
}
// A character before the match which grows when folded must not shift the
// reported offsets: they refer to the raw haystack.
TEST(lagi_ifind, correct_index_with_expanded_character_before_match) {
// U+0587 turns into U+0565 U+0582, all of which are two bytes in UTF-8
EXPECT_IFIND(" \xD6\x87 a ", "a", 4, 5);
}
// A character before the match which gets shorter (in bytes) when folded must
// not shift the reported offsets either.
TEST(lagi_ifind, correct_index_with_shrunk_character_before_match) {
// U+FB00 turns into "ff", which is one byte shorter in UTF-8
EXPECT_IFIND(" \xEF\xAC\x80 a ", "a", 5, 6);
}