Use ICU/boost.locale for case-insensitive searching
Do proper unicode case-folding for case-insensitive searching rather than converting only ascii characters to lowercase. The Turkish 'i' is still not handled correctly (since it's the only place where case-folding is locale-dependent), but that's probably not worth caring about as long as we don't have a Turkish UI translation. This affects both the find/replace dialog and the select lines dialog. Closes #1342.
This commit is contained in:
parent
1cc52611de
commit
47c36c9033
11 changed files with 218 additions and 15 deletions
|
@ -76,7 +76,7 @@ CPPFLAGS_WX = @WX_CPPFLAGS@
|
|||
LIBS_WX = @WX_LIBS@ -lz
|
||||
|
||||
CPPFLAGS_BOOST = @BOOST_CPPFLAGS@
|
||||
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_REGEX_LDFLAGS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@
|
||||
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_LOCALE_LIBS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LIBS@
|
||||
|
||||
CFLAGS_FFMS2 = @FFMS2_CFLAGS@
|
||||
CFLAGS_FFTW3 = @FFTW3_CFLAGS@
|
||||
|
|
|
@ -44,6 +44,7 @@
|
|||
<ClCompile Include="$(SrcDir)tests\dialogue_lexer.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\hotkey.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\iconv.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\ifind.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\keyframe.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\line_iterator.cpp" />
|
||||
<ClCompile Include="$(SrcDir)tests\line_wrap.cpp" />
|
||||
|
|
|
@ -35,6 +35,9 @@
|
|||
<ClCompile Include="$(SrcDir)tests\iconv.cpp">
|
||||
<Filter>Tests</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="$(SrcDir)tests\ifind.cpp">
|
||||
<Filter>Tests</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="$(SrcDir)tests\keyframe.cpp">
|
||||
<Filter>Tests</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -248,6 +248,7 @@ PKG_CHECK_MODULES(FONTCONFIG, fontconfig >= fontconfig_required_version,
|
|||
libext=a
|
||||
BOOST_REQUIRE([boost_required_version])
|
||||
BOOST_FILESYSTEM
|
||||
BOOST_LOCALE
|
||||
BOOST_REGEX
|
||||
|
||||
########
|
||||
|
|
|
@ -18,8 +18,45 @@
|
|||
|
||||
#include "libaegisub/util.h"
|
||||
|
||||
#include "libaegisub/exception.h"
|
||||
|
||||
#include <boost/locale/boundary.hpp>
|
||||
#include <boost/locale/conversion.hpp>
|
||||
#include <boost/range/algorithm_ext.hpp>
|
||||
#include <ctime>
|
||||
|
||||
namespace {
|
||||
const size_t bad_pos = (size_t)-1;
|
||||
const std::pair<size_t, size_t> bad_match(bad_pos, bad_pos);
|
||||
|
||||
template<typename Iterator>
|
||||
size_t advance_both(Iterator& folded, Iterator& raw) {
|
||||
size_t len;
|
||||
if (*folded == *raw) {
|
||||
len = folded->length();
|
||||
++folded;
|
||||
}
|
||||
else {
|
||||
// This character was changed by case folding, so refold it and eat the
|
||||
// appropriate number of characters from folded
|
||||
len = boost::locale::fold_case(raw->str()).size();
|
||||
for (size_t folded_consumed = 0; folded_consumed < len; ++folded)
|
||||
folded_consumed += folded->length();
|
||||
}
|
||||
|
||||
++raw;
|
||||
return len;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t> find_range(std::string const& haystack, std::string const& needle, size_t start = 0) {
|
||||
const size_t match_start = haystack.find(needle, start);
|
||||
if (match_start == std::string::npos)
|
||||
return bad_match;
|
||||
return std::make_pair(match_start, match_start + needle.size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace agi { namespace util {
|
||||
|
||||
std::string strftime(const char *fmt, const tm *tmptr) {
|
||||
|
@ -33,4 +70,66 @@ std::string strftime(const char *fmt, const tm *tmptr) {
|
|||
return buff;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle) {
|
||||
const auto folded_hs = boost::locale::fold_case(haystack);
|
||||
const auto folded_n = boost::locale::fold_case(needle);
|
||||
auto match = find_range(folded_hs, folded_n);
|
||||
if (match == bad_match || folded_hs == haystack)
|
||||
return match;
|
||||
|
||||
// We have a match, but the position is an index into the folded string
|
||||
// and we want an index into the unfolded string.
|
||||
|
||||
using namespace boost::locale::boundary;
|
||||
const ssegment_index haystack_characters(character, begin(haystack), end(haystack));
|
||||
const ssegment_index folded_characters(character, begin(folded_hs), end(folded_hs));
|
||||
const size_t haystack_char_count = boost::distance(haystack_characters);
|
||||
const size_t folded_char_count = boost::distance(folded_characters);
|
||||
|
||||
// As of Unicode 6.2, case folding can never reduce the number of
|
||||
// characters, and can only reduce the number of bytes with UTF-8 when
|
||||
// increasing the number of characters. As a result, iff the bytes and
|
||||
// characters are unchanged, no folds changed the size of any characters
|
||||
// and our indices are correct.
|
||||
if (haystack.size() == folded_hs.size() && haystack_char_count == folded_char_count)
|
||||
return match;
|
||||
|
||||
const auto map_folded_to_raw = [&]() -> std::pair<size_t, size_t> {
|
||||
size_t start = -1;
|
||||
|
||||
// Iterate over each pair of characters and refold each character which was
|
||||
// changed by folding, so that we can find the corresponding positions in
|
||||
// the unfolded string
|
||||
auto folded_it = begin(folded_characters);
|
||||
auto haystack_it = begin(haystack_characters);
|
||||
size_t folded_pos = 0;
|
||||
|
||||
while (folded_pos < match.first)
|
||||
folded_pos += advance_both(folded_it, haystack_it);
|
||||
// If we overshot the start then the match started in the middle of a
|
||||
// character which was folded to multiple characters
|
||||
if (folded_pos > match.first)
|
||||
return bad_match;
|
||||
|
||||
start = distance(begin(haystack), begin(*haystack_it));
|
||||
|
||||
while (folded_pos < match.second)
|
||||
folded_pos += advance_both(folded_it, haystack_it);
|
||||
if (folded_pos > match.second)
|
||||
return bad_match;
|
||||
|
||||
return std::make_pair(start, distance(begin(haystack), begin(*haystack_it)));
|
||||
};
|
||||
|
||||
auto ret = map_folded_to_raw();
|
||||
while (ret == bad_match) {
|
||||
// Found something, but it was an invalid match so retry from the next character
|
||||
match = find_range(folded_hs, folded_n, match.first + 1);
|
||||
if (match == bad_match) return match;
|
||||
ret = map_folded_to_raw();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
} }
|
||||
|
|
|
@ -41,6 +41,16 @@ namespace agi {
|
|||
/// @return The strftime-formatted string
|
||||
std::string strftime(const char *fmt, const tm *tmptr = nullptr);
|
||||
|
||||
/// Case-insensitive find with proper case folding
|
||||
/// @param haystack String to search
|
||||
/// @param needle String to look for
|
||||
/// @return make_pair(-1,-1) if `needle` could not be found, or a range equivalent to `needle` in `haystack` if it could
|
||||
///
|
||||
/// `needle` and `haystack` must both be in Normalization Form D. The size
|
||||
/// of the match might be different from the size of `needle`, since it's
|
||||
/// based on the unfolded length.
|
||||
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle);
|
||||
|
||||
struct delete_ptr {
|
||||
template<class T>
|
||||
void operator()(T* ptr) const {
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#include <wx/checkbox.h>
|
||||
#include <wx/combobox.h>
|
||||
#include <wx/radiobox.h>
|
||||
#include <wx/msgdlg.h>
|
||||
#include <wx/sizer.h>
|
||||
#include <wx/stattext.h>
|
||||
#include <wx/textctrl.h>
|
||||
|
|
|
@ -25,8 +25,9 @@
|
|||
#include "text_selection_controller.h"
|
||||
|
||||
#include <libaegisub/of_type_adaptor.h>
|
||||
#include <libaegisub/util.h>
|
||||
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
#include <boost/locale.hpp>
|
||||
|
||||
#include <wx/msgdlg.h>
|
||||
|
||||
|
@ -43,6 +44,14 @@ auto get_dialogue_field(SearchReplaceSettings::Field field) -> decltype(&AssDial
|
|||
throw agi::InternalError("Bad field for search", 0);
|
||||
}
|
||||
|
||||
std::string const& get_normalized(const AssDialogue *diag, decltype(&AssDialogue::Text) field) {
|
||||
auto& value = const_cast<AssDialogue*>(diag)->*field;
|
||||
auto normalized = boost::locale::normalize(value.get());
|
||||
if (normalized != value)
|
||||
value = normalized;
|
||||
return value.get();
|
||||
}
|
||||
|
||||
typedef std::function<MatchState (const AssDialogue*, size_t)> matcher;
|
||||
|
||||
class noop_accessor {
|
||||
|
@ -54,7 +63,7 @@ public:
|
|||
|
||||
std::string get(const AssDialogue *d, size_t s) {
|
||||
start = s;
|
||||
return (d->*field).get().substr(s);
|
||||
return get_normalized(d, field).substr(s);
|
||||
}
|
||||
|
||||
MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) {
|
||||
|
@ -87,7 +96,7 @@ public:
|
|||
skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { }
|
||||
|
||||
std::string get(const AssDialogue *d, size_t s) {
|
||||
auto const& str = (d->*field).get();
|
||||
auto const& str = get_normalized(d, field);
|
||||
parse_str(str);
|
||||
|
||||
std::string out;
|
||||
|
@ -156,25 +165,25 @@ matcher get_matcher(SearchReplaceSettings const& settings, Accessor&& a) {
|
|||
};
|
||||
}
|
||||
|
||||
bool full_match_only = settings.exact_match;
|
||||
bool match_case = settings.match_case;
|
||||
const bool full_match_only = settings.exact_match;
|
||||
const bool match_case = settings.match_case;
|
||||
std::string look_for = settings.find;
|
||||
|
||||
if (!settings.match_case)
|
||||
boost::to_lower(look_for);
|
||||
look_for = boost::locale::fold_case(look_for);
|
||||
|
||||
return [=](const AssDialogue *diag, size_t start) mutable -> MatchState {
|
||||
auto str = a.get(diag, start);
|
||||
const auto str = a.get(diag, start);
|
||||
if (full_match_only && str.size() != look_for.size())
|
||||
return MatchState();
|
||||
|
||||
if (!match_case)
|
||||
boost::to_lower(str);
|
||||
if (match_case) {
|
||||
const auto pos = str.find(look_for);
|
||||
return pos == std::string::npos ? MatchState() : a.make_match_state(pos, pos + look_for.size());
|
||||
}
|
||||
|
||||
size_t pos = str.find(look_for);
|
||||
if (pos == std::string::npos)
|
||||
return MatchState();
|
||||
|
||||
return a.make_match_state(pos, pos + look_for.size());
|
||||
const auto pos = agi::util::ifind(str, look_for);
|
||||
return pos.first == bad_pos ? MatchState() : a.make_match_state(pos.first, pos.second);
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ SRC = \
|
|||
tests/dialogue_lexer.cpp \
|
||||
tests/hotkey.cpp \
|
||||
tests/iconv.cpp \
|
||||
tests/ifind.cpp \
|
||||
tests/keyframe.cpp \
|
||||
tests/line_iterator.cpp \
|
||||
tests/line_wrap.cpp \
|
||||
|
|
|
@ -18,8 +18,11 @@
|
|||
#include <libaegisub/fs.h>
|
||||
#include <libaegisub/log.h>
|
||||
|
||||
#include <boost/locale/generator.hpp>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
|
||||
std::locale::global(boost::locale::generator().generate(""));
|
||||
|
||||
int retval;
|
||||
agi::log::log = new agi::log::LogSink;
|
||||
|
|
75
aegisub/tests/tests/ifind.cpp
Normal file
75
aegisub/tests/tests/ifind.cpp
Normal file
|
@ -0,0 +1,75 @@
|
|||
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
||||
//
|
||||
// Permission to use, copy, modify, and distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
//
|
||||
// Aegisub Project http://www.aegisub.org/
|
||||
|
||||
#include <libaegisub/util.h>
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#define IFIND(haystack, needle) \
|
||||
std::pair<size_t, size_t> pos; \
|
||||
ASSERT_NO_THROW(pos = agi::util::ifind(haystack, needle))
|
||||
|
||||
#define EXPECT_IFIND(haystack, needle, s, e) \
|
||||
do { \
|
||||
IFIND(haystack, needle); \
|
||||
EXPECT_EQ((size_t)s, pos.first); \
|
||||
EXPECT_EQ((size_t)e, pos.second); \
|
||||
} while(false)
|
||||
|
||||
#define EXPECT_NO_MATCH(haystack, needle) \
|
||||
do { \
|
||||
IFIND(haystack, needle); \
|
||||
EXPECT_EQ((size_t)-1, pos.first); \
|
||||
EXPECT_EQ((size_t)-1, pos.second); \
|
||||
} while(false)
|
||||
|
||||
TEST(lagi_ifind, basic_match) {
|
||||
EXPECT_IFIND(" a ", "a", 1, 2);
|
||||
EXPECT_IFIND(" a ", "A", 1, 2);
|
||||
EXPECT_NO_MATCH(" a ", "b");
|
||||
}
|
||||
|
||||
TEST(lagi_ifind, sharp_s_matches_ss) {
|
||||
// lowercase
|
||||
EXPECT_IFIND(" \xC3\x9F ", "ss", 1, 3);
|
||||
EXPECT_IFIND(" ss ", "\xC3\x9F", 1, 3);
|
||||
|
||||
// uppercase
|
||||
EXPECT_IFIND(" \xE1\xBA\x9E ", "ss", 1, 4);
|
||||
EXPECT_IFIND(" ss ", "\xE1\xBA\x9E", 1, 3);
|
||||
}
|
||||
|
||||
TEST(lagi_ifind, no_partial_match_on_decomposed_character) {
|
||||
EXPECT_NO_MATCH("s\xEF\xAC\x86", "ss"); // LATIN SMALL LIGATURE ST
|
||||
EXPECT_NO_MATCH("\xEF\xAC\x86t", "tt");
|
||||
EXPECT_NO_MATCH(" \xE1\xBA\x9E ", "s");
|
||||
EXPECT_NO_MATCH("\xE1\xBA\x9E", "s");
|
||||
EXPECT_IFIND(" \xE1\xBA\x9E s ", "s", 5, 6);
|
||||
EXPECT_IFIND("s\xE1\xBA\x9E", "ss", 1, 4);
|
||||
EXPECT_IFIND("s\xE1\xBA\x9E", "sss", 0, 4);
|
||||
EXPECT_IFIND("\xE1\xBA\x9Es", "sss", 0, 4);
|
||||
EXPECT_IFIND("\xEF\xAC\x86", "st", 0, 3);
|
||||
}
|
||||
|
||||
TEST(lagi_ifind, correct_index_with_expanded_character_before_match) {
|
||||
// U+0587 turns into U+0565 U+0582, all of which are two bytes in UTF-8
|
||||
EXPECT_IFIND(" \xD6\x87 a ", "a", 4, 5);
|
||||
}
|
||||
|
||||
TEST(lagi_ifind, correct_index_with_shrunk_character_before_match) {
|
||||
// U+FB00 turns into "ff", which is one byte shorter in UTF-8
|
||||
EXPECT_IFIND(" \xEF\xAC\x80 a ", "a", 5, 6);
|
||||
}
|
Loading…
Reference in a new issue