forked from mia/Aegisub
Use ICU/boost.locale for case-insensitive searching
Do proper Unicode case-folding for case-insensitive searching rather than converting only ASCII characters to lowercase. The Turkish 'i' is still not handled correctly (since it's the only place where case-folding is locale-dependent), but that's probably not worth caring about as long as we don't have a Turkish UI translation. This affects both the find/replace dialog and the select lines dialog. Closes #1342.
This commit is contained in:
parent
1cc52611de
commit
47c36c9033
11 changed files with 218 additions and 15 deletions
|
@ -76,7 +76,7 @@ CPPFLAGS_WX = @WX_CPPFLAGS@
|
||||||
LIBS_WX = @WX_LIBS@ -lz
|
LIBS_WX = @WX_LIBS@ -lz
|
||||||
|
|
||||||
CPPFLAGS_BOOST = @BOOST_CPPFLAGS@
|
CPPFLAGS_BOOST = @BOOST_CPPFLAGS@
|
||||||
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_REGEX_LDFLAGS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LDFLAGS@ @BOOST_SYSTEM_LIBS@
|
LIBS_BOOST = @BOOST_FILESYSTEM_LDFLAGS@ @BOOST_FILESYSTEM_LIBS@ @BOOST_LOCALE_LIBS@ @BOOST_REGEX_LIBS@ @BOOST_SYSTEM_LIBS@
|
||||||
|
|
||||||
CFLAGS_FFMS2 = @FFMS2_CFLAGS@
|
CFLAGS_FFMS2 = @FFMS2_CFLAGS@
|
||||||
CFLAGS_FFTW3 = @FFTW3_CFLAGS@
|
CFLAGS_FFTW3 = @FFTW3_CFLAGS@
|
||||||
|
|
|
@ -44,6 +44,7 @@
|
||||||
<ClCompile Include="$(SrcDir)tests\dialogue_lexer.cpp" />
|
<ClCompile Include="$(SrcDir)tests\dialogue_lexer.cpp" />
|
||||||
<ClCompile Include="$(SrcDir)tests\hotkey.cpp" />
|
<ClCompile Include="$(SrcDir)tests\hotkey.cpp" />
|
||||||
<ClCompile Include="$(SrcDir)tests\iconv.cpp" />
|
<ClCompile Include="$(SrcDir)tests\iconv.cpp" />
|
||||||
|
<ClCompile Include="$(SrcDir)tests\ifind.cpp" />
|
||||||
<ClCompile Include="$(SrcDir)tests\keyframe.cpp" />
|
<ClCompile Include="$(SrcDir)tests\keyframe.cpp" />
|
||||||
<ClCompile Include="$(SrcDir)tests\line_iterator.cpp" />
|
<ClCompile Include="$(SrcDir)tests\line_iterator.cpp" />
|
||||||
<ClCompile Include="$(SrcDir)tests\line_wrap.cpp" />
|
<ClCompile Include="$(SrcDir)tests\line_wrap.cpp" />
|
||||||
|
|
|
@ -35,6 +35,9 @@
|
||||||
<ClCompile Include="$(SrcDir)tests\iconv.cpp">
|
<ClCompile Include="$(SrcDir)tests\iconv.cpp">
|
||||||
<Filter>Tests</Filter>
|
<Filter>Tests</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
<ClCompile Include="$(SrcDir)tests\ifind.cpp">
|
||||||
|
<Filter>Tests</Filter>
|
||||||
|
</ClCompile>
|
||||||
<ClCompile Include="$(SrcDir)tests\keyframe.cpp">
|
<ClCompile Include="$(SrcDir)tests\keyframe.cpp">
|
||||||
<Filter>Tests</Filter>
|
<Filter>Tests</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
|
|
@ -248,6 +248,7 @@ PKG_CHECK_MODULES(FONTCONFIG, fontconfig >= fontconfig_required_version,
|
||||||
libext=a
|
libext=a
|
||||||
BOOST_REQUIRE([boost_required_version])
|
BOOST_REQUIRE([boost_required_version])
|
||||||
BOOST_FILESYSTEM
|
BOOST_FILESYSTEM
|
||||||
|
BOOST_LOCALE
|
||||||
BOOST_REGEX
|
BOOST_REGEX
|
||||||
|
|
||||||
########
|
########
|
||||||
|
|
|
@ -18,8 +18,45 @@
|
||||||
|
|
||||||
#include "libaegisub/util.h"
|
#include "libaegisub/util.h"
|
||||||
|
|
||||||
|
#include "libaegisub/exception.h"
|
||||||
|
|
||||||
|
#include <boost/locale/boundary.hpp>
|
||||||
|
#include <boost/locale/conversion.hpp>
|
||||||
|
#include <boost/range/algorithm_ext.hpp>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const size_t bad_pos = (size_t)-1;
|
||||||
|
const std::pair<size_t, size_t> bad_match(bad_pos, bad_pos);
|
||||||
|
|
||||||
|
template<typename Iterator>
|
||||||
|
size_t advance_both(Iterator& folded, Iterator& raw) {
|
||||||
|
size_t len;
|
||||||
|
if (*folded == *raw) {
|
||||||
|
len = folded->length();
|
||||||
|
++folded;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// This character was changed by case folding, so refold it and eat the
|
||||||
|
// appropriate number of characters from folded
|
||||||
|
len = boost::locale::fold_case(raw->str()).size();
|
||||||
|
for (size_t folded_consumed = 0; folded_consumed < len; ++folded)
|
||||||
|
folded_consumed += folded->length();
|
||||||
|
}
|
||||||
|
|
||||||
|
++raw;
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<size_t, size_t> find_range(std::string const& haystack, std::string const& needle, size_t start = 0) {
|
||||||
|
const size_t match_start = haystack.find(needle, start);
|
||||||
|
if (match_start == std::string::npos)
|
||||||
|
return bad_match;
|
||||||
|
return std::make_pair(match_start, match_start + needle.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
namespace agi { namespace util {
|
namespace agi { namespace util {
|
||||||
|
|
||||||
std::string strftime(const char *fmt, const tm *tmptr) {
|
std::string strftime(const char *fmt, const tm *tmptr) {
|
||||||
|
@ -33,4 +70,66 @@ std::string strftime(const char *fmt, const tm *tmptr) {
|
||||||
return buff;
|
return buff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle) {
|
||||||
|
const auto folded_hs = boost::locale::fold_case(haystack);
|
||||||
|
const auto folded_n = boost::locale::fold_case(needle);
|
||||||
|
auto match = find_range(folded_hs, folded_n);
|
||||||
|
if (match == bad_match || folded_hs == haystack)
|
||||||
|
return match;
|
||||||
|
|
||||||
|
// We have a match, but the position is an index into the folded string
|
||||||
|
// and we want an index into the unfolded string.
|
||||||
|
|
||||||
|
using namespace boost::locale::boundary;
|
||||||
|
const ssegment_index haystack_characters(character, begin(haystack), end(haystack));
|
||||||
|
const ssegment_index folded_characters(character, begin(folded_hs), end(folded_hs));
|
||||||
|
const size_t haystack_char_count = boost::distance(haystack_characters);
|
||||||
|
const size_t folded_char_count = boost::distance(folded_characters);
|
||||||
|
|
||||||
|
// As of Unicode 6.2, case folding can never reduce the number of
|
||||||
|
// characters, and can only reduce the number of bytes with UTF-8 when
|
||||||
|
// increasing the number of characters. As a result, iff the bytes and
|
||||||
|
// characters are unchanged, no folds changed the size of any characters
|
||||||
|
// and our indices are correct.
|
||||||
|
if (haystack.size() == folded_hs.size() && haystack_char_count == folded_char_count)
|
||||||
|
return match;
|
||||||
|
|
||||||
|
const auto map_folded_to_raw = [&]() -> std::pair<size_t, size_t> {
|
||||||
|
size_t start = -1;
|
||||||
|
|
||||||
|
// Iterate over each pair of characters and refold each character which was
|
||||||
|
// changed by folding, so that we can find the corresponding positions in
|
||||||
|
// the unfolded string
|
||||||
|
auto folded_it = begin(folded_characters);
|
||||||
|
auto haystack_it = begin(haystack_characters);
|
||||||
|
size_t folded_pos = 0;
|
||||||
|
|
||||||
|
while (folded_pos < match.first)
|
||||||
|
folded_pos += advance_both(folded_it, haystack_it);
|
||||||
|
// If we overshot the start then the match started in the middle of a
|
||||||
|
// character which was folded to multiple characters
|
||||||
|
if (folded_pos > match.first)
|
||||||
|
return bad_match;
|
||||||
|
|
||||||
|
start = distance(begin(haystack), begin(*haystack_it));
|
||||||
|
|
||||||
|
while (folded_pos < match.second)
|
||||||
|
folded_pos += advance_both(folded_it, haystack_it);
|
||||||
|
if (folded_pos > match.second)
|
||||||
|
return bad_match;
|
||||||
|
|
||||||
|
return std::make_pair(start, distance(begin(haystack), begin(*haystack_it)));
|
||||||
|
};
|
||||||
|
|
||||||
|
auto ret = map_folded_to_raw();
|
||||||
|
while (ret == bad_match) {
|
||||||
|
// Found something, but it was an invalid match so retry from the next character
|
||||||
|
match = find_range(folded_hs, folded_n, match.first + 1);
|
||||||
|
if (match == bad_match) return match;
|
||||||
|
ret = map_folded_to_raw();
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
} }
|
} }
|
||||||
|
|
|
@ -41,6 +41,16 @@ namespace agi {
|
||||||
/// @return The strftime-formatted string
|
/// @return The strftime-formatted string
|
||||||
std::string strftime(const char *fmt, const tm *tmptr = nullptr);
|
std::string strftime(const char *fmt, const tm *tmptr = nullptr);
|
||||||
|
|
||||||
|
/// Case-insensitive find with proper case folding
|
||||||
|
/// @param haystack String to search
|
||||||
|
/// @param needle String to look for
|
||||||
|
/// @return make_pair(-1,-1) if `needle` could not be found, or a range equivalent to `needle` in `haystack` if it could
|
||||||
|
///
|
||||||
|
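/// `needle` and `haystack` must both be in Normalization Form D
/// (NOTE(review): get_normalized() in the search/replace engine calls
/// boost::locale::normalize() with its default form, which Boost documents as
/// NFC rather than NFD -- confirm which form is actually required). The size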
|
||||||
|
/// of the match might be different from the size of `needle`, since it's
|
||||||
|
/// based on the unfolded length.
|
||||||
|
std::pair<size_t, size_t> ifind(std::string const& haystack, std::string const& needle);
|
||||||
|
|
||||||
struct delete_ptr {
|
struct delete_ptr {
|
||||||
template<class T>
|
template<class T>
|
||||||
void operator()(T* ptr) const {
|
void operator()(T* ptr) const {
|
||||||
|
|
|
@ -36,6 +36,7 @@
|
||||||
#include <wx/checkbox.h>
|
#include <wx/checkbox.h>
|
||||||
#include <wx/combobox.h>
|
#include <wx/combobox.h>
|
||||||
#include <wx/radiobox.h>
|
#include <wx/radiobox.h>
|
||||||
|
#include <wx/msgdlg.h>
|
||||||
#include <wx/sizer.h>
|
#include <wx/sizer.h>
|
||||||
#include <wx/stattext.h>
|
#include <wx/stattext.h>
|
||||||
#include <wx/textctrl.h>
|
#include <wx/textctrl.h>
|
||||||
|
|
|
@ -25,8 +25,9 @@
|
||||||
#include "text_selection_controller.h"
|
#include "text_selection_controller.h"
|
||||||
|
|
||||||
#include <libaegisub/of_type_adaptor.h>
|
#include <libaegisub/of_type_adaptor.h>
|
||||||
|
#include <libaegisub/util.h>
|
||||||
|
|
||||||
#include <boost/algorithm/string/case_conv.hpp>
|
#include <boost/locale.hpp>
|
||||||
|
|
||||||
#include <wx/msgdlg.h>
|
#include <wx/msgdlg.h>
|
||||||
|
|
||||||
|
@ -43,6 +44,14 @@ auto get_dialogue_field(SearchReplaceSettings::Field field) -> decltype(&AssDial
|
||||||
throw agi::InternalError("Bad field for search", 0);
|
throw agi::InternalError("Bad field for search", 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string const& get_normalized(const AssDialogue *diag, decltype(&AssDialogue::Text) field) {
|
||||||
|
auto& value = const_cast<AssDialogue*>(diag)->*field;
|
||||||
|
auto normalized = boost::locale::normalize(value.get());
|
||||||
|
if (normalized != value)
|
||||||
|
value = normalized;
|
||||||
|
return value.get();
|
||||||
|
}
|
||||||
|
|
||||||
typedef std::function<MatchState (const AssDialogue*, size_t)> matcher;
|
typedef std::function<MatchState (const AssDialogue*, size_t)> matcher;
|
||||||
|
|
||||||
class noop_accessor {
|
class noop_accessor {
|
||||||
|
@ -54,7 +63,7 @@ public:
|
||||||
|
|
||||||
std::string get(const AssDialogue *d, size_t s) {
|
std::string get(const AssDialogue *d, size_t s) {
|
||||||
start = s;
|
start = s;
|
||||||
return (d->*field).get().substr(s);
|
return get_normalized(d, field).substr(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) {
|
MatchState make_match_state(size_t s, size_t e, boost::u32regex *r = nullptr) {
|
||||||
|
@ -87,7 +96,7 @@ public:
|
||||||
skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { }
|
skip_tags_accessor(SearchReplaceSettings::Field f) : field(get_dialogue_field(f)), start(0) { }
|
||||||
|
|
||||||
std::string get(const AssDialogue *d, size_t s) {
|
std::string get(const AssDialogue *d, size_t s) {
|
||||||
auto const& str = (d->*field).get();
|
auto const& str = get_normalized(d, field);
|
||||||
parse_str(str);
|
parse_str(str);
|
||||||
|
|
||||||
std::string out;
|
std::string out;
|
||||||
|
@ -156,25 +165,25 @@ matcher get_matcher(SearchReplaceSettings const& settings, Accessor&& a) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
bool full_match_only = settings.exact_match;
|
const bool full_match_only = settings.exact_match;
|
||||||
bool match_case = settings.match_case;
|
const bool match_case = settings.match_case;
|
||||||
std::string look_for = settings.find;
|
std::string look_for = settings.find;
|
||||||
|
|
||||||
if (!settings.match_case)
|
if (!settings.match_case)
|
||||||
boost::to_lower(look_for);
|
look_for = boost::locale::fold_case(look_for);
|
||||||
|
|
||||||
return [=](const AssDialogue *diag, size_t start) mutable -> MatchState {
|
return [=](const AssDialogue *diag, size_t start) mutable -> MatchState {
|
||||||
auto str = a.get(diag, start);
|
const auto str = a.get(diag, start);
|
||||||
if (full_match_only && str.size() != look_for.size())
|
if (full_match_only && str.size() != look_for.size())
|
||||||
return MatchState();
|
return MatchState();
|
||||||
|
|
||||||
if (!match_case)
|
if (match_case) {
|
||||||
boost::to_lower(str);
|
const auto pos = str.find(look_for);
|
||||||
|
return pos == std::string::npos ? MatchState() : a.make_match_state(pos, pos + look_for.size());
|
||||||
|
}
|
||||||
|
|
||||||
size_t pos = str.find(look_for);
|
const auto pos = agi::util::ifind(str, look_for);
|
||||||
if (pos == std::string::npos)
|
return pos.first == bad_pos ? MatchState() : a.make_match_state(pos.first, pos.second);
|
||||||
return MatchState();
|
|
||||||
|
|
||||||
return a.make_match_state(pos, pos + look_for.size());
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ SRC = \
|
||||||
tests/dialogue_lexer.cpp \
|
tests/dialogue_lexer.cpp \
|
||||||
tests/hotkey.cpp \
|
tests/hotkey.cpp \
|
||||||
tests/iconv.cpp \
|
tests/iconv.cpp \
|
||||||
|
tests/ifind.cpp \
|
||||||
tests/keyframe.cpp \
|
tests/keyframe.cpp \
|
||||||
tests/line_iterator.cpp \
|
tests/line_iterator.cpp \
|
||||||
tests/line_wrap.cpp \
|
tests/line_wrap.cpp \
|
||||||
|
|
|
@ -18,8 +18,11 @@
|
||||||
#include <libaegisub/fs.h>
|
#include <libaegisub/fs.h>
|
||||||
#include <libaegisub/log.h>
|
#include <libaegisub/log.h>
|
||||||
|
|
||||||
|
#include <boost/locale/generator.hpp>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
|
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
|
||||||
|
std::locale::global(boost::locale::generator().generate(""));
|
||||||
|
|
||||||
int retval;
|
int retval;
|
||||||
agi::log::log = new agi::log::LogSink;
|
agi::log::log = new agi::log::LogSink;
|
||||||
|
|
75
aegisub/tests/tests/ifind.cpp
Normal file
75
aegisub/tests/tests/ifind.cpp
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and distribute this software for any
|
||||||
|
// purpose with or without fee is hereby granted, provided that the above
|
||||||
|
// copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
//
|
||||||
|
// Aegisub Project http://www.aegisub.org/
|
||||||
|
|
||||||
|
#include <libaegisub/util.h>
|
||||||
|
|
||||||
|
#include "main.h"
|
||||||
|
|
||||||
|
// Declare a local `pos` and store the result of
// agi::util::ifind(haystack, needle) in it. The call is wrapped in
// ASSERT_NO_THROW so an exception from ifind is reported as a test failure.
// Arguments are parenthesized so arbitrary expressions expand safely.
#define IFIND(haystack, needle) \
	std::pair<size_t, size_t> pos; \
	ASSERT_NO_THROW(pos = agi::util::ifind((haystack), (needle)))

// Expect ifind to report the range [s, e) for `needle` in `haystack`.
// `pos` is scoped to the do-block, so the macro can be used repeatedly in
// one test body.
#define EXPECT_IFIND(haystack, needle, s, e) \
	do { \
		IFIND(haystack, needle); \
		EXPECT_EQ(static_cast<size_t>(s), pos.first); \
		EXPECT_EQ(static_cast<size_t>(e), pos.second); \
	} while(false)

// Expect ifind to report no match, i.e. (size_t)-1 for both ends of the range
#define EXPECT_NO_MATCH(haystack, needle) \
	do { \
		IFIND(haystack, needle); \
		EXPECT_EQ(static_cast<size_t>(-1), pos.first); \
		EXPECT_EQ(static_cast<size_t>(-1), pos.second); \
	} while(false)
|
||||||
|
|
||||||
|
// Plain ASCII haystack: checks a same-case hit, a hit where only the needle's
// case differs, and a miss.
TEST(lagi_ifind, basic_match) {
|
||||||
|
EXPECT_IFIND(" a ", "a", 1, 2);
|
||||||
|
// The needle's case must not matter
EXPECT_IFIND(" a ", "A", 1, 2);
|
||||||
|
EXPECT_NO_MATCH(" a ", "b");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sharp s and "ss" must match each other in either direction. The expected
// ranges are byte offsets into the haystack as passed in, so the length of
// the match follows the haystack's encoding rather than the needle's.
TEST(lagi_ifind, sharp_s_matches_ss) {
|
||||||
|
// lowercase: U+00DF LATIN SMALL LETTER SHARP S, two bytes in UTF-8
|
||||||
|
EXPECT_IFIND(" \xC3\x9F ", "ss", 1, 3);
|
||||||
|
EXPECT_IFIND(" ss ", "\xC3\x9F", 1, 3);
|
||||||
|
|
||||||
|
// uppercase: U+1E9E LATIN CAPITAL LETTER SHARP S, three bytes in UTF-8
|
||||||
|
EXPECT_IFIND(" \xE1\xBA\x9E ", "ss", 1, 4);
|
||||||
|
EXPECT_IFIND(" ss ", "\xE1\xBA\x9E", 1, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// A match must never begin or end inside a single haystack character that
// case-folds to several characters; such a character matches either as a
// whole or not at all.
TEST(lagi_ifind, no_partial_match_on_decomposed_character) {
|
||||||
|
EXPECT_NO_MATCH("s\xEF\xAC\x86", "ss"); // U+FB06 LATIN SMALL LIGATURE ST
|
||||||
|
EXPECT_NO_MATCH("\xEF\xAC\x86t", "tt");
|
||||||
|
// U+1E9E folds to "ss", but a lone "s" is only half of it
EXPECT_NO_MATCH(" \xE1\xBA\x9E ", "s");
|
||||||
|
EXPECT_NO_MATCH("\xE1\xBA\x9E", "s");
|
||||||
|
// The candidates inside the folded sharp s are rejected and the search
// carries on to the standalone "s" that follows
EXPECT_IFIND(" \xE1\xBA\x9E s ", "s", 5, 6);
|
||||||
|
EXPECT_IFIND("s\xE1\xBA\x9E", "ss", 1, 4);
|
||||||
|
// Matches that cover the expanding character completely are fine
EXPECT_IFIND("s\xE1\xBA\x9E", "sss", 0, 4);
|
||||||
|
EXPECT_IFIND("\xE1\xBA\x9Es", "sss", 0, 4);
|
||||||
|
EXPECT_IFIND("\xEF\xAC\x86", "st", 0, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The returned offsets must index the original haystack even when a
// character before the match grows in byte length when folded.
TEST(lagi_ifind, correct_index_with_expanded_character_before_match) {
|
||||||
|
// U+0587 turns into U+0565 U+0582, all of which are two bytes in UTF-8
|
||||||
|
EXPECT_IFIND(" \xD6\x87 a ", "a", 4, 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The returned offsets must index the original haystack even when a
// character before the match shrinks in byte length when folded.
TEST(lagi_ifind, correct_index_with_shrunk_character_before_match) {
|
||||||
|
// U+FB00 turns into "ff", which is one byte shorter in UTF-8
|
||||||
|
EXPECT_IFIND(" \xEF\xAC\x80 a ", "a", 5, 6);
|
||||||
|
}
|
Loading…
Reference in a new issue