Use ICU directly for character counting
ICU docs say not to create a new break iterator each time as boost.locale does, and in fact creating the break iterator is about 90% of the run time of the character counter, so use ICU directly and cache the break iterator.
This commit is contained in:
parent
af32733797
commit
2de95818db
1 changed files with 43 additions and 12 deletions
|
@ -17,28 +17,59 @@
|
||||||
#include "libaegisub/character_count.h"
|
#include "libaegisub/character_count.h"
|
||||||
|
|
||||||
#include "libaegisub/ass/dialogue_parser.h"
|
#include "libaegisub/ass/dialogue_parser.h"
|
||||||
|
#include "libaegisub/exception.h"
|
||||||
|
|
||||||
#include <boost/locale/boundary.hpp>
|
|
||||||
#include <boost/range/algorithm_ext.hpp>
|
|
||||||
#include <unicode/uchar.h>
|
#include <unicode/uchar.h>
|
||||||
#include <unicode/utf8.h>
|
#include <unicode/utf8.h>
|
||||||
|
|
||||||
|
#include <mutex>
|
||||||
|
#include <unicode/brkiter.h>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
struct utext_deleter {
|
||||||
|
void operator()(UText *ut) { if (ut) utext_close(ut); }
|
||||||
|
};
|
||||||
|
using utext_ptr = std::unique_ptr<UText, utext_deleter>;
|
||||||
|
|
||||||
|
template<typename Iterator>
|
||||||
|
utext_ptr to_utext(Iterator begin, Iterator end) {
|
||||||
|
UErrorCode err = U_ZERO_ERROR;
|
||||||
|
utext_ptr ret(utext_openUTF8(nullptr, &*begin, end - begin, &err));
|
||||||
|
if (U_FAILURE(err)) throw agi::InternalError("Failed to open utext", nullptr);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Iterator>
|
template <typename Iterator>
|
||||||
size_t count_in_range(Iterator begin, Iterator end, bool ignore_whitespace) {
|
size_t count_in_range(Iterator begin, Iterator end, bool ignore_whitespace) {
|
||||||
using namespace boost::locale::boundary;
|
if (begin == end) return 0;
|
||||||
const ssegment_index characters(character, begin, end);
|
|
||||||
if (!ignore_whitespace)
|
static std::unique_ptr<icu::BreakIterator> character_bi;
|
||||||
return boost::distance(characters);
|
static std::once_flag token;
|
||||||
|
std::call_once(token, [&] {
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
character_bi.reset(BreakIterator::createCharacterInstance(Locale::getDefault(), status));
|
||||||
|
if (U_FAILURE(status)) throw agi::InternalError("Failed to create character iterator", nullptr);
|
||||||
|
});
|
||||||
|
|
||||||
|
UErrorCode err = U_ZERO_ERROR;
|
||||||
|
|
||||||
|
utext_ptr ut = to_utext(begin, end);
|
||||||
|
character_bi->setText(ut.get(), err);
|
||||||
|
if (U_FAILURE(err)) throw agi::InternalError("Failed to set break iterator text", nullptr);
|
||||||
|
|
||||||
// characters.rule(word_any) doesn't seem to work for character indexes (everything is word_none)
|
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
for (auto const& chr : characters) {
|
auto pos = character_bi->first();
|
||||||
UChar32 c;
|
for (auto end = character_bi->next(); end != BreakIterator::DONE; pos = end, end = character_bi->next()) {
|
||||||
int i = 0;
|
if (!ignore_whitespace)
|
||||||
U8_NEXT_UNSAFE(chr.begin(), i, c);
|
|
||||||
if (!u_isUWhiteSpace(c))
|
|
||||||
++count;
|
++count;
|
||||||
|
else {
|
||||||
|
UChar32 c;
|
||||||
|
int i = 0;
|
||||||
|
U8_NEXT_UNSAFE(begin + pos, i, c);
|
||||||
|
if (!u_isUWhiteSpace(c))
|
||||||
|
++count;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue