Add tool to repack thesaurus dictionaries

The standard MyThes dictionaries include words that we don't support
(nearly half the entries in the English dictionary have speaces in them)
which waste space, and they aren't UTF-8 which slows down loading as we
need to convert them.

Knocks a total of 10 MB off the thesaurus dictionaries.
This commit is contained in:
Thomas Goyne 2013-12-18 08:41:42 -08:00
parent c28a02a34d
commit d9a2669389
2 changed files with 98 additions and 0 deletions

View file

@ -11,6 +11,13 @@ CLEANFILES += osx-bundle-restart-helper
all: osx-bundle-restart-helper
endif
CXXFLAGS += -I../libaegisub/include $(CFLAGS_ICU)
LIBS := -L../libaegisub -laegisub $(LIBS)
LIBS += $(LIBS_BOOST) $(LIBS_ICU) $(LIBS_LUA)
repack-thes-dict: repack-thes-dict.cpp
$(BIN_CXX) -o repack-thes-dict repack-thes-dict.cpp $(CXXFLAGS) $(LIBS)
EXTRA_DIST = \
osx-bundle-restart-helper.c \
osx-bundle.sh \

View file

@ -0,0 +1,91 @@
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include <libaegisub/charset_conv.h>
#include <libaegisub/dispatch.h>
#include <libaegisub/io.h>
#include <libaegisub/line_iterator.h>
#include <libaegisub/log.h>
#include <libaegisub/util.h>
#include <boost/algorithm/string.hpp>
#include <boost/locale/generator.hpp>
#include <boost/phoenix/core/argument.hpp>
#include <boost/phoenix/operator/comparison.hpp>
#include <fstream>
namespace {
using boost::phoenix::placeholders::_1;
void convert(std::string const& path) {
std::unique_ptr<std::ifstream> idx(agi::io::Open(path + ".idx"));
std::unique_ptr<std::ifstream> dat(agi::io::Open(path + ".dat"));
std::ostringstream idx_out_buffer;
agi::io::Save idx_out(path + ".out.idx");
agi::io::Save dat_out(path + ".out.dat");
idx_out.Get() << "UTF-8\n";
dat_out.Get() << "UTF-8\n";
std::string encoding_name;
getline(*idx, encoding_name);
agi::charset::IconvWrapper conv(encoding_name.c_str(), "utf-8");
std::string unused_entry_count;
getline(*idx, unused_entry_count);
int entry_count = 0;
for (auto const& line : agi::line_iterator<std::string>(*idx, encoding_name)) {
std::vector<std::string> chunks;
boost::split(chunks, line, _1 == '|');
if (chunks.size() != 2)
continue;
if (chunks[0].find(' ') != std::string::npos)
continue;
++entry_count;
idx_out_buffer << chunks[0] << '|' << dat_out.Get().tellp() << '\n';
dat->seekg(atoi(chunks[1].c_str()));
agi::line_iterator<std::string> iter{*dat, encoding_name};
dat_out.Get() << *iter << '\n';
std::vector<std::string> header;
boost::split(header, *iter, _1 == '|');
int meanings = atoi(header[1].c_str());
for (int i = 0; i < meanings; ++i)
dat_out.Get() << *++iter << '\n';
}
idx_out.Get() << entry_count << '\n' << idx_out_buffer.str();
}
}
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("usage: respack-thes-dict <path-to-dict-without-extension>\n");
return 1;
}
agi::dispatch::Init([](agi::dispatch::Thunk f) { });
std::locale::global(boost::locale::generator().generate(""));
agi::log::log = new agi::log::LogSink;
convert(argv[1]);
}