Add tool to repack thesaurus dictionaries
The standard MyThes dictionaries include words that we don't support (nearly half the entries in the English dictionary have spaces in them), which wastes space, and they aren't UTF-8, which slows down loading as we need to convert them. Repacking knocks a total of 10 MB off the thesaurus dictionaries.
This commit is contained in:
parent
c28a02a34d
commit
d9a2669389
2 changed files with 98 additions and 0 deletions
|
@ -11,6 +11,13 @@ CLEANFILES += osx-bundle-restart-helper
|
|||
all: osx-bundle-restart-helper
|
||||
endif
|
||||
|
||||
CXXFLAGS += -I../libaegisub/include $(CFLAGS_ICU)
|
||||
LIBS := -L../libaegisub -laegisub $(LIBS)
|
||||
LIBS += $(LIBS_BOOST) $(LIBS_ICU) $(LIBS_LUA)
|
||||
|
||||
# Builds the repack-thes-dict tool from its single source file in one
# compile-and-link step, using the CXXFLAGS and LIBS values assigned above.
repack-thes-dict: repack-thes-dict.cpp
|
||||
$(BIN_CXX) -o repack-thes-dict repack-thes-dict.cpp $(CXXFLAGS) $(LIBS)
|
||||
|
||||
EXTRA_DIST = \
|
||||
osx-bundle-restart-helper.c \
|
||||
osx-bundle.sh \
|
||||
|
|
91
aegisub/tools/repack-thes-dict.cpp
Normal file
91
aegisub/tools/repack-thes-dict.cpp
Normal file
|
@ -0,0 +1,91 @@
|
|||
// Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
||||
//
|
||||
// Permission to use, copy, modify, and distribute this software for any
|
||||
// purpose with or without fee is hereby granted, provided that the above
|
||||
// copyright notice and this permission notice appear in all copies.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
#include <libaegisub/charset_conv.h>
|
||||
#include <libaegisub/dispatch.h>
|
||||
#include <libaegisub/io.h>
|
||||
#include <libaegisub/line_iterator.h>
|
||||
#include <libaegisub/log.h>
|
||||
#include <libaegisub/util.h>
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/locale/generator.hpp>
|
||||
#include <boost/phoenix/core/argument.hpp>
|
||||
#include <boost/phoenix/operator/comparison.hpp>
|
||||
#include <fstream>
|
||||
|
||||
namespace {
|
||||
using boost::phoenix::placeholders::_1;
|
||||
|
||||
void convert(std::string const& path) {
|
||||
std::unique_ptr<std::ifstream> idx(agi::io::Open(path + ".idx"));
|
||||
std::unique_ptr<std::ifstream> dat(agi::io::Open(path + ".dat"));
|
||||
|
||||
std::ostringstream idx_out_buffer;
|
||||
agi::io::Save idx_out(path + ".out.idx");
|
||||
agi::io::Save dat_out(path + ".out.dat");
|
||||
|
||||
idx_out.Get() << "UTF-8\n";
|
||||
dat_out.Get() << "UTF-8\n";
|
||||
|
||||
std::string encoding_name;
|
||||
getline(*idx, encoding_name);
|
||||
|
||||
agi::charset::IconvWrapper conv(encoding_name.c_str(), "utf-8");
|
||||
|
||||
std::string unused_entry_count;
|
||||
getline(*idx, unused_entry_count);
|
||||
|
||||
int entry_count = 0;
|
||||
|
||||
for (auto const& line : agi::line_iterator<std::string>(*idx, encoding_name)) {
|
||||
std::vector<std::string> chunks;
|
||||
boost::split(chunks, line, _1 == '|');
|
||||
if (chunks.size() != 2)
|
||||
continue;
|
||||
if (chunks[0].find(' ') != std::string::npos)
|
||||
continue;
|
||||
|
||||
++entry_count;
|
||||
|
||||
idx_out_buffer << chunks[0] << '|' << dat_out.Get().tellp() << '\n';
|
||||
dat->seekg(atoi(chunks[1].c_str()));
|
||||
|
||||
agi::line_iterator<std::string> iter{*dat, encoding_name};
|
||||
dat_out.Get() << *iter << '\n';
|
||||
|
||||
std::vector<std::string> header;
|
||||
boost::split(header, *iter, _1 == '|');
|
||||
int meanings = atoi(header[1].c_str());
|
||||
for (int i = 0; i < meanings; ++i)
|
||||
dat_out.Get() << *++iter << '\n';
|
||||
}
|
||||
|
||||
idx_out.Get() << entry_count << '\n' << idx_out_buffer.str();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Entry point: repacks the thesaurus dictionary named by the single
/// command-line argument (the dictionary path without its extension).
///
/// @return 1 when the argument count is wrong; otherwise falls off the end of
///         main after conversion.
int main(int argc, char *argv[]) {
	if (argc != 2) {
		printf("usage: repack-thes-dict <path-to-dict-without-extension>\n");
		return 1;
	}

	// Install a dispatcher that discards every thunk it is handed.
	agi::dispatch::Init([](agi::dispatch::Thunk) { });
	std::locale::global(boost::locale::generator().generate(""));
	// Assigned to the library's global log pointer and intentionally never
	// freed here; the process exits right after the conversion.
	agi::log::log = new agi::log::LogSink;

	convert(argv[1]);
}
|
||||
|
Loading…
Reference in a new issue