From d9a266938931e7b94a797ac803110fcf5f8c092a Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Wed, 18 Dec 2013 08:41:42 -0800 Subject: [PATCH] Add tool to repack thesaurus dictionaries The standard MyThes dictionaries include words that we don't support (nearly half the entries in the English dictionary have spaces in them) which waste space, and they aren't UTF-8 which slows down loading as we need to convert them. Knocks a total of 10 MB off the thesaurus dictionaries. --- aegisub/tools/Makefile | 7 +++ aegisub/tools/repack-thes-dict.cpp | 91 ++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 aegisub/tools/repack-thes-dict.cpp diff --git a/aegisub/tools/Makefile b/aegisub/tools/Makefile index 6bc78bd28..20045143b 100644 --- a/aegisub/tools/Makefile +++ b/aegisub/tools/Makefile @@ -11,6 +11,13 @@ CLEANFILES += osx-bundle-restart-helper all: osx-bundle-restart-helper endif +CXXFLAGS += -I../libaegisub/include $(CFLAGS_ICU) +LIBS := -L../libaegisub -laegisub $(LIBS) +LIBS += $(LIBS_BOOST) $(LIBS_ICU) $(LIBS_LUA) + +repack-thes-dict: repack-thes-dict.cpp + $(BIN_CXX) -o repack-thes-dict repack-thes-dict.cpp $(CXXFLAGS) $(LIBS) + EXTRA_DIST = \ osx-bundle-restart-helper.c \ osx-bundle.sh \ diff --git a/aegisub/tools/repack-thes-dict.cpp b/aegisub/tools/repack-thes-dict.cpp new file mode 100644 index 000000000..9e2229e8c --- /dev/null +++ b/aegisub/tools/repack-thes-dict.cpp @@ -0,0 +1,91 @@ +// Copyright (c) 2013, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// NOTE(review): the header names of the #include directives below are missing
+// from this copy of the patch (the angle-bracketed text appears to have been
+// stripped). Restore them from the upstream commit before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace {
+using boost::phoenix::placeholders::_1;
+
+// Repack the thesaurus dictionary found at `path`.idx / `path`.dat.
+//
+// Reads the index file `path`.idx and the data file `path`.dat and writes
+// `path`.out.idx and `path`.out.dat. Both output files start with the line
+// "UTF-8". An index line is kept only if it splits on '|' into exactly two
+// fields and the first field (the word) contains no space; for each kept
+// entry the referenced data block (one header line plus the number of
+// following lines announced in the header's second '|' field) is copied to
+// the output data file and the entry's new offset is recorded in the output
+// index.
+//
+// Entries whose data header does not contain at least two '|'-separated
+// fields are skipped rather than indexed.
+//
+// NOTE(review): the template arguments in this function (std::unique_ptr,
+// std::vector, agi::line_iterator) were missing from this copy of the patch
+// and have been restored from how the values are used -- confirm against the
+// upstream commit.
+//
+// @param path Dictionary path without the ".idx"/".dat" extension.
+void convert(std::string const& path) {
+	std::unique_ptr<std::istream> idx(agi::io::Open(path + ".idx"));
+	std::unique_ptr<std::istream> dat(agi::io::Open(path + ".dat"));
+
+	// Index entries are buffered in memory because the entry count has to be
+	// written to the output index before the entries themselves.
+	std::ostringstream idx_out_buffer;
+	agi::io::Save idx_out(path + ".out.idx");
+	agi::io::Save dat_out(path + ".out.dat");
+
+	idx_out.Get() << "UTF-8\n";
+	dat_out.Get() << "UTF-8\n";
+
+	// First line of the index names the encoding of the input files.
+	std::string encoding_name;
+	getline(*idx, encoding_name);
+
+	// Constructed but not referenced again in this function.
+	// NOTE(review): presumably kept so that an unsupported source encoding is
+	// rejected up front -- confirm, otherwise this local can be removed.
+	agi::charset::IconvWrapper conv(encoding_name.c_str(), "utf-8");
+
+	// Second line of the index is read and discarded; the count written to
+	// the output is recomputed from the entries that are actually kept.
+	std::string unused_entry_count;
+	getline(*idx, unused_entry_count);
+
+	int entry_count = 0;
+
+	for (auto const& line : agi::line_iterator<std::string>(*idx, encoding_name)) {
+		std::vector<std::string> chunks;
+		boost::split(chunks, line, _1 == '|');
+		if (chunks.size() != 2)
+			continue;
+		if (chunks[0].find(' ') != std::string::npos)
+			continue;
+
+		// Second index field is the offset of the entry in the data file.
+		dat->seekg(atoi(chunks[1].c_str()));
+		agi::line_iterator<std::string> iter{*dat, encoding_name};
+
+		// Validate the data header before anything is emitted for this entry:
+		// header[1] is read below, so a header without a second field must
+		// not get this far, and skipping here keeps the output index free of
+		// records that have no data block behind them.
+		std::vector<std::string> header;
+		boost::split(header, *iter, _1 == '|');
+		if (header.size() < 2)
+			continue;
+
+		++entry_count;
+
+		// The new offset of the entry is the current write position of the
+		// output data file.
+		idx_out_buffer << chunks[0] << '|' << dat_out.Get().tellp() << '\n';
+		dat_out.Get() << *iter << '\n';
+
+		// Second header field is the number of lines that follow the header.
+		int meanings = atoi(header[1].c_str());
+		for (int i = 0; i < meanings; ++i)
+			dat_out.Get() << *++iter << '\n';
+	}
+
+	idx_out.Get() << entry_count << '\n' << idx_out_buffer.str();
+}
+
+}
+
+// Entry point: expects exactly one argument, the dictionary path without
+// extension, and repacks that dictionary via convert().
+//
+// Returns 1 after printing the usage line when the argument count is wrong.
+int main(int argc, char *argv[]) {
+	if (argc != 2) {
+		printf("usage: repack-thes-dict <dictionary path without extension>\n");
+		return 1;
+	}
+	// The dispatcher callback discards every thunk it is handed.
+	agi::dispatch::Init([](agi::dispatch::Thunk f) { });
+	std::locale::global(boost::locale::generator().generate(""));
+	// NOTE(review): presumably required by libaegisub code that logs; the
+	// sink is never deleted in the code visible here -- confirm intent.
+	agi::log::log = new agi::log::LogSink;
+
+	convert(argv[1]);
+}
+