2012-01-08 01:36:50 +00:00
|
|
|
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, and distribute this software for any
|
|
|
|
// purpose with or without fee is hereby granted, provided that the above
|
|
|
|
// copyright notice and this permission notice appear in all copies.
|
|
|
|
//
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
|
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
|
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
|
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
|
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
|
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
|
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
|
|
|
|
/// @file thesaurus.cpp
|
|
|
|
/// @brief MyThes-compatible thesaurus implementation
|
|
|
|
/// @ingroup libaegisub thesaurus
|
|
|
|
|
|
|
|
#include "libaegisub/thesaurus.h"
|
|
|
|
|
|
|
|
#include "libaegisub/charset_conv.h"
|
2014-03-21 13:12:56 -07:00
|
|
|
#include "libaegisub/file_mapping.h"
|
2012-01-08 01:36:50 +00:00
|
|
|
#include "libaegisub/line_iterator.h"
|
2014-03-21 13:12:56 -07:00
|
|
|
#include "libaegisub/util.h"
|
2012-01-08 01:36:50 +00:00
|
|
|
|
2012-11-13 06:07:39 -08:00
|
|
|
#include <boost/algorithm/string.hpp>
|
2014-03-21 13:12:56 -07:00
|
|
|
#include <boost/interprocess/streams/bufferstream.hpp>
|
2012-11-13 06:07:39 -08:00
|
|
|
#include <boost/phoenix/operator/comparison.hpp>
|
|
|
|
#include <boost/phoenix/core/argument.hpp>
|
|
|
|
|
|
|
|
using boost::phoenix::placeholders::_1;
|
2012-01-08 01:36:50 +00:00
|
|
|
|
|
|
|
namespace agi {
|
|
|
|
|
2013-01-04 07:01:50 -08:00
|
|
|
/// @brief Load a thesaurus from a data file and its accompanying index file
/// @param dat_path Path to the data file; the mapping is kept for later lookups
/// @param idx_path Path to the index file; only read while constructing
///
/// The index is read as: one line naming the character encoding of both
/// files, one line holding an entry count (read and discarded), and then one
/// "word|offset" line per entry. Lines which do not split into exactly two
/// fields are skipped.
Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path)
: dat(util::make_unique<read_file_mapping>(dat_path))
{
	read_file_mapping idx_file(idx_path);
	boost::interprocess::ibufferstream idx(idx_file.read(), static_cast<size_t>(idx_file.size()));

	// getline() only strips the '\n', so for an index with CRLF line endings
	// the '\r' would otherwise stay attached to the value. Lookup() already
	// strips '\r' from data lines; do the same for the header lines here.
	auto strip_cr = [](std::string& str) {
		if (!str.empty() && str.back() == '\r')
			str.pop_back();
	};

	// The encoding name is handed to both the line iterator and the
	// converter below, so it has to be free of line-ending residue
	std::string encoding_name;
	getline(idx, encoding_name);
	strip_cr(encoding_name);

	// The entry count is not needed as the offsets container grows on demand
	std::string unused_entry_count;
	getline(idx, unused_entry_count);
	strip_cr(unused_entry_count);

	// Read the list of words and file offsets for those words
	for (auto const& line : line_iterator<std::string>(idx, encoding_name)) {
		std::vector<std::string> chunks;
		boost::split(chunks, line, _1 == '|');
		if (chunks.size() == 2) {
			// atoi yields 0 for a non-numeric offset; Lookup() validates the
			// offset against the data size and the record header before use
			offsets[chunks[0]] = atoi(chunks[1].c_str());
		}
	}

	// Converter from the files' encoding to UTF-8, used by Lookup()
	conv.reset(new charset::IconvWrapper(encoding_name.c_str(), "utf-8"));
}
|
|
|
|
|
|
|
|
/// Destructor; does nothing beyond destroying the members.
///
/// Defined out of line in this file rather than in the header
/// (NOTE(review): presumably so the member types only need to be complete
/// here -- confirm against thesaurus.h).
Thesaurus::~Thesaurus() = default;
|
|
|
|
|
2013-11-02 07:52:45 -07:00
|
|
|
std::vector<Thesaurus::Entry> Thesaurus::Lookup(std::string const& word) {
|
|
|
|
std::vector<Entry> out;
|
|
|
|
if (!dat.get()) return out;
|
2012-01-08 01:36:50 +00:00
|
|
|
|
2013-11-02 07:52:45 -07:00
|
|
|
auto it = offsets.find(word);
|
|
|
|
if (it == offsets.end()) return out;
|
2014-03-21 13:12:56 -07:00
|
|
|
if (it->second >= dat->size()) return out;
|
2012-01-08 01:36:50 +00:00
|
|
|
|
2014-03-21 13:12:56 -07:00
|
|
|
auto len = dat->size() - it->second;
|
|
|
|
auto buff = dat->read(it->second, len);
|
|
|
|
auto buff_end = buff + len;
|
2012-01-08 01:36:50 +00:00
|
|
|
|
|
|
|
std::string temp;
|
2014-03-21 13:12:56 -07:00
|
|
|
auto read_line = [&] () -> std::string const& {
|
|
|
|
auto start = buff;
|
|
|
|
auto end = std::find(buff, buff_end, '\n');
|
|
|
|
buff = end < buff_end ? end + 1 : buff_end;
|
|
|
|
if (end > start && end[-1] == '\r') --end;
|
|
|
|
temp.clear();
|
|
|
|
conv->Convert(start, end - start, temp);
|
|
|
|
return temp;
|
|
|
|
};
|
|
|
|
|
|
|
|
// First line is the word and meaning count
|
2012-01-08 01:36:50 +00:00
|
|
|
std::vector<std::string> header;
|
2014-03-21 13:12:56 -07:00
|
|
|
boost::split(header, read_line(), _1 == '|');
|
2013-11-02 07:52:45 -07:00
|
|
|
if (header.size() != 2) return out;
|
2012-01-08 01:36:50 +00:00
|
|
|
int meanings = atoi(header[1].c_str());
|
|
|
|
|
2013-11-02 07:52:45 -07:00
|
|
|
out.reserve(meanings);
|
2012-01-08 01:36:50 +00:00
|
|
|
for (int i = 0; i < meanings; ++i) {
|
2013-11-02 07:52:45 -07:00
|
|
|
std::vector<std::string> line;
|
2014-03-21 13:12:56 -07:00
|
|
|
boost::split(line, read_line(), _1 == '|');
|
2012-01-08 01:36:50 +00:00
|
|
|
|
2013-11-02 07:52:45 -07:00
|
|
|
if (line.size() < 2)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Entry e;
|
|
|
|
// The "definition" is just the part of speech (which may be empty)
|
|
|
|
// plus the word it's giving synonyms for (which may not be the passed word)
|
|
|
|
if (!line[0].empty())
|
|
|
|
e.first = line[0] + ' ';
|
|
|
|
e.first += line[1];
|
|
|
|
e.second.reserve(line.size() - 2);
|
|
|
|
|
|
|
|
for (size_t i = 2; i < line.size(); ++i) {
|
|
|
|
if (line[i].size())
|
|
|
|
e.second.emplace_back(std::move(line[i]));
|
|
|
|
}
|
|
|
|
|
|
|
|
out.emplace_back(std::move(e));
|
2012-01-08 01:36:50 +00:00
|
|
|
}
|
2013-11-02 07:52:45 -07:00
|
|
|
|
|
|
|
return out;
|
2012-01-08 01:36:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|