Use read_file_mapping for the thesaurus
This commit is contained in:
parent
0268ffd345
commit
d454872c00
4 changed files with 35 additions and 23 deletions
|
@ -295,16 +295,14 @@ IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding,
|
|||
IconvWrapper::~IconvWrapper() {
|
||||
}
|
||||
|
||||
std::string IconvWrapper::Convert(std::string const& source) {
|
||||
std::string IconvWrapper::Convert(const char *source, size_t len) {
|
||||
std::string dest;
|
||||
Convert(source, dest);
|
||||
Convert(source, len, dest);
|
||||
return dest;
|
||||
}
|
||||
void IconvWrapper::Convert(std::string const& source, std::string &dest) {
|
||||
void IconvWrapper::Convert(const char *src, size_t srcLen, std::string &dest) {
|
||||
char buff[512];
|
||||
|
||||
const char *src = source.data();
|
||||
size_t srcLen = source.size();
|
||||
size_t res;
|
||||
do {
|
||||
char *dst = buff;
|
||||
|
|
|
@ -19,10 +19,12 @@
|
|||
#include "libaegisub/thesaurus.h"
|
||||
|
||||
#include "libaegisub/charset_conv.h"
|
||||
#include "libaegisub/io.h"
|
||||
#include "libaegisub/file_mapping.h"
|
||||
#include "libaegisub/line_iterator.h"
|
||||
#include "libaegisub/util.h"
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/interprocess/streams/bufferstream.hpp>
|
||||
#include <boost/phoenix/operator/comparison.hpp>
|
||||
#include <boost/phoenix/core/argument.hpp>
|
||||
|
||||
|
@ -33,17 +35,18 @@ using boost::phoenix::placeholders::_1;
|
|||
namespace agi {
|
||||
|
||||
Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path)
|
||||
: dat(io::Open(dat_path))
|
||||
: dat(util::make_unique<read_file_mapping>(dat_path))
|
||||
{
|
||||
auto idx = io::Open(idx_path);
|
||||
read_file_mapping idx_file(idx_path);
|
||||
boost::interprocess::ibufferstream idx(idx_file.read(), static_cast<size_t>(idx_file.size()));
|
||||
|
||||
std::string encoding_name;
|
||||
getline(*idx, encoding_name);
|
||||
getline(idx, encoding_name);
|
||||
std::string unused_entry_count;
|
||||
getline(*idx, unused_entry_count);
|
||||
getline(idx, unused_entry_count);
|
||||
|
||||
// Read the list of words and file offsets for those words
|
||||
for (auto const& line : line_iterator<std::string>(*idx, encoding_name)) {
|
||||
for (auto const& line : line_iterator<std::string>(idx, encoding_name)) {
|
||||
std::vector<std::string> chunks;
|
||||
boost::split(chunks, line, _1 == '|');
|
||||
if (chunks.size() == 2)
|
||||
|
@ -61,25 +64,33 @@ std::vector<Thesaurus::Entry> Thesaurus::Lookup(std::string const& word) {
|
|||
|
||||
auto it = offsets.find(word);
|
||||
if (it == offsets.end()) return out;
|
||||
if (it->second >= dat->size()) return out;
|
||||
|
||||
dat->seekg(it->second, std::ios::beg);
|
||||
if (!dat->good()) return out;
|
||||
auto len = dat->size() - it->second;
|
||||
auto buff = dat->read(it->second, len);
|
||||
auto buff_end = buff + len;
|
||||
|
||||
std::string temp;
|
||||
auto read_line = [&] () -> std::string const& {
|
||||
auto start = buff;
|
||||
auto end = std::find(buff, buff_end, '\n');
|
||||
buff = end < buff_end ? end + 1 : buff_end;
|
||||
if (end > start && end[-1] == '\r') --end;
|
||||
temp.clear();
|
||||
conv->Convert(start, end - start, temp);
|
||||
return temp;
|
||||
};
|
||||
|
||||
// First line is the word and meaning count
|
||||
std::string temp;
|
||||
getline(*dat, temp);
|
||||
std::vector<std::string> header;
|
||||
std::string converted(conv->Convert(temp));
|
||||
boost::split(header, converted, _1 == '|');
|
||||
boost::split(header, read_line(), _1 == '|');
|
||||
if (header.size() != 2) return out;
|
||||
int meanings = atoi(header[1].c_str());
|
||||
|
||||
out.reserve(meanings);
|
||||
for (int i = 0; i < meanings; ++i) {
|
||||
getline(*dat, temp);
|
||||
auto converted = conv->Convert(temp);
|
||||
std::vector<std::string> line;
|
||||
boost::split(line, converted, _1 == '|');
|
||||
boost::split(line, read_line(), _1 == '|');
|
||||
|
||||
if (line.size() < 2)
|
||||
continue;
|
||||
|
|
|
@ -65,11 +65,13 @@ public:
|
|||
/// @return Converted string. Note that std::string always uses a single byte
|
||||
/// terminator, so c_str() may not return a valid string if the dest
|
||||
/// charset has wider terminators
|
||||
std::string Convert(std::string const& source);
|
||||
std::string Convert(std::string const& source) { return Convert(source.c_str(), source.size()); }
|
||||
std::string Convert(const char *source, size_t len);
|
||||
/// @brief Convert a string from the source to destination charset
|
||||
/// @param source String to convert
|
||||
/// @param[out] dest String to place the result in
|
||||
void Convert(std::string const& source, std::string &dest);
|
||||
void Convert(std::string const& source, std::string &dest) { Convert(source.c_str(), source.size(), dest); }
|
||||
void Convert(const char *source, size_t len, std::string &dest);
|
||||
size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize);
|
||||
/// Bare wrapper around iconv; see iconv documentation for details
|
||||
size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
|
||||
|
|
|
@ -26,13 +26,14 @@
|
|||
|
||||
namespace agi {
|
||||
|
||||
class read_file_mapping;
|
||||
namespace charset { class IconvWrapper; }
|
||||
|
||||
class Thesaurus {
|
||||
/// Map of word -> byte position in the data file
|
||||
boost::container::flat_map<std::string, int> offsets;
|
||||
/// Read handle to the data file
|
||||
std::unique_ptr<std::istream> dat;
|
||||
std::unique_ptr<read_file_mapping> dat;
|
||||
/// Converter from the data file's charset to UTF-8
|
||||
std::unique_ptr<charset::IconvWrapper> conv;
|
||||
|
||||
|
|
Loading…
Reference in a new issue