Use read_file_mapping for the thesaurus

This commit is contained in:
Thomas Goyne 2014-03-21 13:12:56 -07:00
parent 0268ffd345
commit d454872c00
4 changed files with 35 additions and 23 deletions

View file

@ -295,16 +295,14 @@ IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding,
IconvWrapper::~IconvWrapper() { IconvWrapper::~IconvWrapper() {
} }
std::string IconvWrapper::Convert(std::string const& source) { std::string IconvWrapper::Convert(const char *source, size_t len) {
std::string dest; std::string dest;
Convert(source, dest); Convert(source, len, dest);
return dest; return dest;
} }
void IconvWrapper::Convert(std::string const& source, std::string &dest) { void IconvWrapper::Convert(const char *src, size_t srcLen, std::string &dest) {
char buff[512]; char buff[512];
const char *src = source.data();
size_t srcLen = source.size();
size_t res; size_t res;
do { do {
char *dst = buff; char *dst = buff;

View file

@ -19,10 +19,12 @@
#include "libaegisub/thesaurus.h" #include "libaegisub/thesaurus.h"
#include "libaegisub/charset_conv.h" #include "libaegisub/charset_conv.h"
#include "libaegisub/io.h" #include "libaegisub/file_mapping.h"
#include "libaegisub/line_iterator.h" #include "libaegisub/line_iterator.h"
#include "libaegisub/util.h"
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
#include <boost/interprocess/streams/bufferstream.hpp>
#include <boost/phoenix/operator/comparison.hpp> #include <boost/phoenix/operator/comparison.hpp>
#include <boost/phoenix/core/argument.hpp> #include <boost/phoenix/core/argument.hpp>
@ -33,17 +35,18 @@ using boost::phoenix::placeholders::_1;
namespace agi { namespace agi {
Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path) Thesaurus::Thesaurus(agi::fs::path const& dat_path, agi::fs::path const& idx_path)
: dat(io::Open(dat_path)) : dat(util::make_unique<read_file_mapping>(dat_path))
{ {
auto idx = io::Open(idx_path); read_file_mapping idx_file(idx_path);
boost::interprocess::ibufferstream idx(idx_file.read(), static_cast<size_t>(idx_file.size()));
std::string encoding_name; std::string encoding_name;
getline(*idx, encoding_name); getline(idx, encoding_name);
std::string unused_entry_count; std::string unused_entry_count;
getline(*idx, unused_entry_count); getline(idx, unused_entry_count);
// Read the list of words and file offsets for those words // Read the list of words and file offsets for those words
for (auto const& line : line_iterator<std::string>(*idx, encoding_name)) { for (auto const& line : line_iterator<std::string>(idx, encoding_name)) {
std::vector<std::string> chunks; std::vector<std::string> chunks;
boost::split(chunks, line, _1 == '|'); boost::split(chunks, line, _1 == '|');
if (chunks.size() == 2) if (chunks.size() == 2)
@ -61,25 +64,33 @@ std::vector<Thesaurus::Entry> Thesaurus::Lookup(std::string const& word) {
auto it = offsets.find(word); auto it = offsets.find(word);
if (it == offsets.end()) return out; if (it == offsets.end()) return out;
if (it->second >= dat->size()) return out;
dat->seekg(it->second, std::ios::beg); auto len = dat->size() - it->second;
if (!dat->good()) return out; auto buff = dat->read(it->second, len);
auto buff_end = buff + len;
std::string temp;
auto read_line = [&] () -> std::string const& {
auto start = buff;
auto end = std::find(buff, buff_end, '\n');
buff = end < buff_end ? end + 1 : buff_end;
if (end > start && end[-1] == '\r') --end;
temp.clear();
conv->Convert(start, end - start, temp);
return temp;
};
// First line is the word and meaning count // First line is the word and meaning count
std::string temp;
getline(*dat, temp);
std::vector<std::string> header; std::vector<std::string> header;
std::string converted(conv->Convert(temp)); boost::split(header, read_line(), _1 == '|');
boost::split(header, converted, _1 == '|');
if (header.size() != 2) return out; if (header.size() != 2) return out;
int meanings = atoi(header[1].c_str()); int meanings = atoi(header[1].c_str());
out.reserve(meanings); out.reserve(meanings);
for (int i = 0; i < meanings; ++i) { for (int i = 0; i < meanings; ++i) {
getline(*dat, temp);
auto converted = conv->Convert(temp);
std::vector<std::string> line; std::vector<std::string> line;
boost::split(line, converted, _1 == '|'); boost::split(line, read_line(), _1 == '|');
if (line.size() < 2) if (line.size() < 2)
continue; continue;

View file

@ -65,11 +65,13 @@ public:
/// @return Converted string. Note that std::string always uses a single byte /// @return Converted string. Note that std::string always uses a single byte
/// terminator, so c_str() may not return a valid string if the dest /// terminator, so c_str() may not return a valid string if the dest
/// charset has wider terminators /// charset has wider terminators
std::string Convert(std::string const& source); std::string Convert(std::string const& source) { return Convert(source.c_str(), source.size()); }
std::string Convert(const char *source, size_t len);
/// @brief Convert a string from the source to destination charset /// @brief Convert a string from the source to destination charset
/// @param source String to convert /// @param source String to convert
/// @param[out] dest String to place the result in /// @param[out] dest String to place the result in
void Convert(std::string const& source, std::string &dest); void Convert(std::string const& source, std::string &dest) { Convert(source.c_str(), source.size(), dest); }
void Convert(const char *source, size_t len, std::string &dest);
size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize); size_t Convert(const char* source, size_t sourceSize, char* dest, size_t destSize);
/// Bare wrapper around iconv; see iconv documentation for details /// Bare wrapper around iconv; see iconv documentation for details
size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); size_t Convert(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);

View file

@ -26,13 +26,14 @@
namespace agi { namespace agi {
class read_file_mapping;
namespace charset { class IconvWrapper; } namespace charset { class IconvWrapper; }
class Thesaurus { class Thesaurus {
/// Map of word -> byte position in the data file /// Map of word -> byte position in the data file
boost::container::flat_map<std::string, int> offsets; boost::container::flat_map<std::string, int> offsets;
/// Read handle to the data file /// Read handle to the data file
std::unique_ptr<std::istream> dat; std::unique_ptr<read_file_mapping> dat;
/// Converter from the data file's charset to UTF-8 /// Converter from the data file's charset to UTF-8
std::unique_ptr<charset::IconvWrapper> conv; std::unique_ptr<charset::IconvWrapper> conv;