Use mmap for reading subtitles from Matroska files

Cuts the cold-cache read time for an arbitrary 1 GB file, read over a network
from a USB 2.0 hard drive, from ~70 seconds to ~45 seconds.
This commit is contained in:
Thomas Goyne 2014-03-20 19:40:05 -07:00
parent 00b4d6908f
commit 0c9f39ca25

View file

@ -43,46 +43,78 @@
#include "dialog_progress.h" #include "dialog_progress.h"
#include "MatroskaParser.h" #include "MatroskaParser.h"
#include <libaegisub/file_mapping.h>
#include <libaegisub/fs.h> #include <libaegisub/fs.h>
#include <libaegisub/scoped_ptr.h> #include <libaegisub/scoped_ptr.h>
#include <algorithm> #include <algorithm>
#include <boost/algorithm/string/classification.hpp> #include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/replace.hpp> #include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/format.hpp> #include <boost/format.hpp>
#include <boost/lexical_cast.hpp> #include <boost/lexical_cast.hpp>
#include <boost/range/irange.hpp> #include <boost/range/irange.hpp>
#include <boost/tokenizer.hpp> #include <boost/tokenizer.hpp>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <iterator> #include <iterator>
#include <wx/choicdlg.h> // Keep this last so wxUSE_CHOICEDLG is set. #include <wx/choicdlg.h> // Keep this last so wxUSE_CHOICEDLG is set.
// InputStream implementation handed to the Matroska parser.
// NOTE(review): the lines below interleave two revisions of this type: the
// removed FILE*-based class (fp, int error, out-of-line constructor) and its
// replacement, a struct backed by agi::read_file_mapping (file, std::string
// error). Comments describe the replacement members unless stated otherwise.
class MkvStdIO final : public InputStream { struct MkvStdIO final : InputStream {
public: agi::read_file_mapping file;
MkvStdIO(agi::fs::path const& filename); std::string error;
~MkvStdIO() { if (fp) fclose(fp); }
// Read callback: copies count bytes starting at pos out of the mapping into
// buffer. Returns 0 when pos equals the file size, -1 (after storing the
// exception's chained message in error) if file.read throws agi::Exception,
// and count otherwise.
// NOTE(review): count is not clamped to the bytes remaining after pos; what
// file.read does when pos + count runs past size() is not visible here — confirm.
FILE *fp = nullptr; static int Read(InputStream *st, ulonglong pos, void *buffer, int count) {
int error = 0; auto *self = static_cast<MkvStdIO*>(st);
if (pos == self->file.size())
return 0;
try {
memcpy(buffer, self->file.read(pos, count), count);
}
catch (agi::Exception const& e) {
self->error = e.GetChainedMessage();
return -1;
}
return count;
}
// Scan callback: walks the file one byte at a time from start, shifting each
// byte into a 32-bit rolling value, and returns as soon as that value equals
// signature. Returns -1 if the end of the file is reached without a match or
// if file.read throws agi::Exception (the message is stored in error).
static longlong Scan(InputStream *st, ulonglong start, unsigned signature) {
auto *self = static_cast<MkvStdIO*>(st);
try {
unsigned cmp = 0;
for (auto i : boost::irange(start, self->file.size())) {
// NOTE(review): if file.read yields a pointer to plain char and char is
// signed, bytes >= 0x80 would sign-extend into c and set the upper bits of
// cmp — confirm.
int c = *self->file.read(i, 1);
cmp = ((cmp << 8) | c) & 0xffffffff;
if (cmp == signature)
// NOTE(review): a match completing at index i covers bytes i-3..i, so
// i - 4 is one byte before the signature's first byte — confirm this is
// the offset the parser expects.
return i - 4;
}
}
catch (agi::Exception const& e) {
self->error = e.GetChainedMessage();
}
return -1;
}
// File-size callback: reports the size of the mapped file.
static longlong Size(InputStream *st) {
return static_cast<MkvStdIO*>(st)->file.size();
}
// Constructs the mapping from filename and fills in the InputStream callback
// members: Read/Scan/Size above, a fixed 16 MiB cache size, the stored error
// string, malloc/realloc/free for memory, and a progress callback that
// always returns 1.
MkvStdIO(agi::fs::path const& filename) : file(filename) {
read = &MkvStdIO::Read;
scan = &MkvStdIO::Scan;
getcachesize = [](InputStream *) -> unsigned int { return 16 * 1024 * 1024; };
geterror = [](InputStream *st) -> const char * { return ((MkvStdIO *)st)->error.c_str(); };
memalloc = [](InputStream *, size_t size) { return malloc(size); };
memrealloc = [](InputStream *, void *mem, size_t size) { return realloc(mem, size); };
memfree = [](InputStream *, void *mem) { free(mem); };
progress = [](InputStream *, ulonglong, ulonglong) { return 1; };
getfilesize = &MkvStdIO::Size;
}
}; };
// Size in bytes of the stdio buffer installed with setvbuf in the
// FILE*-based MkvStdIO constructor; also the value its getcachesize
// callback reports.
#define CACHESIZE 1024
// Pick the seek/tell functions used by the StdIo* callbacks:
// _fseeki64/_ftelli64 when __VISUALC__ is defined, fseeko/ftello otherwise.
#ifdef __VISUALC__
#define std_fseek _fseeki64
#define std_ftell _ftelli64
#else
#define std_fseek fseeko
#define std_ftell ftello
#endif
static void read_subtitles(agi::ProgressSink *ps, MatroskaFile *file, MkvStdIO *input, bool srt, double totalTime, AssParser *parser) { static void read_subtitles(agi::ProgressSink *ps, MatroskaFile *file, MkvStdIO *input, bool srt, double totalTime, AssParser *parser) {
std::vector<std::pair<int, std::string>> subList; std::vector<std::pair<int, std::string>> subList;
std::string readBuf;
// Load blocks // Load blocks
ulonglong startTime, endTime, filePos; ulonglong startTime, endTime, filePos;
@ -92,36 +124,42 @@ static void read_subtitles(agi::ProgressSink *ps, MatroskaFile *file, MkvStdIO *
if (ps->IsCancelled()) return; if (ps->IsCancelled()) return;
if (frameSize == 0) continue; if (frameSize == 0) continue;
readBuf.resize(frameSize); const auto readBuf = input->file.read(filePos, frameSize);
std_fseek(input->fp, filePos, SEEK_SET); const auto readBufEnd = readBuf + frameSize;
fread(&readBuf[0], 1, frameSize, input->fp);
// Get start and end times // Get start and end times
longlong timecodeScaleLow = 1000000; longlong timecodeScaleLow = 1000000;
AssTime subStart = startTime / timecodeScaleLow; AssTime subStart = startTime / timecodeScaleLow;
AssTime subEnd = endTime / timecodeScaleLow; AssTime subEnd = endTime / timecodeScaleLow;
using str_range = boost::iterator_range<const char *>;
// Process SSA/ASS // Process SSA/ASS
if (!srt) { if (!srt) {
std::vector<boost::iterator_range<std::string::iterator>> chunks; auto first = std::find(readBuf, readBufEnd, ',');
boost::split(chunks, readBuf, boost::is_any_of(",")); if (first == readBufEnd) continue;
auto second = std::find(first + 1, readBufEnd, ',');
if (second == readBufEnd) continue;
subList.emplace_back( subList.emplace_back(
boost::lexical_cast<int>(chunks[0]), boost::lexical_cast<int>(str_range(readBuf, first)),
str(boost::format("Dialogue: %d,%s,%s,%s") str(boost::format("Dialogue: %d,%s,%s,%s")
% boost::lexical_cast<int>(chunks[1]) % boost::lexical_cast<int>(str_range(first + 1, second))
% subStart.GetAssFormated() % subStart.GetAssFormated()
% subEnd.GetAssFormated() % subEnd.GetAssFormated()
% boost::make_iterator_range(begin(chunks[2]), readBuf.end()))); % str_range(second + 1, readBufEnd)));
} }
// Process SRT // Process SRT
else { else {
readBuf = str(boost::format("Dialogue: 0,%s,%s,Default,,0,0,0,,%s") % subStart.GetAssFormated() % subEnd.GetAssFormated() % readBuf); auto line = str(boost::format("Dialogue: 0,%s,%s,Default,,0,0,0,,%s")
boost::replace_all(readBuf, "\r\n", "\\N"); % subStart.GetAssFormated()
boost::replace_all(readBuf, "\r", "\\N"); % subEnd.GetAssFormated()
boost::replace_all(readBuf, "\n", "\\N"); % str_range(readBuf, readBufEnd));
boost::replace_all(line, "\r\n", "\\N");
boost::replace_all(line, "\r", "\\N");
boost::replace_all(line, "\n", "\\N");
subList.emplace_back(subList.size(), readBuf); subList.emplace_back(subList.size(), std::move(line));
} }
ps->SetProgress(startTime / timecodeScaleLow, totalTime); ps->SetProgress(startTime / timecodeScaleLow, totalTime);
@ -238,70 +276,3 @@ bool MatroskaWrapper::HasSubtitles(agi::fs::path const& filename) {
return false; return false;
} }
/// @brief Read up to count bytes located at absolute offset pos into buffer
/// @return number of bytes read, 0 at end of file, or -1 on failure (errno is
///         saved in the stream's error field)
int StdIoRead(InputStream *_st, ulonglong pos, void *buffer, int count) {
	MkvStdIO *const stream = static_cast<MkvStdIO*>(_st);

	if (std_fseek(stream->fp, pos, SEEK_SET) != 0) {
		stream->error = errno;
		return -1;
	}

	const auto bytesRead = fread(buffer, 1, count, stream->fp);
	if (bytesRead != 0)
		return bytesRead;

	// Nothing came back: hitting end of file is not an error, anything else is
	if (feof(stream->fp))
		return 0;

	stream->error = errno;
	return -1;
}
/// @brief Search forward from file position start for a 32-bit big-endian signature
/// @return offset of the signature's first byte, or -1 if the seek fails or
///         the signature is not found before getc reports EOF
longlong StdIoScan(InputStream *st, ulonglong start, unsigned signature) {
	FILE *const handle = static_cast<MkvStdIO*>(st)->fp;
	if (std_fseek(handle, start, SEEK_SET) != 0)
		return -1;

	// Rolling window holding the four most recently read bytes
	unsigned window = 0;
	for (int byte = getc(handle); byte != EOF; byte = getc(handle)) {
		window = ((window << 8) | byte) & 0xffffffff;
		if (window == signature)
			// The stream now sits just past the signature's last byte
			return std_ftell(handle) - 4;
	}

	return -1;
}
/// @brief Determine the size of the file by seeking to its end
/// @return position reported after seeking to SEEK_END
///
/// The position the stream had on entry is restored before returning.
longlong StdIoGetFileSize(InputStream *st) {
	FILE *const handle = static_cast<MkvStdIO*>(st)->fp;

	const auto savedPos = std_ftell(handle);
	std_fseek(handle, 0, SEEK_END);
	const auto fileSize = std_ftell(handle);
	std_fseek(handle, savedPos, SEEK_SET);

	return fileSize;
}
/// @brief Open filename for binary reading and install the stdio-based callbacks
/// @throws agi::fs::FileNotFound if the file cannot be opened
MkvStdIO::MkvStdIO(agi::fs::path const& filename) {
	// I/O callbacks
	read = StdIoRead;
	scan = StdIoScan;
	getfilesize = StdIoGetFileSize;
	getcachesize = [](InputStream *) -> unsigned int { return CACHESIZE; };
	geterror = [](InputStream *st) -> const char * {
		return strerror(static_cast<MkvStdIO *>(st)->error);
	};
	progress = [](InputStream *, ulonglong, ulonglong) { return 1; };

	// Memory callbacks simply forward to the C allocator
	memalloc = [](InputStream *, size_t size) { return malloc(size); };
	memrealloc = [](InputStream *, void *mem, size_t size) { return realloc(mem, size); };
	memfree = [](InputStream *, void *mem) { free(mem); };

#ifdef __VISUALC__
	fp = _wfopen(filename.c_str(), L"rb");
#else
	fp = fopen(filename.c_str(), "rb");
#endif
	if (fp == nullptr)
		throw agi::fs::FileNotFound(filename);

	// Fully buffered stream with a CACHESIZE-byte buffer allocated by the library
	setvbuf(fp, nullptr, _IOFBF, CACHESIZE);
}