Rewrite SRT parsing to use a more flexible algorithm, allowing more freeform files. Updates #1213.

Originally committed to SVN as r4557.
This commit is contained in:
Niels Martin Hansen 2010-06-20 19:07:43 +00:00
parent 4c01f9a36b
commit 2fc8420ee9

View file

@ -39,6 +39,10 @@
// Headers // Headers
#include "config.h" #include "config.h"
#ifndef AGI_PRE
#include <wx/regex.h>
#endif
#include "ass_dialogue.h" #include "ass_dialogue.h"
#include "ass_file.h" #include "ass_file.h"
#include "subtitle_format_srt.h" #include "subtitle_format_srt.h"
@ -46,6 +50,9 @@
#include "text_file_writer.h" #include "text_file_writer.h"
DEFINE_SIMPLE_EXCEPTION(SRTParseError, SubtitleFormatParseError, "subtitle_io/parse/srt")
/// @brief Can read? /// @brief Can read?
/// @param filename /// @param filename
/// @return /// @return
@ -108,84 +115,134 @@ void SRTSubtitleFormat::ReadFile(wxString filename,wxString encoding) {
// Default // Default
LoadDefault(false); LoadDefault(false);
// Parse file // See parsing algorithm at <http://devel.aegisub.org/wiki/SubtitleFormats/SRT>
int linen = 1;
int fileLine = 0; // "hh:mm:ss,fff --> hh:mm:ss,fff" (e.g. "00:00:04,070 --> 00:00:10,040")
int mode = 0; /// @todo: move the full parsing of SRT timestamps here, instead of having it in AssTime
int lines = 0; wxRegEx timestamp_regex(L"^([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}) --> ([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3})");
long templ; if (!timestamp_regex.IsValid())
AssDialogue *line = NULL; throw agi::InternalError("Parsing SRT: Failed compiling regex", 0);
int state = 1;
int line_num = 0;
int linebreak_debt = 0;
AssDialogue *line = 0;
while (file.HasMoreLines()) { while (file.HasMoreLines()) {
// Reads line wxString text_line = file.ReadLineFromFile();
wxString curLine = file.ReadLineFromFile(); line_num++;
fileLine++; text_line.Trim(true).Trim(false);
if (mode == 0) { switch (state) {
// Checks if there is anything to read case 1:
if (curLine.IsEmpty()) continue; {
// start of file, no subtitles found yet
// Check if it's a line number if (text_line.IsEmpty())
if (!curLine.IsNumber()) { // ignore blank lines
Clear(); break;
if (line) delete line; if (text_line.IsNumber()) {
throw wxString::Format(_T("Parse error on entry %i at line %i (expecting line number). Possible malformed file."),linen,fileLine); // found the line number, throw it away and hope for timestamps
state = 2;
break;
}
if (timestamp_regex.Matches(text_line)) {
goto found_timestamps;
}
char cvtbuf[16]; sprintf(cvtbuf, "%d", line_num);
throw SRTParseError(std::string("Parsing SRT: Expected subtitle index at line ") + cvtbuf, 0);
} }
case 2:
// Read line number {
curLine.ToLong(&templ); // want timestamps
if (templ != linen) { if (timestamp_regex.Matches(text_line) == false) {
linen = templ; // bad format
} char cvtbuf[16]; sprintf(cvtbuf, "%d", line_num);
line = new AssDialogue(); throw SRTParseError(std::string("Parsing SRT: Expected timestamp pair at line ") + cvtbuf, 0);
mode = 1; }
} found_timestamps:
if (line != 0) {
else if (mode == 1) { // finalise active line
// Read timestamps line->ParseSRTTags();
if (curLine.substr(13,3) != _T("-->")) { line = 0;
Clear(); }
if (line) delete line; // create new subtitle
throw wxString::Format(_T("Parse error on entry %i at line %i (expecting timestamps). Possible malformed file."),linen,fileLine); line = new AssDialogue();
} line->group = L"[Events]";
line->Start.ParseSRT(curLine.substr(0,12));
line->End.ParseSRT(curLine.substr(17,12));
mode = 2;
}
else if (mode == 2) {
// Checks if it's done
bool eof = !file.HasMoreLines();
bool isDone = curLine.IsEmpty();
// Append text
if (!isDone) {
if (line->Text != _T("")) line->Text += _T("\\N");
line->Text += curLine;
}
// Done
if (isDone || eof) {
mode = 0;
linen++;
line->group = _T("[Events]");
line->Style = _T("Default"); line->Style = _T("Default");
line->Comment = false; line->Comment = false;
line->ParseSRTTags(); // this parsing should best be moved out of AssTime
line->Start.ParseSRT(timestamp_regex.GetMatch(text_line, 1));
line->End.ParseSRT(timestamp_regex.GetMatch(text_line, 2));
// store pointer to subtitle, we'll continue working on it
Line->push_back(line); Line->push_back(line);
lines++; // next we're reading the text
line = NULL; state = 3;
break;
}
case 3:
{
// reading first line of subtitle text
if (text_line.IsEmpty()) {
// that's not very interesting... blank subtitle?
state = 5;
linebreak_debt = 1;
break;
}
line->Text.Append(text_line);
state = 4;
break;
}
case 4:
{
// reading following line of subtitle text
if (text_line.IsEmpty()) {
// blank line, next may begin a new subtitle
state = 5;
linebreak_debt = 1;
break;
}
line->Text.Append(L"\\N").Append(text_line);
break;
}
case 5:
{
// blank line in subtitle text
linebreak_debt++;
if (text_line.IsEmpty()) {
// multiple blank lines in a row, just add a line break...
break;
}
if (text_line.IsNumber()) {
// must be a subtitle index!
// go for timestamps next
state = 2;
break;
}
if (timestamp_regex.Matches(text_line)) {
goto found_timestamps;
}
// assume it's a continuation of the subtitle text
// resolve our line break debt and append the line text
while (linebreak_debt-- > 0)
line->Text.Append(L"\\N");
line->Text.Append(text_line);
state = 4;
break;
}
default:
{
char cvtbuf[16]; sprintf(cvtbuf, "%d", state);
throw agi::InternalError(std::string("Parsing SRT: Reached unexpected state ") + cvtbuf, 0);
} }
} }
} }
// No lines? if (state == 1 || state == 2) {
if (lines == 0) { throw SRTParseError(std::string("Parsing SRT: Incomplete file"), 0);
line = new AssDialogue(); }
line->group = _T("[Events]");
line->Style = _T("Default"); if (line) {
line->Start.SetMS(0); // an unfinalised line
line->End.SetMS(5000); line->ParseSRTTags();
Line->push_back(line);
} }
} }