Port new SRT parser from 2.1.9. Updates #1301.

Originally committed to SVN as r5910.
This commit is contained in:
Thomas Goyne 2011-11-25 19:27:51 +00:00
parent 1452f1fbe3
commit 0992a839cc
5 changed files with 353 additions and 263 deletions

View file

@ -243,137 +243,6 @@ wxString AssDialogue::GetSSAText () const {
return GetData(true);
}
void AssDialogue::ParseSRTTags () {
// Search and replace
size_t total = 0;
total += Text.Replace("<i>","{\\i1}");
total += Text.Replace("</i>","{\\i0}");
total += Text.Replace("<b>","{\\b1}");
total += Text.Replace("</b>","{\\b0}");
total += Text.Replace("<u>","{\\u1}");
total += Text.Replace("</u>","{\\u0}");
total += Text.Replace("<s>","{\\s1}");
total += Text.Replace("</s>","{\\s0}");
// Process <font> tag
wxString work = Text;
work.UpperCase();
size_t pos_open = 0;
size_t pos_close = 0;
size_t pos = 0;
size_t end = 0;
size_t start = 0;
bool isOpen;
// Iterate
pos_open = work.find("<FONT",0);
pos_close = work.find("</FONT",0);
while (pos_open != wxString::npos || pos_close != wxString::npos) {
// Determine if it's an open or close tag
if (pos_open < pos_close) {
start = pos_open;
isOpen = true;
}
else {
start = pos_close;
isOpen = false;
}
end = work.find(">",start)+1;
//if (end == wxString::npos) continue;
// Open tag
if (isOpen) {
wxString replaced;
// Color tag
if ((pos = work.find("COLOR=\"",start)) != wxString::npos) {
if (pos < end) {
pos += 7;
size_t end_tag = Text.find("\"",pos);
if (end_tag != wxString::npos) {
if (end_tag-pos == 7) {
replaced += "{\\c&H";
replaced += work.substr(pos+5,2);
replaced += work.substr(pos+3,2);
replaced += work.substr(pos+1,2);
replaced += "&}";
total++;
}
}
}
}
// Face tag
if ((pos = work.find("FACE=\"",start)) != wxString::npos) {
if (pos < end) {
pos += 6;
size_t end_tag = work.find("\"",pos);
if (end_tag != wxString::npos) {
replaced += "{\\fn";
replaced += work.substr(pos,end_tag-pos);
replaced += "}";
total++;
}
}
}
// Size tag
if ((pos = work.find("SIZE=\"",start)) != wxString::npos) {
if (pos < end) {
pos += 6;
size_t end_tag = Text.find("\"",pos);
if (end_tag != wxString::npos) {
replaced += "{\\fs";
replaced += work.substr(pos,end_tag-pos);
replaced += "}";
total++;
}
}
}
// Replace whole tag
//Text = Text.substr(0,start) + replaced + Text.substr(end);
Text = Text.substr(0, start);
Text << replaced << Text.substr(end);
total++;
}
// Close tag
else {
// Find if it's italic, bold, underline, and strikeout
wxString prev = Text.Left(start);
bool isItalic=false,isBold=false,isUnder=false,isStrike=false;
if (CountMatches(prev,"{\\i1}") > CountMatches(prev,"{\\i0}")) isItalic = true;
if (CountMatches(prev,"{\\b1}") > CountMatches(prev,"{\\b0}")) isBold = true;
if (CountMatches(prev,"{\\u1}") > CountMatches(prev,"{\\u0}")) isUnder = true;
if (CountMatches(prev,"{\\s1}") > CountMatches(prev,"{\\s0}")) isStrike = true;
// Generate new tag, by reseting and then restoring flags
wxString replaced = "{\\r";
if (isItalic) replaced += "\\i1";
if (isBold) replaced += "\\b1";
if (isUnder) replaced += "\\u1";
if (isStrike) replaced += "\\s1";
replaced += "}";
// Replace
//Text = Text.substr(0,start) + replaced + Text.substr(end);
Text = Text.substr(0, start);
Text << replaced << Text.substr(end);
total++;
}
// Get next
work = Text;
work.UpperCase();
pos_open = work.find("<FONT",0);
pos_close = work.find("</FONT",0);
}
// Remove double tagging
Text.Replace("}{","");
}
void AssDialogue::ParseASSTags () {
ClearBlocks();

View file

@ -197,8 +197,6 @@ public:
bool Parse(wxString data,int version=1);
/// Parse text as ASS to generate block information
void ParseASSTags();
/// Parse text as SRT to generate block information
void ParseSRTTags();
/// Clear all blocks, ALWAYS call this after you're done processing tags
void ClearBlocks();

View file

@ -115,40 +115,6 @@ void AssTime::ParseASS (const wxString text) {
SetMS(tms + tm*60000 + th*3600000);
}
/// @brief Parses from SRT
/// @param _text Timestamp text; after trimming it is read at fixed offsets
///              as "hh:mm:ss,mmm" (2+2+2+3 digits, separators not checked)
///
/// A field that cannot be converted contributes 0 to the result.
///
void AssTime::ParseSRT (const wxString _text) {
	// Strip surrounding whitespace so the fixed offsets line up
	wxString text = _text;
	text.Trim(false);
	text.Trim(true);

	// Zero-initialised: ToLong() may leave its output untouched on failure,
	// and its return value was never checked, so a malformed field used to
	// yield an indeterminate (or the previous field's) value.
	long h = 0;
	long m = 0;
	long s = 0;
	long ms = 0;
	text.Mid(0,2).ToLong(&h);
	text.Mid(3,2).ToLong(&m);
	text.Mid(6,2).ToLong(&s);
	text.Mid(9,3).ToLong(&ms);

	// Set value
	SetMS(static_cast<int>(ms + s*1000 + m*60000 + h*3600000));
}
/// @brief AssTime conversion to/from milliseconds
/// @return
///
@ -187,55 +153,6 @@ wxString AssTime::GetASSFormated (bool msPrecision) const {
else return wxString::Format("%01i:%02i:%02i.%02i",h,m,s,ms/10);
}
/// @brief SRT formatted
/// @return The time as "hh:mm:ss,mmm"
///
/// Negative times are written as zero and times of ten hours or more are
/// clamped to 09:59:59,999. When UseMSPrecision is off the value is first
/// truncated to a multiple of 10 ms.
///
wxString AssTime::GetSRTFormated () {
	const int ms_per_second = 1000;
	const int ms_per_minute = 60000;
	const int ms_per_hour = 3600000;
	// Largest representable value, 9:59:59,999
	const int max_ms = 10*ms_per_hour - 1;

	int remaining = time;

	// Centisecond precision
	if (!UseMSPrecision) remaining = remaining/10*10;

	// Clamp the total up front. The previous code set m/s/ms to 59/59/999 on
	// overflow but then kept counting minutes and seconds on top of that,
	// producing fields such as minute 118 for times past ten hours.
	if (remaining < 0) remaining = 0;
	if (remaining > max_ms) remaining = max_ms;

	int h = remaining / ms_per_hour;
	remaining %= ms_per_hour;
	int m = remaining / ms_per_minute;
	remaining %= ms_per_minute;
	int s = remaining / ms_per_second;
	int ms = remaining % ms_per_second;

	return wxString::Format("%02i:%02i:%02i,%03i",h,m,s,ms);
}
/// @brief AssTime comparison
/// @param t1
/// @param t2

View file

@ -75,9 +75,7 @@ public:
int GetMS() const; // Returns milliseconds
void SetMS(int ms); // Sets values to milliseconds
void ParseASS(const wxString text); // Sets value to text-form time, in ASS format
void ParseSRT(const wxString text); // Sets value to text-form time, in SRT format
wxString GetASSFormated(bool ms=false) const; // Returns the ASS representation of time
wxString GetSRTFormated(); // Returns the SRT representation of time
};
// Comparison operators

View file

@ -42,13 +42,338 @@
#include "ass_dialogue.h"
#include "ass_file.h"
#include "ass_style.h"
#include "colorspace.h"
#include "compat.h"
#include "subtitle_format_srt.h"
#include "text_file_reader.h"
#include "text_file_writer.h"
DEFINE_SIMPLE_EXCEPTION(SRTParseError, SubtitleFormatParseError, "subtitle_io/parse/srt")
namespace {
class SrtTagParser {
struct FontAttribs {
wxString face;
wxString size;
wxString color;
};
enum TagType {
// leave 0 unused so indexing an unknown tag in the map won't clash
TAG_BOLD_OPEN = 1,
TAG_BOLD_CLOSE,
TAG_ITALICS_OPEN,
TAG_ITALICS_CLOSE,
TAG_UNDERLINE_OPEN,
TAG_UNDERLINE_CLOSE,
TAG_STRIKEOUT_OPEN,
TAG_STRIKEOUT_CLOSE,
TAG_FONT_OPEN,
TAG_FONT_CLOSE,
};
wxRegEx tag_matcher;
wxRegEx attrib_matcher;
std::map<wxString,TagType> tag_name_cases;
public:
SrtTagParser()
: tag_matcher("^(.*?)<(/?b|/?i|/?u|/?s|/?font)([^>]*)>(.*)$", wxRE_ICASE|wxRE_ADVANCED)
, attrib_matcher("^[[:space:]]+(face|size|color)=('[^']*'|\"[^\"]*\"|[^[:space:]]+)", wxRE_ICASE|wxRE_ADVANCED)
{
if (!tag_matcher.IsValid())
throw agi::InternalError("Parsing SRT: Failed compiling tag matching regex", 0);
if (!attrib_matcher.IsValid())
throw agi::InternalError("Parsing SRT: Failed compiling tag attribute matching regex", 0);
tag_name_cases["b"] = TAG_BOLD_OPEN;
tag_name_cases["/b"] = TAG_BOLD_CLOSE;
tag_name_cases["i"] = TAG_ITALICS_OPEN;
tag_name_cases["/i"] = TAG_ITALICS_CLOSE;
tag_name_cases["u"] = TAG_UNDERLINE_OPEN;
tag_name_cases["/u"] = TAG_UNDERLINE_CLOSE;
tag_name_cases["s"] = TAG_STRIKEOUT_OPEN;
tag_name_cases["/s"] = TAG_STRIKEOUT_CLOSE;
tag_name_cases["font"] = TAG_FONT_OPEN;
tag_name_cases["/font"] = TAG_FONT_CLOSE;
}
wxString ToAss(wxString srt)
{
int bold_level = 0;
int italics_level = 0;
int underline_level = 0;
int strikeout_level = 0;
std::vector<FontAttribs> font_stack;
wxString ass; // result to be built
while (!srt.empty())
{
if (!tag_matcher.Matches(srt))
{
// no more tags could be matched, end of string
ass.append(srt);
break;
}
// we found a tag, translate it
wxString pre_text = tag_matcher.GetMatch(srt, 1);
wxString tag_name = tag_matcher.GetMatch(srt, 2);
wxString tag_attrs = tag_matcher.GetMatch(srt, 3);
wxString post_text = tag_matcher.GetMatch(srt, 4);
// the text before the tag goes through unchanged
ass.append(pre_text);
// the text after the tag is the input for next iteration
srt = post_text;
switch (tag_name_cases[tag_name.Lower()])
{
case TAG_BOLD_OPEN:
if (bold_level == 0)
ass.append("{\\b1}");
bold_level++;
break;
case TAG_BOLD_CLOSE:
if (bold_level == 1)
ass.append("{\\b}");
if (bold_level > 0)
bold_level--;
break;
case TAG_ITALICS_OPEN:
if (italics_level == 0)
ass.append("{\\i1}");
italics_level++;
break;
case TAG_ITALICS_CLOSE:
if (italics_level == 1)
ass.append("{\\i}");
if (italics_level > 0)
italics_level--;
break;
case TAG_UNDERLINE_OPEN:
if (underline_level == 0)
ass.append("{\\u1}");
underline_level++;
break;
case TAG_UNDERLINE_CLOSE:
if (underline_level == 1)
ass.append("{\\u}");
if (underline_level > 0)
underline_level--;
break;
case TAG_STRIKEOUT_OPEN:
if (strikeout_level == 0)
ass.append("{\\s1}");
strikeout_level++;
break;
case TAG_STRIKEOUT_CLOSE:
if (strikeout_level == 1)
ass.append("{\\s}");
if (strikeout_level > 0)
strikeout_level--;
break;
case TAG_FONT_OPEN:
{
// new attributes to fill in
FontAttribs new_attribs;
FontAttribs old_attribs;
// start out with any previous ones on stack
if (font_stack.size() > 0)
{
old_attribs = font_stack.back();
}
new_attribs = old_attribs;
// now find all attributes on this font tag
while (attrib_matcher.Matches(tag_attrs))
{
// get attribute name and values
wxString attr_name = attrib_matcher.GetMatch(tag_attrs, 1);
wxString attr_value = attrib_matcher.GetMatch(tag_attrs, 2);
// clean them
attr_name.MakeLower();
if ((attr_value.StartsWith("'") && attr_value.EndsWith("'")) ||
(attr_value.StartsWith("\"") && attr_value.EndsWith("\"")))
{
attr_value = attr_value.Mid(1, attr_value.Len()-2);
}
// handle the attributes
if (attr_name == "face")
{
new_attribs.face = wxString::Format("{\\fn%s}", attr_value);
}
else if (attr_name == "size")
{
new_attribs.size = wxString::Format("{\\fs%s}", attr_value);
}
else if (attr_name == "color")
{
wxColour wxcl = html_to_color(attr_value);
wxString colorstr = AssColor(wxcl).GetASSFormatted(false, false, false);
new_attribs.color = wxString::Format("{\\c%s}", colorstr);
}
// remove this attribute to prepare for the next
size_t attr_pos, attr_len;
attrib_matcher.GetMatch(&attr_pos, &attr_len, 0);
tag_attrs.erase(attr_pos, attr_len);
}
// the attributes changed from old are then written out
if (new_attribs.face != old_attribs.face)
ass.append(new_attribs.face);
if (new_attribs.size != old_attribs.size)
ass.append(new_attribs.size);
if (new_attribs.color != old_attribs.color)
ass.append(new_attribs.color);
// lastly dump the new attributes state onto the stack
font_stack.push_back(new_attribs);
}
break;
case TAG_FONT_CLOSE:
{
// this requires a font stack entry
if (font_stack.empty())
break;
// get the current attribs
FontAttribs cur_attribs = font_stack.back();
// remove them from the stack
font_stack.pop_back();
// grab the old attributes if there are any
FontAttribs old_attribs;
if (font_stack.size() > 0)
old_attribs = font_stack.back();
// then restore the attributes to previous settings
if (cur_attribs.face != old_attribs.face)
{
if (old_attribs.face.empty())
ass.append("{\\fn}");
else
ass.append(old_attribs.face);
}
if (cur_attribs.size != old_attribs.size)
{
if (old_attribs.size.empty())
ass.append("{\\fs}");
else
ass.append(old_attribs.size);
}
if (cur_attribs.color != old_attribs.color)
{
if (old_attribs.color.empty())
ass.append("{\\c}");
else
ass.append(old_attribs.color);
}
}
break;
default:
// unknown tag, replicate it in the output
ass.append("<").append(tag_name).append(tag_attrs).append(">");
break;
}
}
// make it a little prettier, join tag groups
ass.Replace("}{", "", true);
return ass;
}
};
/// @brief Parse an SRT timestamp of the form [[[d:]h:]m:]s[,fraction]
/// @param ts Timestamp text
/// @return The parsed time
///
/// Digits accumulate into the seconds field and every ':' shifts the fields
/// one place up (s -> m -> h -> d). A ',' starts the fractional part, which is
/// scaled to milliseconds: fewer than three digits are padded with zeros,
/// digits beyond the third are dropped. Parsing stops at the first character
/// that does not fit.
AssTime ReadSRTTime(wxString const& ts)
{
	int d = 0, h = 0, m = 0, s = 0, ms = 0;
	int ms_chars = 0;
	bool has_fraction = false;

	size_t const len = ts.length();
	size_t ci = 0;

	// whole part: d:h:m:s
	for (; ci < len; ++ci)
	{
		char const ch = ts[ci];
		if (ch >= '0' && ch <= '9')
		{
			s = s * 10 + (ch - '0');
		}
		else if (ch == ':')
		{
			d = h;
			h = m;
			m = s;
			s = 0;
		}
		else
		{
			// only a comma continues the timestamp, anything else ends it
			has_fraction = (ch == ',');
			break;
		}
	}

	// fractional part
	if (has_fraction)
	{
		for (++ci; ci < len; ++ci)
		{
			char const ch = ts[ci];
			if (ch < '0' || ch > '9')
				break;
			// Only the first three digits matter. Accumulating all of them and
			// dividing afterwards gives the same value but overflows int on
			// long fractions.
			if (ms_chars < 3)
			{
				ms = ms * 10 + (ch - '0');
				ms_chars++;
			}
		}
	}

	// scale a short fraction up to milliseconds (",5" means 500 ms)
	for (; ms_chars < 3; ms_chars++)
		ms *= 10;

	AssTime res;
	res.SetMS(ms + 1000*(s + 60*(m + 60*(h + d*24))));
	return res;
}
/// @brief Format a time as an SRT timestamp, "hh:mm:ss,mmm"
/// @param ts Time to format
/// @return The formatted timestamp; hours take more than two digits if needed
wxString WriteSRTTime(AssTime const& ts)
{
	int time = ts.GetMS();

	// A negative value would make every % below negative and produce a
	// malformed timestamp; write it as zero instead.
	// NOTE(review): AssTime may already guarantee non-negative values - confirm.
	if (time < 0)
		time = 0;

	int const ms_part = time % 1000;
	time /= 1000; // now holds seconds
	int const s_part = time % 60;
	time /= 60; // now holds minutes
	int const m_part = time % 60;
	time /= 60; // now holds hours
	int const h_part = time;

	return wxString::Format("%02d:%02d:%02d,%03d", h_part, m_part, s_part, ms_part);
}
}
SRTSubtitleFormat::SRTSubtitleFormat()
: SubtitleFormat("SubRip")
{
@ -67,18 +392,19 @@ wxArrayString SRTSubtitleFormat::GetWriteWildcards() const {
void SRTSubtitleFormat::ReadFile(wxString const& filename, wxString const& encoding) {
using namespace std;
TextFileReader file(filename,encoding);
TextFileReader file(filename, encoding);
LoadDefault(false);
// See parsing algorithm at <http://devel.aegisub.org/wiki/SubtitleFormats/SRT>
// "hh:mm:ss,fff --> hh:mm:ss,fff" (e.g. "00:00:04,070 --> 00:00:10,04")
/// @todo: move the full parsing of SRT timestamps here, instead of having it in AssTime
wxRegEx timestamp_regex("^([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}) --> ([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3})");
wxRegEx timestamp_regex("^([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{1,}) --> ([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{1,})");
if (!timestamp_regex.IsValid())
throw agi::InternalError("Parsing SRT: Failed compiling regex", 0);
SrtTagParser tag_parser;
int state = 1;
int line_num = 0;
int linebreak_debt = 0;
@ -90,9 +416,8 @@ void SRTSubtitleFormat::ReadFile(wxString const& filename, wxString const& encod
switch (state) {
case 1:
{
// start of file, no subtitles found yet
if (text_line.IsEmpty())
if (text_line.empty())
// ignore blank lines
break;
if (text_line.IsNumber()) {
@ -100,44 +425,37 @@ void SRTSubtitleFormat::ReadFile(wxString const& filename, wxString const& encod
state = 2;
break;
}
if (timestamp_regex.Matches(text_line)) {
if (timestamp_regex.Matches(text_line))
goto found_timestamps;
}
char cvtbuf[16]; sprintf(cvtbuf, "%d", line_num);
throw SRTParseError(std::string("Parsing SRT: Expected subtitle index at line ") + cvtbuf, 0);
}
throw SRTParseError(STD_STR(wxString::Format("Parsing SRT: Expected subtitle index at line %d", line_num)), 0);
case 2:
{
// want timestamps
if (timestamp_regex.Matches(text_line) == false) {
if (timestamp_regex.Matches(text_line) == false)
// bad format
char cvtbuf[16]; sprintf(cvtbuf, "%d", line_num);
throw SRTParseError(std::string("Parsing SRT: Expected timestamp pair at line ") + cvtbuf, 0);
}
throw SRTParseError(STD_STR(wxString::Format("Parsing SRT: Expected timestamp pair at line %d", line_num)), 0);
found_timestamps:
if (line != 0) {
// finalise active line
line->ParseSRTTags();
if (line) {
// finalize active line
line->Text = tag_parser.ToAss(line->Text);
line = 0;
}
// create new subtitle
line = new AssDialogue();
line = new AssDialogue;
line->group = "[Events]";
line->Style = "Default";
line->Comment = false;
// this parsing should best be moved out of AssTime
line->Start.ParseSRT(timestamp_regex.GetMatch(text_line, 1));
line->End.ParseSRT(timestamp_regex.GetMatch(text_line, 2));
line->Start = ReadSRTTime(timestamp_regex.GetMatch(text_line, 1));
line->End = ReadSRTTime(timestamp_regex.GetMatch(text_line, 2));
// store pointer to subtitle, we'll continue working on it
Line->push_back(line);
// next we're reading the text
state = 3;
break;
}
case 3:
{
// reading first line of subtitle text
if (text_line.IsEmpty()) {
if (text_line.empty()) {
// that's not very interesting... blank subtitle?
state = 5;
// no previous line that needs a line break after
@ -147,11 +465,9 @@ found_timestamps:
line->Text.Append(text_line);
state = 4;
break;
}
case 4:
{
// reading following line of subtitle text
if (text_line.IsEmpty()) {
if (text_line.empty()) {
// blank line, next may begin a new subtitle
state = 5;
// previous line needs a line break after
@ -160,24 +476,21 @@ found_timestamps:
}
line->Text.Append("\\N").Append(text_line);
break;
}
case 5:
{
// blank line in subtitle text
linebreak_debt++;
if (text_line.IsEmpty()) {
if (text_line.empty())
// multiple blank lines in a row, just add a line break...
break;
}
if (text_line.IsNumber()) {
// must be a subtitle index!
// go for timestamps next
state = 2;
break;
}
if (timestamp_regex.Matches(text_line)) {
if (timestamp_regex.Matches(text_line))
goto found_timestamps;
}
// assume it's a continuation of the subtitle text
// resolve our line break debt and append the line text
while (linebreak_debt-- > 0)
@ -185,12 +498,8 @@ found_timestamps:
line->Text.Append(text_line);
state = 4;
break;
}
default:
{
char cvtbuf[16]; sprintf(cvtbuf, "%d", state);
throw agi::InternalError(std::string("Parsing SRT: Reached unexpected state ") + cvtbuf, 0);
}
throw agi::InternalError(STD_STR(wxString::Format("Parsing SRT: Reached unexpected state %d", state)), 0);
}
}
@ -198,10 +507,9 @@ found_timestamps:
throw SRTParseError("Parsing SRT: Incomplete file", 0);
}
if (line) {
// an unfinalised line
line->ParseSRTTags();
}
if (line)
// an unfinalized line
line->Text = tag_parser.ToAss(line->Text);
}
void SRTSubtitleFormat::WriteFile(wxString const& filename, wxString const& encoding) {
@ -225,8 +533,8 @@ void SRTSubtitleFormat::WriteFile(wxString const& filename, wxString const& enco
int i=1;
for (std::list<AssEntry*>::iterator cur=Line->begin();cur!=Line->end();cur++) {
if (AssDialogue *current = dynamic_cast<AssDialogue*>(*cur)) {
file.WriteLineToFile(wxString::Format("%i", i++));
file.WriteLineToFile(current->Start.GetSRTFormated() + " --> " + current->End.GetSRTFormated());
file.WriteLineToFile(wxString::Format("%d", i++));
file.WriteLineToFile(WriteSRTTime(current->Start) + " --> " + WriteSRTTime(current->End));
file.WriteLineToFile(current->Text);
file.WriteLineToFile("");
}