Modify aegisub_convert_docs.pl to produce a static mirror suitable for serving on a web site
This commit is contained in:
parent
1c9af767e5
commit
598d3d77e6
1 changed files with 107 additions and 96 deletions
101
docs/wiki_convert/aegisub_convert_docs.pl
Normal file → Executable file
101
docs/wiki_convert/aegisub_convert_docs.pl
Normal file → Executable file
|
@ -30,12 +30,13 @@ use File::Path;
|
|||
|
||||
# variables
|
||||
my $base_dir = '.';
|
||||
my $host = 'http://aegisub.cellosoft.com';
|
||||
my $docs_base_url = 'http://aegisub.cellosoft.com/docs/';
|
||||
my $host = 'http://docs.aegisub.org';
|
||||
my $docs_base_url = 'http://docs.aegisub.org/manual/';
|
||||
my $agent = LWP::UserAgent->new();
|
||||
my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
|
||||
my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");
|
||||
my %pages = ($base_dir . '/Main_Page' => $docs_base_url . "Main_Page");
|
||||
my %accepted_types = (
|
||||
'application/javascript' => '.js',
|
||||
'image/png' => '.png',
|
||||
'image/jpeg' => '.jpg',
|
||||
'image/gif' => '.gif',
|
||||
|
@ -79,16 +80,14 @@ while ( @process_pages ) {
|
|||
next(GETLOOP);
|
||||
}
|
||||
|
||||
|
||||
# monstrously ugly hack to handle rewriting of .css filenames
|
||||
my $name;
|
||||
if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) {
|
||||
if ( $current_page =~ m!css$!i ) {
|
||||
$name = convert_css_link($current_page);
|
||||
} else {
|
||||
$name = convert_link($current_page);
|
||||
}
|
||||
|
||||
|
||||
# if it's html, parse it, grab new links
|
||||
# add them to @process_pages if they're not already in there
|
||||
# then write the modified html to disk
|
||||
|
@ -159,14 +158,26 @@ sub parse_and_fix_html {
|
|||
my ($url, $base, $content) = @_;
|
||||
my (@links, @links_to_modify); # list of links to return later
|
||||
|
||||
# strip RSS etc.
|
||||
$content =~ s!<link rel=[^>]*xml[^>]* />!!gi;
|
||||
|
||||
# kill the topbar
|
||||
$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
|
||||
|
||||
# kill the article/discussion/history thing
|
||||
$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
|
||||
|
||||
# kill the "toolbox" at the bottom left
|
||||
$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
|
||||
|
||||
# kill "recent changes"
|
||||
$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;
|
||||
|
||||
# parse HTML
|
||||
my $html = HTML::LinkExtor->new();
|
||||
# $html->utf8_mode(1); # not needed
|
||||
$html->parse($content);
|
||||
$html->eof();
|
||||
|
||||
|
||||
# loop over the list of links
|
||||
# all real work is done here
|
||||
LINKLIST:
|
||||
|
@ -181,7 +192,7 @@ sub parse_and_fix_html {
|
|||
$href =~ s!\&!\&\;!g; # quotemeta kills & things for some reason
|
||||
|
||||
# skip and kill special or "edit" links
|
||||
if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) {
|
||||
if ( $attrs{'href'} =~ m!index\.php\?!i ) {
|
||||
$content =~ s!<a href=\"${href}\".*?>(.*?)</a>!$1!gi;
|
||||
next(LINKLIST);
|
||||
}
|
||||
|
@ -236,10 +247,10 @@ sub parse_and_fix_html {
|
|||
if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
|
||||
# "flatten" image links
|
||||
my $flatlink = $attrs{'src'};
|
||||
$flatlink =~ s!/docs/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
|
||||
$flatlink =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
|
||||
$flatlink =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
|
||||
my $quotedsrc = quotemeta($attrs{'src'});
|
||||
$content =~ s!$quotedsrc!$flatlink!;
|
||||
#push(@links_to_modify, $attrs{'src'});
|
||||
}
|
||||
else { next(LINKLIST); }
|
||||
|
||||
|
@ -248,19 +259,6 @@ sub parse_and_fix_html {
|
|||
# else do nothing
|
||||
}
|
||||
|
||||
|
||||
# kill the topbar
|
||||
$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
|
||||
|
||||
# kill the article/discussion/history thing
|
||||
$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
|
||||
|
||||
# kill the "toolbox" at the bottom left
|
||||
$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
|
||||
|
||||
# kill "recent changes"
|
||||
$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;
|
||||
|
||||
# handle the @import links to get the css right
|
||||
while ( $content =~ m!\@import \"(.+?)\";!mg ) {
|
||||
my $importlink = $1;
|
||||
|
@ -282,10 +280,10 @@ sub parse_and_fix_html {
|
|||
$content =~ s!\"$link\"!\"$converted\"!g;
|
||||
}
|
||||
|
||||
$url =~ s!manual_real!manual!;
|
||||
$url =~ s!http://docs.aegisub.org!!;
|
||||
|
||||
my $filename = convert_link($url);
|
||||
|
||||
return($filename, $content, @links);
|
||||
return($url, $content, @links);
|
||||
}
|
||||
|
||||
|
||||
|
@ -293,24 +291,17 @@ sub write_to_disk {
|
|||
my ($path, $type, $thing) = @_;
|
||||
# return() if ( -e $path ); # this was a dumb idea
|
||||
|
||||
$path =~ m!(.*)/(.*?)\.\w{2,4}$!;
|
||||
$path =~ m!/(.*)/(.*?)$!;
|
||||
my ($tree, $filename) = ($1, $2);
|
||||
|
||||
# is it an image link?
|
||||
if ( $tree =~ m!\./images/! ) {
|
||||
# hax it
|
||||
$path =~ s!/images/.+/!/images/!i;
|
||||
$tree =~ s!/images.+!/images!i;
|
||||
}
|
||||
|
||||
# I don't think this is necessary really
|
||||
mkpath($tree) unless ( -e $tree and -d $tree );
|
||||
|
||||
if ( $type =~ m!^text! ) {
|
||||
write_text($path, $thing);
|
||||
write_text('.' . $path, $thing);
|
||||
}
|
||||
else {
|
||||
write_bin($path, $thing);
|
||||
write_bin('.' . $path, $thing);
|
||||
}
|
||||
|
||||
print("Writing $filename to ${path}...\n");
|
||||
|
@ -356,6 +347,21 @@ sub convert_link {
|
|||
substr($link,0,1) = '';
|
||||
return(convert_css_link($link));
|
||||
}
|
||||
elsif ($link =~ /\.css$/) {
|
||||
return(convert_css_link($link));
|
||||
}
|
||||
elsif ($link =~ /\.js/) {
|
||||
return(convert_js_link($link));
|
||||
}
|
||||
|
||||
$link =~ s!http://docs.aegisub.org!!;
|
||||
$link =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
|
||||
$link =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
|
||||
|
||||
$link =~ s!manual_real/skins/.*?!manual/!;
|
||||
|
||||
#print("link: $link\n");
|
||||
return($link);
|
||||
|
||||
# is it relative?
|
||||
if ( substr($link,0,1) eq '/' ) {
|
||||
|
@ -385,23 +391,30 @@ sub convert_link {
|
|||
return($link);
|
||||
}
|
||||
|
||||
|
||||
# HAX
|
||||
sub convert_css_link {
|
||||
my $link = shift(@_);
|
||||
|
||||
# does it seem like css?
|
||||
if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
|
||||
return(convert_link('/docs/' . $1 . '.css'));
|
||||
return("/manual/css/$1.css");
|
||||
}
|
||||
# has a sensible name already, don't fuck with it
|
||||
elsif ( $link =~ m!/(.+?)\.css$!i ) {
|
||||
return(convert_link($link));
|
||||
elsif ( $link =~ m!/([^/]+?)\.css$!i ) {
|
||||
return("/manual/css/$1.css");
|
||||
}
|
||||
# doesn't seem like anything useful
|
||||
else { return(undef); }
|
||||
else {
|
||||
print("UNKNOWN CSS: $link\n");
|
||||
return(undef);
|
||||
}
|
||||
}
|
||||
|
||||
# Map a link to a JavaScript file onto its location in the static mirror.
#
# Takes one argument: the link (absolute URL or site-relative path).
# The file name is the first path component (the text after a '/') that
# contains ".js"; anything following ".js" in that component, such as a
# "?207" cache-busting suffix, is dropped.
#
# Returns "/manual/js/<name>.js" on success.  If the link has no such
# component, prints an "UNKNOWN JS" diagnostic and returns undef, in the
# same way convert_css_link reports CSS links it cannot interpret.
sub convert_js_link {
    my $link = shift(@_);

    # The match must be checked before $1 is used: after a failed match
    # $1 still holds whatever an earlier successful match in the caller
    # captured, which would silently produce the path of a different file.
    if ( $link =~ m!/([^/]+\.js)! ) {
        return("/manual/js/$1");
    }

    print("UNKNOWN JS: $link\n");
    return(undef);
}
|
||||
|
||||
# argh
|
||||
sub parse_css {
|
||||
|
@ -424,7 +437,5 @@ sub parse_css {
|
|||
push(@links, URI->new_abs($text, $base));
|
||||
}
|
||||
|
||||
my $filename = convert_link($url);
|
||||
|
||||
return($filename, @links);
|
||||
return(@links);
|
||||
}
|
Loading…
Reference in a new issue