2008-01-15 01:18:49 +00:00
|
|
|
#!/usr/bin/perl
|
|
|
|
|
|
|
|
########################
|
|
|
|
#
|
|
|
|
# aegisub_convert_docs.pl - downloads and converts the Aegisub documentation wiki to static HTML.
|
|
|
|
#
|
|
|
|
# Usage: aegisub_convert_docs.pl
|
|
|
|
# Will write the entire wiki to the current directory.
|
|
|
|
#
|
|
|
|
# Warning: ugly hacking inside.
|
|
|
|
#
|
|
|
|
#########
|
|
|
|
#
|
2008-01-15 04:55:16 +00:00
|
|
|
# Written by Karl Blomster (TheFluff) 2008
|
|
|
|
# This script is hereby given into the public domain. Do whatever you want with it.
|
2008-01-15 01:18:49 +00:00
|
|
|
#
|
|
|
|
########################
|
|
|
|
|
|
|
|
|
|
|
|
# includes
|
|
|
|
use warnings;
|
|
|
|
use strict;
|
|
|
|
use utf8;
|
|
|
|
use LWP;
|
|
|
|
use URI;
|
|
|
|
# use CSS; # <--- fucking useless
|
|
|
|
use HTML::LinkExtor;
|
|
|
|
use File::Path;
|
|
|
|
|
|
|
|
|
|
|
|
# variables
my $base_dir      = '.';                             # where the static copy is written
my $host          = 'http://aegisub.cellosoft.com';
my $docs_base_url = $host . '/docs/';                # everything below this URL is "the wiki"
my $agent         = LWP::UserAgent->new();

# work queue of URLs still to fetch; start at the main page
my @process_pages = ($docs_base_url . 'Main_Page');

# local filename => URL for everything already queued (used for de-duplication)
my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");

# content types we know how to store
my %accepted_types = (
    'image/png'       => '.png',
    'image/jpeg'      => '.jpg',
    'image/gif'       => '.gif',
    'text/html'       => '.html',
    'text/css'        => '.css',
    'text/javascript' => '.js',
    'text/plain'      => '.txt',
);

my $timestamp = time();

# statistics for the final summary; start at zero so the summary never has
# to print an undefined value when nothing was fetched or written
my ($requests, $writes) = (0, 0);

# make sure we have somewhere to write to
unless ( -d $base_dir ) {
    mkdir($base_dir) or die("Couldn't create directory ", $base_dir, ": $!");
}
chdir($base_dir) or die("Couldn't change directory to ", $base_dir, ": $!");

print("Starting downloading and conversion of documentation wiki to ", $base_dir, "\n",
    "This will probably take a while.\n");
|
|
|
|
|
|
|
|
|
|
|
|
# Main crawl loop: take URLs off the queue one at a time, download them,
# store them locally and queue whatever new on-site resources they refer to.
GETLOOP:
while ( @process_pages ) {
    my $current_page = shift(@process_pages);

    # announce the request before making it, since the download may block
    print("Requesting ${current_page}...\n");
    my $page_object = $agent->get($current_page);

    # warn and skip if we couldn't get it
    unless ( $page_object->is_success() ) {
        warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n");
        next(GETLOOP);
    }

    my $content_type = $page_object->content_type();

    # skip if we don't know what it is
    unless ( exists($accepted_types{$content_type}) ) {
        warn("I don't know what to do with ", $content_type, ", skipping\n");
        next(GETLOOP);
    }

    # monstrously ugly hack to handle rewriting of .css filenames
    my $name = ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i )
        ? convert_css_link($current_page)
        : convert_link($current_page);

    # resources referenced by this document that may still need fetching
    my @newpages;

    if ( $content_type eq 'text/html' ) {
        # parse the page, rewrite its links and collect what it refers to
        my ($filename, $content, @found) =
            parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content());

        # skip this page if the parser decided it was a page we don't want
        next(GETLOOP) unless ($content);

        write_to_disk($filename, $content_type, $content);
        @newpages = @found;
    }
    elsif ( $content_type eq 'text/css' ) {
        # parse_css() returns the local filename first and the links after it;
        # only the links belong on the queue, so drop the first element
        (undef, @newpages) =
            parse_css($current_page, $page_object->base(), $page_object->decoded_content());

        write_to_disk($name, $content_type, $page_object->decoded_content());
    }
    else {
        # images, scripts and plain text are stored as they are
        write_to_disk($name, $content_type, $page_object->decoded_content());
    }

    # queue everything we have not seen before
    foreach my $url (@newpages) {
        my $newname = convert_link($url);

        # already on the todo-list (or done), nothing to do
        next() if ( exists($pages{$newname}) );

        push(@process_pages, $url);
        $pages{$newname} = $url;
    }
}
continue {
    # count every attempted request, including failed and skipped ones
    $requests++;
}

# "|| 0" keeps the summary free of undefined values when nothing was
# fetched or written
print("Done.\nMade ", ($requests || 0), " requests and wrote ", ($writes || 0), " files in ",
    time()-$timestamp, " seconds.\n");

exit(0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##########################################
|
|
|
|
|
2008-01-15 04:49:01 +00:00
|
|
|
|
|
|
|
# Parse one downloaded wiki page, strip the parts that make no sense in a
# static copy, rewrite on-site links to local filenames and collect the
# on-site resources the page refers to.
#
# Arguments:
#   $url     - URL the page was fetched from
#   $base    - base URL used to resolve relative links
#   $content - the page's HTML
#
# Returns a list: the local filename of the page (as given by convert_link),
# the modified HTML, and then the absolute URIs of everything to fetch next.
sub parse_and_fix_html {
    my ($url, $base, $content) = @_;
    my (@links, @links_to_modify); # things to fetch / things to rewrite in $content

    # a link is "ours" if it starts with the docs URL or is host-relative
    my $quoted = quotemeta($docs_base_url);
    my $is_internal = sub {
        my $target = shift(@_);
        return(1) if ( $target =~ m!^$quoted!i );
        return(1) if ( substr($target,0,1) eq '/' );
        return(0);
    };

    # parse HTML
    my $html = HTML::LinkExtor->new();
    $html->parse($content);
    $html->eof();

    # loop over the list of links
    # all real work is done here
    LINKLIST:
    foreach my $tag ( $html->links() ) {
        my ($tagname, %attrs) = @{$tag};

        if ( ($tagname eq 'a') and exists($attrs{'href'}) ) {
            my $target = $attrs{'href'};

            # pattern that finds this href in the raw (still entity-encoded) HTML
            my $href = _raw_attr_pattern($target);

            # skip and kill special or "edit" links, keeping only the link text;
            # [^>]* keeps the match inside a single <a ...> tag so a neighbouring
            # link on the same line is never swallowed
            if ( $target =~ m!docs/index\.php\?!i ) {
                $content =~ s!<a\b[^>]*?\shref=\"${href}\"[^>]*>(.*?)</a>!$1!gis;
                next(LINKLIST);
            }

            # skip and kill image/special links
            if ( $target =~ m!(Special\:|Image\:|Talk\:)!i ) {
                $content =~ s!<a\b[^>]*?\shref=\"${href}\"[^>]*>(.*?)</a>!$1!gis;
                next(LINKLIST);
            }

            # somepage#anchor links: point them at the right local document and
            # make sure the page itself (without the fragment) gets downloaded.
            # Off-site links are left exactly as they are.
            if ( $target =~ m!.+\#(.*?)$! ) {
                next(LINKLIST) unless ( $is_internal->($target) );

                push(@links_to_modify, $target);

                my $page = URI->new_abs($target, $base);
                $page->fragment(undef);
                push(@links, $page);

                next(LINKLIST);
            }

            # is not relative and goes somewhere else than aegisub.cellosoft.com
            # so we're not interested in it (pure #anchor links end up here too)
            next(LINKLIST) unless ( $is_internal->($target) );

            push(@links_to_modify, $target);
            push(@links, URI->new_abs($target, $base));
        }
        elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) {
            next(LINKLIST) unless ( $is_internal->($attrs{'href'}) );

            push(@links_to_modify, $attrs{'href'});
            push(@links, URI->new_abs($attrs{'href'}, $base));
        }
        elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) {
            # bogus link, skip it
            next(LINKLIST) if ( $attrs{'src'} =~ m!index\.php\?title=-!i );

            next(LINKLIST) unless ( $is_internal->($attrs{'src'}) );

            push(@links_to_modify, $attrs{'src'});
            push(@links, URI->new_abs($attrs{'src'}, $base));
        }
        elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) {
            next(LINKLIST) unless ( $is_internal->($attrs{'src'}) );

            # "flatten" image links: drop the hashed subdirectories below
            # /docs/images/ (write_to_disk stores the files the same way)
            my $flatlink = $attrs{'src'};
            $flatlink =~ s!/docs/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
            my $quotedsrc = quotemeta($attrs{'src'});
            $content =~ s!$quotedsrc!$flatlink!;

            push(@links, URI->new_abs($attrs{'src'}, $base));
        }
        # else do nothing
    }

    # kill the topbar
    $content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;

    # kill the article/discussion/history thing
    $content =~ s!<div id=\"p-cactions\".*?</div>!!s;

    # kill the "toolbox" at the bottom left
    $content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;

    # kill "recent changes"
    $content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;

    # handle the @import links to get the css right
    while ( $content =~ m!\@import \"(.+?)\";!mg ) {
        my $importlink = $1;

        if ( convert_css_link($importlink) ) {
            push(@links, URI->new_abs($importlink, $base));
            # the leading @ tells convert_link to treat it as a stylesheet
            push(@links_to_modify, '@' . $importlink);
        }
    }

    # rewrite all the links
    foreach my $link (@links_to_modify) {
        my $converted = convert_link($link);

        # the @ marker is ours, it is not part of the text in the document
        my $raw = $link;
        $raw =~ s!^\@!!;

        my $pattern = quotemeta($raw);
        $content =~ s!\"$pattern\"!\"$converted\"!g;
    }

    my $filename = convert_link($url);

    return($filename, $content, @links);
}

# Build a regex fragment that matches an attribute value as it appears in the
# raw HTML. The link extractor hands us values with entities decoded, so an
# ampersand in the value is "&amp;" (or, in sloppy markup, a bare "&") in the
# document text. Other entities are not handled.
sub _raw_attr_pattern {
    my $value = shift(@_);

    return(join('&(?:amp;)?', map { quotemeta($_) } split(/&/, $value, -1)));
}
|
|
|
|
|
|
|
|
|
|
|
|
# Store one downloaded resource below the current directory.
#
# Arguments:
#   $path  - local path to write to (as produced by convert_link)
#   $type  - MIME content type; anything starting with "text" is written
#            through write_text, everything else through write_bin
#   $thing - the content to write
#
# Creates the directory part of $path if it is missing and increments the
# global $writes counter. Dies (via the writers) if the file can't be written.
sub write_to_disk {
    my ($path, $type, $thing) = @_;
    # return() if ( -e $path ); # this was a dumb idea

    # split into directory and extension-less filename; never rely on capture
    # variables from a match that may have failed
    my ($tree, $filename);
    if ( $path =~ m!(.*)/(.*?)\.\w{2,4}$! ) {
        ($tree, $filename) = ($1, $2);
    }
    elsif ( $path =~ m!(.*)/(.*)$! ) {
        # no recognisable extension, keep the whole last component
        ($tree, $filename) = ($1, $2);
    }
    else {
        # no directory part at all
        ($tree, $filename) = ('.', $path);
    }

    # is it an image link?
    if ( $tree =~ m!\./images/! ) {
        # hax it: images are stored flat, without the hashed subdirectories
        $path =~ s!/images/.+/!/images/!i;
        $tree =~ s!/images.+!/images!i;
    }

    # I don't think this is necessary really
    mkpath($tree) if ( length($tree) and not -d $tree );

    print("Writing $filename to ${path}...\n");

    if ( $type =~ m!^text! ) {
        write_text($path, $thing);
    }
    else {
        write_bin($path, $thing);
    }

    $writes++;
}
|
|
|
|
|
|
|
|
|
|
|
|
# Write a text string to $outfile, encoded as UTF-8, replacing any existing
# file.
#
# Arguments:
#   $outfile - path of the file to write
#   $thing   - the (character) string to write
#
# Returns nothing. Dies if the file can't be opened, written or closed.
sub write_text {
    my ($outfile, $thing) = @_;

    # lexical handle: closed automatically on scope exit and not shared with
    # any other code the way a global bareword handle would be
    open(my $out, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!");
    print {$out} $thing or die("Couldn't write to ${outfile}: $!");
    close($out) or die("Couldn't close ${outfile}: $!");

    return();
}
|
|
|
|
|
|
|
|
|
|
|
|
# Write $thing to $outfile byte for byte, replacing any existing file.
#
# Arguments:
#   $outfile - path of the file to write
#   $thing   - the data to write
#
# Returns nothing. Dies if the file can't be opened, written or closed.
sub write_bin {
    my ($outfile, $thing) = @_;

    # lexical handle instead of a global bareword one
    open(my $out, ">", $outfile) or die("Couldn't open $outfile for writing: $!");
    binmode($out); # no newline translation or encoding layers
    print {$out} $thing or die("Couldn't write to ${outfile}: $!");
    close($out) or die("Couldn't close ${outfile}: $!");

    return();
}
|
|
|
|
|
|
|
|
|
|
|
|
# Converts a link into the local path it is stored under, relative to and
# starting with $base_dir.
#
# Argument: the link, either as a string or as a reference to one (this is
# how the URI objects built elsewhere in this script are handled). A leading
# "@" marks a stylesheet @import link, which is handed to convert_css_link.
#
# Returns the converted path; for "@" links whatever convert_css_link returns.
sub convert_link {
    my $link = shift(@_);

    # dereference if necessary (works on a copy, the caller's value is untouched)
    if ( ref($link) ) {
        $link = $$link;
    }

    # SPECIAL CASE: it's one of those @import links, do something else with it
    if ( substr($link,0,1) eq '@' ) {
        substr($link,0,1) = '';
        return(convert_css_link($link));
    }

    # is it relative?
    if ( substr($link,0,1) eq '/' ) {
        $link =~ s!^/docs/!$base_dir/!i;
    }
    else {
        my $quoted = quotemeta($host);
        $link =~ s!${quoted}/docs/!$base_dir/!i;
    }

    # if it doesn't have a filename extension it's probably a page,
    # and then we need to tack on .html to the end
    # oh and jfs's .lua pages aren't really lua scripts either
    my $bdirquoted = quotemeta($base_dir);
    if ( $link =~ m/^(${bdirquoted}.+?)\#.*$/ ) {
        # page#anchor: the extension goes between the page name and the anchor
        my $pagename = $1;

        # a name ending in .lua does not end in .html either, so this one
        # test covers both cases
        if ( $pagename !~ m!\.html$! ) {
            # insert by position instead of by regex, so page names containing
            # regex metacharacters (+, parentheses, ...) are handled too
            substr($link, length($pagename), 0, '.html');
        }
    } elsif ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
        $link = $link . '.html';
    }

    $link =~ s!\:!_!g; # replace : with _
    $link =~ s!\?!_!g; # replace ? with _

    return($link);
}
|
|
|
|
|
|
|
|
|
|
|
|
# Works out the local path for a stylesheet link.
#
# Argument: the link as found in the document.
#
# Returns the converted local path, or undef when the link does not look
# like a stylesheet at all.
sub convert_css_link {
    my ($candidate) = @_;

    # wiki-generated stylesheet (MediaWiki:Something.css): store it under
    # its own name directly below the docs root
    if ( $candidate =~ m!MediaWiki:(.+?)\.css!i ) {
        my $sheet = $1;
        return convert_link('/docs/' . $sheet . '.css');
    }

    # an ordinary .css file already has a sensible name, convert it as it is
    return convert_link($candidate) if ( $candidate =~ m!/(.+?)\.css$!i );

    # doesn't seem like anything useful
    return undef;
}
|
|
|
|
|
|
|
|
|
|
|
|
# Collects the resources a stylesheet refers to through url(...) values.
#
# Arguments:
#   $url     - URL the stylesheet was fetched from
#   $base    - base URL used to resolve relative references
#   $content - the stylesheet text
#
# Returns a list: first the local filename of the stylesheet (as given by
# convert_link), then one absolute URI object per reference found.
sub parse_css {
    my ($url, $base, $content) = @_;
    my @links;

    # find url(stuff), url("stuff") and url('stuff') blocks; the closing
    # quote must be the same as the opening one (or absent if there was none)
    LINKLIST:
    while ( $content =~ m!url\(\s*([\"\']?)(.+?)\1\s*\)!gi ) {
        my $text = $2;

        # there shouldn't be any nonrelative links in there anyway,
        # so anything absolute is simply skipped
        next(LINKLIST) if ( $text =~ m!^http!i );

        push(@links, URI->new_abs($text, $base));
    }

    my $filename = convert_link($url);

    return($filename, @links);
}
|