Added a Perl script that converts the online wiki manual to static HTML. Still has a few annoying parsing bugs but it's mostly complete.
Originally committed to SVN as r1724.
This commit is contained in:
parent
3ce6d62f4e
commit
04cbb3a74f
1 changed file with 387 additions and 0 deletions
387
docs/aegisub_convert_docs.pl
Normal file
@@ -0,0 +1,387 @@
#!/usr/bin/perl

########################
#
# aegisub_convert_docs.pl - downloads and converts the Aegisub documentation wiki to static HTML.
#
# Usage: aegisub_convert_docs.pl
# Will write the entire wiki to the current directory.
#
# Warning: ugly hacking inside.
#
#########
#
# No copyright, no license.
#
########################


# includes
use warnings;
use strict;
use utf8;
use LWP;
use URI;
# use CSS; # <--- useless
use HTML::LinkExtor;
use File::Path;


# variables
my $base_dir = '.';
my $host = 'http://aegisub.cellosoft.com';
my $docs_base_url = 'http://aegisub.cellosoft.com/docs/';
my $agent = LWP::UserAgent->new();
my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");
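# %pages maps local output filename -> source URL; keying on the converted
# filename is what lets the main loop skip pages that are reachable through
# more than one URL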
my %accepted_types = (
	'image/png'       => '.png',
	'image/jpeg'      => '.jpg',
	'image/gif'       => '.gif',
	'text/html'       => '.html',
	'text/css'        => '.css',
	'text/javascript' => '.js',
	'text/plain'      => '.txt',
);
my $timestamp = time();
my ($requests, $writes) = (0, 0); # start the counters at zero so the summary never prints undef


# make sure we have somewhere to write to
mkdir($base_dir) or die("Couldn't create directory ", $base_dir, ": $!")
	unless ( -e $base_dir and -d $base_dir );
chdir($base_dir) or die("Couldn't change directory to ", $base_dir, ": $!");


print("Starting downloading and conversion of documentation wiki to ", $base_dir, ".\n",
	"This will probably take a while.\n");


GETLOOP:
while ( @process_pages ) {
	my $current_page = shift(@process_pages);

	# initialize object and download the page
	my $page_object = $agent->get($current_page);
	print("Requesting ${current_page}...\n");

	# warn and skip if we couldn't get it
	unless ( $page_object->is_success() ) {
		warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n");
		next(GETLOOP);
	}

	my $content_type = $page_object->content_type();

	# skip if we don't know what it is
	unless ( exists($accepted_types{$content_type}) ) {
		warn("I don't know what to do with ", $content_type, ", skipping\n");
		next(GETLOOP);
	}

	# monstrously ugly hack to handle rewriting of .css filenames
	my $name;
	if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) {
		$name = convert_css_link($current_page);
	} else {
		$name = convert_link($current_page);
	}
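	# Illustration (hypothetical URLs, comment only): a stylesheet served as
	#   .../docs/index.php?title=MediaWiki:Common.css&action=raw
	# should come out as './Common.css' via convert_css_link(), while an ordinary
	# page like .../docs/Main_Page becomes './Main_Page.html' via convert_link().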

	# if it's html, parse it, grab new links,
	# add them to @process_pages if they're not already in there,
	# then write the modified html to disk
	if ( $content_type eq 'text/html' ) {
		my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content());

		# skip this page if the parser decided it was a page we don't want
		next(GETLOOP) unless ($content);

		write_to_disk($filename, $content_type, $content);

		foreach my $url (@newpages) {
			my $newname = convert_link($url);
			# check if we already added that page to our todo-list
			if ( exists($pages{$newname}) ) {
				next(); # we did, do nothing
			}
			else {
				# new page, add it to the list of things to process
				push(@process_pages, $url);
				$pages{$newname} = $url;
			}
		}
	}
	# if it's CSS we need the @import'ed links
	elsif ( $content_type eq 'text/css' ) {
		# parse_css() returns the converted filename first; discard it here so a
		# local path doesn't end up on the download queue as if it were a URL
		my (undef, @newpages) = parse_css($current_page, $page_object->base(), $page_object->decoded_content());

		write_to_disk($name, $content_type, $page_object->decoded_content());

		foreach my $url (@newpages) {
			my $newname = convert_link($url);
			# check if we already added that page to our todo-list
			if ( exists($pages{$newname}) ) {
				next(); # we did, do nothing
			}
			else {
				# new page, add it to the list of things to process
				push(@process_pages, $url);
				$pages{$newname} = $url;
			}
		}
	}
	else {
		write_to_disk($name, $content_type, $page_object->decoded_content());
	}
}
continue {
	$requests++;
}
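# note: the continue block runs even when an iteration bails out early with
# next(GETLOOP), so $requests counts every attempted fetch, not just the
# successful ones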


print("Done.\nMade $requests requests and wrote $writes files in ",
	time()-$timestamp, " seconds.\n");

exit(0);


##########################################


sub parse_and_fix_html {
	# parse out the pages this page links to, modify the html,
	# then return the filename of the page, the modified html and the list of links

	# get arguments
	my ($url, $base, $content) = @_;
	my (@links, @links_to_modify); # lists of links to return later


	# parse HTML
	my $html = HTML::LinkExtor->new();
	# $html->utf8_mode(1); # not needed
	$html->parse($content);
	$html->eof();


	# loop over the list of links
	# all real work is done here
	LINKLIST:
	foreach my $tag ( $html->links() ) {
		my ($tagname, %attrs) = @{$tag};

		my $quoted = quotemeta($host);

		# does the link interest us?
		if ( ($tagname eq 'a') and exists($attrs{'href'}) ) {
			my $href = quotemeta($attrs{'href'});
			$href =~ s!\&!\&amp\;!g; # quotemeta leaves bare &'s; the raw html has &amp;

			# skip and kill special or "edit" links
			if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) {
				$content =~ s!<a(.*?)href\=\"${href}\"(.*?)>(.*?)</a>!$3!gi;
				next(LINKLIST);
			}
			# skip and kill image/special links
			if ( $attrs{'href'} =~ m!(Special\:|Image\:|Talk\:)!i ) {
				$content =~ s!<a.*?href\=\"${href}\".*?>(.*?)</a>!$1!gi;
				next(LINKLIST);
			}
			# don't process #anchor links
			if ( $attrs{'href'} =~ m!\#(.*?)$! ) {
				next(LINKLIST);
			}

			# does it go within aegisub.cellosoft.com?
			if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
				push(@links_to_modify, $attrs{'href'});
			}
			# it's not relative and points somewhere other than aegisub.cellosoft.com,
			# so we're not interested in it
			else { next(LINKLIST); }

			push(@links, URI->new_abs($attrs{'href'}, $base));
		}
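		# e.g. (hypothetical markup) <a href="/docs/index.php?title=Foo&action=edit">edit</a>
		# is unwrapped to the bare text "edit" by the first kill-substitution above,
		# and Special:/Image:/Talk: links lose their anchor tags the same way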
		elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) {
			if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
				push(@links_to_modify, $attrs{'href'});
			}
			else { next(LINKLIST); }

			push(@links, URI->new_abs($attrs{'href'}, $base));
		}
		elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) {
			my $src = quotemeta($attrs{'src'}); # (kept for symmetry; not used below)

			# bogus link, skip it
			if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) {
				next(LINKLIST);
			}

			if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
				push(@links_to_modify, $attrs{'src'});
			}
			else { next(LINKLIST); }

			push(@links, URI->new_abs($attrs{'src'}, $base));
		}
		elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) {
			if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
				push(@links_to_modify, $attrs{'src'});
			}
			else { next(LINKLIST); }

			push(@links, URI->new_abs($attrs{'src'}, $base));
		}
		# else do nothing
	}


	# kill the topbar
	$content =~ s!\<div id=\"topbar\".*?\<\!-- end topbar --\>!!s;


	# handle the @import links to get the css right
	while ( $content =~ m!\@import \"(.+?)\";!mg ) {
		my $importlink = $1;

		if ( convert_css_link($importlink) ) {
			push(@links, URI->new_abs($importlink, $base));
			push(@links_to_modify, '@' . $importlink);
		}
	}
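	# e.g. a line like  @import "/docs/index.php?title=MediaWiki:Common.css&action=raw";
	# (hypothetical) yields both an absolute URI to fetch and an '@'-prefixed entry,
	# so the rewriting pass below knows to send it through convert_css_link()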


	# rewrite all the links
	foreach my $link (@links_to_modify) {
		my $converted = convert_link($link);
		# strip the marker that flagged this as an @import link before matching
		if ( substr($link,0,1) eq '@' ) {
			substr($link,0,1) = '';
		}
		$link = quotemeta($link);
		$content =~ s!$link!$converted!g;
	}


	my $filename = convert_link($url);

	return($filename, $content, @links);
}


sub write_to_disk {
	my ($path, $type, $thing) = @_;
	# return() if ( -e $path ); # this was a dumb idea

	$path =~ m!(.*)/(.*?)\.\w{2,4}$!;
	my ($tree, $filename) = ($1, $2);

	mkpath($tree) unless ( -e $tree and -d $tree );

	if ( $type =~ m!^text! ) {
		write_text($path, $thing);
	}
	else {
		write_bin($path, $thing);
	}

	print("Writing $filename to ${path}...\n");

	$writes++;
}
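# e.g. (assumed call) write_to_disk('./img/shot.png', 'image/png', $data) would
# create './img' if missing and hand the bytes to write_bin(); any text/* type
# goes through write_text() instead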


sub write_text {
	my ($outfile, $thing) = @_;

	open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!");
	print OUT $thing;
	close(OUT) or die("Couldn't close ${outfile}: $!");

	return();
}


sub write_bin {
	my ($outfile, $thing) = @_;

	open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!");
	binmode(OUT);
	print OUT $thing;
	close(OUT) or die("Couldn't close ${outfile}: $!");

	return();
}


# converts links to relative ones starting with $base_dir
sub convert_link {
	my $link = shift(@_);

	# dereference if necessary
	if ( ref($link) ) {
		$link = $$link;
	}

	# SPECIAL CASE: it's one of those @import links, handle it separately
	if ( substr($link,0,1) eq '@' ) {
		substr($link,0,1) = '';
		return(convert_css_link($link));
	}

	# is it relative?
	if ( substr($link,0,1) eq '/' ) {
		$link =~ s!^/docs/!$base_dir/!i;
	}
	else {
		my $quoted = quotemeta($host);
		$link =~ s!${quoted}/docs/!$base_dir/!i;
	}

	# if it doesn't have a filename extension it's probably a page,
	# and then we need to tack .html onto the end (blame internet explorer);
	# oh, and jfs's .lua pages aren't lua scripts either
	if ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
		$link = $link . '.html';
	}

	$link =~ s!\:!_!g; # replace : with _
	$link =~ s!\?!_!g; # replace ? with _

	return($link);
}
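
# a few worked examples of the mapping above (page and file names hypothetical,
# $base_dir being '.'):
#   convert_link('http://aegisub.cellosoft.com/docs/Main_Page')  ->  './Main_Page.html'
#   convert_link('/docs/Options:Main')                           ->  './Options_Main.html'
#   convert_link('/docs/img/shot.png')                           ->  './img/shot.png'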


# HAX
sub convert_css_link {
	my $link = shift(@_);

	# does it seem like css?
	if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
		return(convert_link('/docs/' . $1 . '.css'));
	}
	# has a sensible name already, leave it alone
	elsif ( $link =~ m!/(.+?)\.css$!i ) {
		return(convert_link($link));
	}
	# doesn't seem like anything useful
	else { return(undef); }
}
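
# e.g. (hypothetical): convert_css_link('index.php?title=MediaWiki:Monobook.css&action=raw')
# -> convert_link('/docs/Monobook.css') -> './Monobook.css';
# anything that doesn't look like css at all yields undef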


# argh
sub parse_css {
	my ($url, $base, $content) = @_;
	my @links;

	while ( $content =~ m!url\(\"(.+?)\"\)!mgi ) {
		push(@links, URI->new_abs($1, $base));
	}

	my $filename = convert_link($url);

	return($filename, @links);
}
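
# e.g. a rule like  background: url("/docs/skins/monobook/headbg.jpg");  (hypothetical
# path) contributes one absolute URI to @links; note that the converted filename is
# the first element of the returned list, which is why the caller peels it off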