#!/usr/bin/perl
#
# Tested with
#
# $ ./proctut.pl tutorial.html > foo && diff foo tutorial.html | head -20
#
# BTW, tutorial.html is from:
#
# http://www.geda.seul.org/docs/current/tutorials/gsch2pcb/tutorial.html
#
#
# Alternatively,
#
# http://diberri.dyndns.org/wikipedia/html2wiki/
#
# might do everything I was hoping this script would do.
#
#
# I just did (as wpd):
#
# $ perl -MCPAN -e 'install Bundle::HTMLWikiConverter'
#
# to install it, but I'm not sure it worked.
#
#
# I tried again, but it still looks like it didn't work.  Eventually I
# installed it as root, was able to run it, wasn't too happy with the
# results, and wasn't confident of my ability to change the script.  So
# now I'm back here.
#
# As of 05/03/07, I've been running this as:
#
# $ ./proctut.pl < tutorial.html > tutorial.wpd
#
# And looking at, and sometimes uploading, the results.
#
use strict;
use warnings;

# ----------------------------------------------------------------------
# Conversion state.
#
# The input is converted one line at a time, so everything that can span
# several input lines (lists, links, <pre> blocks) is remembered here
# between calls to convert_line().
# ----------------------------------------------------------------------
my $list_indent = 0;    # Indentation of the current list (2 per open <ul>)
my @list_char_stack;    # Bullet characters of the enclosing lists
my $list_char = "";     # Bullet character of the innermost open list
my $list_item = 0;      # True between <li> and </li>
my $in_href = 0;        # True between <a href=...> and its </a>
my $in_code = 0;        # True between <pre> and </pre>
my $pending_join = 0;   # True when the previously emitted line had its
                        # newline chomped and still needs a separator

# _convert_anchor($href, $named_anchor)
#
# Returns the wiki text that replaces a single anchor tag.  Exactly one
# of three cases applies:
#   - $href is defined:         an opening <a href="..."> tag; starts a
#                               [[target|label]] link.
#   - $named_anchor is defined: an <a name="..."> tag; it is dropped.
#   - neither is defined:       a closing </a>; it ends the open link, or
#                               is dropped if no link is open (it closed
#                               a named anchor).
sub _convert_anchor {
    my ($href, $named_anchor) = @_;

    if (defined $href) {
        $in_href = 1;
        return "[[$href|";
    }

    return "" if defined $named_anchor;

    if ($in_href) {
        $in_href = 0;
        return "]]";
    }

    return "";
}

# convert_line($line)
#
# Converts one line of the tutorial's HTML into wiki markup and returns
# the text to be written for it.  Returns the empty string for lines that
# produce no output (the DOCTYPE line, the <html> header line, and lines
# that are blank once the simple tags have been removed).
#
# Dies with "Bad value for list_indent" on a </ul> without an open <ul>.
#
# NOTE(review): blank lines are skipped and leading whitespace is
# stripped even inside <pre> blocks, so code indentation is lost.  This
# is unchanged from the original behaviour -- confirm whether it is
# wanted.
# NOTE(review): only <ul> lists are handled; <ol> is passed through as-is.
sub convert_line {
    my ($line) = @_;

    return "" if $line =~ /^<!DOCTYPE/;     # Skip the DOCTYPE line
    return "" if $line =~ /^<html>/;        # Skip the <html><head>... line

    # Remember whether the previous output line is still "open" (its
    # newline was removed), so this line can be separated from it.
    my $join_with_previous = $pending_join;

    # Get rid of <center> and <blockquote> pairs.
    $line =~ s{</?(?:center|blockquote)>}{}g;

    # Get rid of table related tags.  [^>]* stops at the end of the tag,
    # so the text between the tags is kept.
    $line =~ s{</?(?:table|tbody|tr|td)\b[^>]*>}{}g;

    # <h2>, <h3>, <h4> become headline levels 1, 2 and 3.
    $line =~ s{<h2>(.*?)</h2>}{====== $1 ======}g;
    $line =~ s{<h3>(.*?)</h3>}{===== $1 =====}g;
    $line =~ s{<h4>(.*?)</h4>}{==== $1 ====}g;

    # Bill appears to use <h5> to mean "bold"; at least that is the way
    # it is rendered in a browser for the one place he used it.
    $line =~ s{<h5>(.*?)</h5>}{**$1**}g;

    $line =~ s{<b>(.*?)</b>}{**$1**}g;      # <b>...</b> to **...**
    $line =~ s{<hr\b[^>]*>}{----}g;         # Horizontal line
    $line =~ s{<i>(.*?)</i>}{//$1//}g;      # <i>...</i> to //...//

    # Embedded images.
    $line =~ s!<img src="tutorial_files/([^"]*)"[^>]*>!{{wiki:$1}}!g;

    $line =~ s{</p>}{}g;                    # Get rid of </p>

    return "" if $line =~ /^\s*$/;          # Skip blank lines

    $line =~ s/<br>/\\\\\n/g;               # <br> to forced linebreak (\\\n)
    $line =~ s/^\s+//;                      # Drop leading whitespace

    # Lists.  Each <ul> nests two columns deeper; the bullet character of
    # the enclosing list is saved so it can be restored at </ul>.
    if ($line =~ s/<ul>/\n/) {
        $list_indent += 2;
        push @list_char_stack, $list_char;
        $list_char = "*";
    }
    if ($line =~ s/<\/ul>//) {
        die "Bad value for list_indent" if $list_indent < 2;
        $list_indent -= 2;
        $list_char = pop @list_char_stack;
    }

    # The bullet goes in front of everything else emitted for this line.
    my $prefix = "";
    if ($line =~ s/<li>//) {
        $list_item = 1;
        $prefix = " " x $list_indent . $list_char;
    }
    if ($line =~ s/<\/li>/\n/) {
        $list_item = 0;
    }

    # Deal with <pre> and </pre> by calling them <code> and </code>.
    if ($line =~ s/<pre>/<code>/g) {
        $in_code = 1;
    }
    if ($line =~ s/<\/pre>/<\/code>/g) {
        $in_code = 0;
    }

    # Inside a list item the input lines are joined into one output line.
    my $item_continues = $list_item && !$in_code;
    chomp $line if $item_continues;

    # Anchors.  All anchor tags on the line are handled in the order they
    # appear, so several links on one line work and a </a> is paired with
    # the tag it actually closes.  A link may also span several lines;
    # $in_href carries that over.  <a name="..."> tags are simply removed.
    # (No '$' or braces in the comments below: they sit inside a regex.)
    $line =~ s{
          <a \s+ href="([^"]*)" [^>]*>        # capture 1: link target
        | (<a \s+ name="[^"]*" [^>]*> \s*)    # capture 2: named anchor
        | </a>                                # any closing anchor tag
    }{
        _convert_anchor($1, $2)
    }gex;

    # Now that blank lines have been skipped, <p> becomes a blank line.
    $line =~ s/<p>/\n/g;

    # Decode the entities last, so that text which was escaped in the
    # HTML (e.g. "&lt;b&gt;") is not mistaken for markup above.
    $line =~ s/&gt;/>/g;                    # Replaces "&gt;" with ">"
    $line =~ s/&lt;/</g;                    # Replaces "&lt;" with "<"

    $pending_join = $item_continues ? 1 : 0;

    # Newlines were chomped off the lines inside a list item, so put a
    # space in their place: before every line of an open item, and also
    # before the line that closes an item begun on an earlier line.
    my $separator = ($item_continues || $join_with_previous) ? " " : "";

    return $prefix . $separator . $line;
}

# Run the conversion only when executed as a script, so that the file can
# also be loaded (require/do) to call convert_line() directly.
unless (caller) {
    while (my $line = <>) {
        print convert_line($line);
    }
}
