#!perl

# This code is by Sean Boisen, 2003. You are free to use it yourself
# for non-commercial purposes provided this notice remains intact. All
# rights reserved, no warranties expressed or implied, your mileage
# may very well be different or 0.

use strict;
use vars qw($opt_usage $opt_file $opt_stopwords $opt_baseforms
	    $opt_debug);
use XML::Twig;
use Tokens;

# don't buffer output
$| = 1;

#MAIN
{
    # Shared state for the whole run. The verse() handler further down is a
    # named sub defined inside this same block, so it reads and updates
    # these lexicals directly while the input file is being parsed.
    my %tokens = ();        # capitalized raw tokens and their frequencies
    my $verse_count = 0;    # number of verses (distinct consecutive verse IDs)
    my $word_count = 0;     # number of words (instances)
    my %stopwords = ();     # frequent words to ignore
    # could just compute these on the fly, but that would take an extra
    # pass over the data
    my %baseforms = ();     # map from inflected/derived forms to base forms
    my %term_verses = ();   # maps a base word to a list of verses
    my %verse_tokens = ();  # maps a verseID to a list of tokens (incl whitespace)
    # maps a verseID to a list of either concordance terms or 0s:
    # co-indexed with verse_tokens
    my %verse_terms = ();
    my %seen_term = ();     # one term->verse link only
    my $verseid = "";       # ID of the verse currently being processed

    # Fill in the $opt_* globals, then load the two word lists they name.
    process_options();
    file2hash($opt_stopwords, \%stopwords);
    file2hash($opt_baseforms, \%baseforms);

    # Direct class-method call rather than the ambiguous indirect-object
    # form "new XML::Twig(...)". verse() is registered as the handler for
    # every <verse> element.
    my $twig = XML::Twig->new(TwigHandlers => { verse => \&verse });

    $twig->parsefile($opt_file);

    # XML::Twig handler for a <verse> element.
    #
    # Tokenizes the verse text and records, in the enclosing block's
    # variables:
    #   - %tokens:       frequency of each capitalized word token
    #   - %term_verses:  for each content term, the verses it occurs in
    #                    (at most one entry per term per verse ID)
    #   - %verse_tokens: the raw tokens of the verse, for pretty output
    #   - %verse_terms:  co-indexed with %verse_tokens; the content term
    #                    for each token, or 0 for non-content tokens
    #
    # Parameters: the twig and the verse element (supplied by XML::Twig).
    sub verse {
        # handler params are always the twig and the element
        my ($twig, $verse) = @_;

        # one verse is sometimes split in two: so only reset when the
        # verseid changes
        my $osis_id = $verse->atts->{'osisID'};
        unless ($verseid eq $osis_id) {
            $verseid = $osis_id;
            $verse_count++;
            %seen_term = ();
        }

        # creates both normal words, and funny whitespace
        # tokens: necessary to create good HTML later
        my @tokens = Tokens::get_tokens($verse->text);

        # map base forms of content words into verse_terms: other
        # elements become 0
        my @terms = ();
        foreach my $token (@tokens) {
            my $term = 0;       # default: not content
            # ignore non-content tokens
            if ($token =~ /\w+/) {
                $word_count++;
                my $cap_token = ucfirst($token);
                $tokens{$cap_token}++;
                # only use tokens that aren't stopwords
                unless (defined($stopwords{$cap_token})) {
                    # get a baseform, or use the original
                    my $candidate = defined($baseforms{$cap_token})
                        ? $baseforms{$cap_token}
                        : $cap_token;
                    # Check again to make sure the baseform isn't a stop
                    # word. Only now does the token become a term: a
                    # stopword baseform never gets a page, so leaving it
                    # in verse_terms would produce a dead hyperlink.
                    unless (defined($stopwords{$candidate})) {
                        $term = $candidate;
                        unless ($seen_term{$term}) {
                            $seen_term{$term} = 1;
                            push(@{$term_verses{$term}}, $verseid);
                        }
                    }
                }
            }
            push(@terms, $term);
        }

        # store the 'raw' tokens for pretty output later
        push(@{$verse_tokens{$verseid}}, @tokens);
        # parallel to verse_tokens
        push(@{$verse_terms{$verseid}}, @terms);
    }


    # Get rid of any terms with only a single verse: no point in
    # generating pages for these since there's nothing new to link to.
    # They are remembered in %singletons because they are already stored
    # in verse_terms and must be filtered out again when rendering.
    my %singletons = ();
    foreach my $term (keys %term_verses) {
        if (scalar(@{$term_verses{$term}}) < 2) {
            delete $term_verses{$term};
            $singletons{$term} = 1;
        }
    }

    # For each content term, generate an HTML page, hyperlinking
    # content terms within each verse. The index page gets one table row
    # per initial character, listing every term that starts with it.
    my $initial = "";
    my $index_file = "hyperconc-index.html";
    open(my $index_fh, '>', $index_file)
        or die "Can't open $index_file: $!";
    print {$index_fh} index_header();
    foreach my $term (sort keys %term_verses) {
        my $newinitial = substr($term, 0, 1);
        if ($newinitial ne $initial) {
            # close the previous initial's row before starting a new one
            unless ($initial eq "") { print {$index_fh} "</td></tr>\n"; }
            $initial = $newinitial;
            print {$index_fh} "<tr><td valign=\"top\"><a name=\"$initial\"><b>$initial:</b></a>&nbsp;</td><td>";
        }
        print {$index_fh} " <a href=\"$initial/$term.html\">$term</a>";

        # one output directory per initial character
        unless (-d $initial) {
            mkdir($initial) or die("Can't mkdir $initial: $!");
        }
        # this should be done more portably
        my $page = "$initial/$term.html";
        open(my $out_fh, '>', $page)
            or die "Can't open $page for output: $!";
        print {$out_fh} html_header($term);
        foreach my $verse (@{$term_verses{$term}}) {
            # filter out any singletons: they have no page to link to
            my @vterms = map { defined($singletons{$_}) ? 0 : $_ }
                         @{$verse_terms{$verse}};
            print {$out_fh}
                html_for_verse($term, $verse, $verse_tokens{$verse}, \@vterms);
        }
        print {$out_fh} html_footer();
        # buffered write errors only surface at close
        close($out_fh) or die "Can't close $page: $!";
    }
    # Close the last row only if one was opened; html_footer() itself
    # emits the closing </table> tag, so it must not be repeated here.
    unless ($initial eq "") { print {$index_fh} "</td></tr>\n"; }
    print {$index_fh} html_footer();
    close($index_fh) or die "Can't close $index_file: $!";
}

# Return the opening HTML of a term page: document head, a banner table
# naming the term with a link back to the index, and the opening tag of
# the table that will hold the verses.
#   $term - the concordance term the page is about
sub html_header {
    my $term = shift;
    my $heading = "New Testament Hyper-concordance: \"$term\"";
    my @lines = (
        "<html><head><title>$heading</title>",
        '<LINK REL=StyleSheet HREF="../stylesheet.css" TYPE="text/css"></head><body>',
        '<table width="100%" id="banner"><tr>',
        # the trailing blank after </td> is part of the emitted markup
        "  <td id=\"bannerspan\">$heading</td> ",
        '  <td style="text-align: right" id="bannerspan">(<a href="../hyperconc-index.html">Index</a>)</td>',
        '</tr></table>',
        '<table width="100%">',
    );
    return join("\n", @lines);
}

# Render one verse as a table row for the page of $term.
#   $term       - the term whose page is being generated
#   $verse      - the verse ID, shown in the first column
#   $tokens_ref - ref to the verse's raw tokens (words and whitespace)
#   $terms_ref  - ref to a co-indexed list: a term, or a false value for
#                 tokens that are not to be linked
# Tokens whose term is the page's own term are set in bold, tokens with
# another term become hyperlinks, everything else is copied verbatim.
sub html_for_verse {
    my ($term, $verse, $tokens_ref, $terms_ref) = @_;
    my @pieces;
    foreach my $idx (0 .. $#{$tokens_ref}) {
        my $token = $tokens_ref->[$idx];
        my $token_term = $terms_ref->[$idx];
        if (!$token_term) {
            push(@pieces, $token);
            next;
        }
        push(@pieces, ($token_term eq $term)
                          ? "<b>$token</b>"
                          : html_for_a($token, $token_term));
    }
    return "<tr><td valign=\"top\">$verse&nbsp;</td><td>"
        . join('', @pieces)
        . "</td></tr>\n";
}

# Return the closing HTML shared by all generated pages: it closes the
# open table, adds the credits footer, and ends the document.
sub html_footer {
    my $project_link = '<a href="http://radio.weblogs.com/0122862/stories/2003/05/18/theNewTestamentHyperconcordance.html">Hyper-concordance</a>';
    my $author_link = '<a href="http://radio.weblogs.com/0122862/">Sean Boisen</a>';
    return join("\n",
        '</table><br>',
        # the trailing blank after the period is part of the emitted markup
        "<div class=\"footer\">$project_link v1.1, by $author_link. ",
        'Scripture verses taken from the Revised Standard Version, copyright 1952, 1971.</div>',
        "</body></html>\n",
    );
}

# Return a hyperlink from a verse token to the page of its term.
#   $token - the text as it appears in the verse (becomes the link text)
#   $term  - the concordance term whose page is the link target
# Term pages are written to "<first character of the term>/<term>.html",
# so the directory must be derived from the term, not from the token:
# the token may differ in case ("loved" -> "Love") or even in its first
# letter ("went" -> "Go").
sub html_for_a {
    my ($token, $term) = @_;
    my $initial = substr($term, 0, 1);
    return "<a href=\"../$initial/$term.html\">$token</a>";
}

# Return the opening HTML of the index page: document head, banner, a
# row of A-Z jump links to the per-initial anchors, and the opening tag
# of the table that will list the terms.
sub index_header {
    my @jump_links;
    for my $letter ('A' .. 'Z') {
        push(@jump_links, qq{<a style="" href="#$letter">$letter</a>  });
    }
    my $index = join('', @jump_links);

    my $head = '<html><head><title>New Testament Hyper-concordance Index</title><LINK REL=StyleSheet HREF="stylesheet.css" TYPE="text/css"></head><body>';
    my $banner = '<div id="banner">New Testament Hyper-concordance Index</div>';
    my $jump_bar = '<div class="itemTitle" style="text-align: center; word-spacing: .75em; font-size: 100%">'
        . $index . '</div><br>';

    # the sections are separated by blank lines in the emitted HTML
    return $head . "\n\n" . $banner . "\n\n" . $jump_bar . "\n<table>";
}



# Read a file, presumed to have one entry per line, optionally
# followed by a number indicating count of this token in the corpus,
# and store each entry in a hash.
#   $file    - path of the file to read
#   $hashref - ref to the hash to fill: entry => count, or 1 when the
#              line carries no count
# Blank lines are skipped. Dies if the file cannot be opened. When the
# debug option is set, reports how many items the hash holds afterwards.
sub file2hash {
    my ($file, $hashref) = @_;
    # 3-arg open with a lexical handle: the file name cannot be
    # misread as an open mode, and no global handle is left behind
    open(my $in, '<', $file) or die "Can't open $file: $!\n";
    while (my $line = <$in>) {
        # splitting on ' ' ignores leading whitespace and the newline
        my ($term, $value) = split(' ', $line);
        # a blank line yields no term: don't store an empty key
        next unless defined($term);
        $hashref->{$term} = defined($value) ? $value : 1;
    }
    close($in);
    if ($opt_debug) {
        print ">> $file contained ", scalar(keys %{$hashref}), " items\n";
    }
}

# Set the option defaults, parse the command line into the $opt_*
# globals, and verify that every input file can be read.
# Prints the usage text and exits with status 0 when help is requested.
# Dies with the usage text on an invalid command line or when one of
# the files is not readable.
sub process_options {
    $opt_usage = 0;
    $opt_debug = 0;
    $opt_file = "../OSIS/RSV/rsv-nt.xml";
    $opt_stopwords = "stopwords.txt";
    $opt_baseforms = "baseforms.txt";

    # Parse the command line options. With no explicit destinations,
    # Getopt::Long stores each value in the matching $opt_* global.
    use Getopt::Long;
    GetOptions("usage|help|?", "file=s", "stopwords=s", "baseforms=s",
               # unadvertised
               "debug")
        or die usage();

    if ($opt_usage) { print usage(); exit 0; }
    my %filetypes = ("input file" => $opt_file,
                     "stopwords file" => $opt_stopwords,
                     "baseforms file" => $opt_baseforms);
    # Make sure all the configuration files are readable: -r rather
    # than -e, since a file that exists may still be unreadable. Sorted
    # so the first problem reported does not depend on hash order.
    foreach my $label (sort keys %filetypes) {
        unless (-r $filetypes{$label}) {
            die "$label $filetypes{$label} isn't readable\n" . usage();
        }
    }
}

# Return the usage text shown for -usage/-help and appended to the
# error messages about the command line or unreadable input files.
sub usage {
    return <<'END_USAGE';
usage: hyperconc.pl [-file rsv-nt.xml] [-stopwords stopwords.txt]
    [-baseforms baseforms.txt]

    Parses the input file, which must be in OSIS format (default is
    rsv-nt.xml). Iterates over each verse, tokenizes it, etc. Tokens
    in the stopwords file, and tokens without content (just whitespace
    and/or punctuation) aren't processed further. Each remaining token
    is mapped to a base (dictionary) form using the table in the
    baseforms file.

    Also counts things.
END_USAGE
}
