# This code is by Sean Boisen, 2003. You are free to use it yourself
# for non-commercial purposes provided this notice remains intact. All
# rights reserved, no warranties expressed or implied, your mileage
# may very well be different or 0.

package Tokens;
use strict;
use warnings;

# Package globals read by Exporter and by version checks.
our ($VERSION, @EXPORT_OK, @ISA, @EXPORT);
require Exporter;
@ISA       = qw(Exporter);
# Nothing is exported by default; callers must ask for names explicitly.
@EXPORT    = qw();
# 'tokens' is the name this module has always advertised and is kept so
# existing import lists keep working. 'get_tokens' is the tokenizer sub
# defined in this file; listing it lets callers import it by its real name.
@EXPORT_OK = qw(tokens get_tokens);
# Derive the version from the revision keyword: "0.9" becomes "0.09".
$VERSION = sprintf('%d.%02d', (q$Revision: 0.9 $ =~ /\d+/g));

# problem: the RSV marks pronunciation and hyphenation like
# She-al'ti-el. splitting on \b makes this into 7 tokens!
# my @tokens = split(/\b/, $verse->text); 

#this solves that problem, but includes trailing punctuation
#my @tokens = split(/(?=\S)(?<!\S)|(?!\S)(?<=\S)/, $verse->text); 

# current approach: first remove these artifacts (being careful to
# leave 's, etc.), then tokenize with \b

# Split a verse into tokens.
#
# Takes the verse text as its only argument. Word-internal apostrophes
# and hyphens (the RSV pronunciation/hyphenation marks) are removed,
# except for those that protect_word_punct() recognises as genuine word
# punctuation ('s, forty-six, mother-in-law, ...). The cleaned text is
# then split on word boundaries, so the returned list alternates between
# word pieces and the whitespace/punctuation runs between them, e.g.
#   get_tokens("She-al'ti-el begat")  ->  ("Shealtiel", " ", "begat")
# An undefined or empty argument yields an empty list.
sub get_tokens {
    my $text = shift;
    $text = '' unless defined $text;

    my $versetext = protect_word_punct($text);

    # Remove every remaining apostrophe or hyphen that sits between two
    # word characters. Lookarounds are used instead of capturing the
    # neighbours: a consuming match would swallow the letter after one
    # mark and then fail to see it as the letter before the next one,
    # leaving "Na'a-man" as "Naa-man".
    $versetext =~ s/(?<=\w)['-](?=\w)//g;

    # Turn the placeholders inserted by protect_word_punct() back into
    # the real punctuation characters.
    $versetext =~ s/#\|#/\'/g;
    $versetext =~ s/#_#/-/g;

    return (split(/\b/, $versetext));
}

# 'tokens' is the name listed in @EXPORT_OK; it simply forwards to
# get_tokens() so that importing and calling the advertised name works.
sub tokens {
    return get_tokens(@_);
}

# "protect" word-internal punctuation characters by temporarily
# replacing them with a unique sequence, so they can later be
# converted back after the pronunciation marks of the RSV have been
# removed. This list just covers the New Testament. 
# Takes the verse text and returns a copy in which each protected
# apostrophe is replaced by "#|#" and each protected hyphen by "#_#".
# The work is done on a lexical copy: neither the argument nor the
# global $_ is touched, so calling this from inside "for (@verses)"
# cannot corrupt the caller's data.
sub protect_word_punct {
    my $text = shift;
    return $text unless defined $text;

    # apostrophe s at end of word
    $text =~ s/\'s\b/#|#s/g;

    # forty-six, thirty-eight, ninety-nine, eighty-four
    $text =~ s/-(two|three|four|five|six|eight|nine)/#_#$1/g;

    # mother|daughter-in-law. This must run before the prefix rule
    # below, which could otherwise claim the first hyphen on its own.
    $text =~ s/-in-law/#_#in#_#law/g;

    # Words that keep the hyphen FOLLOWING them: self-control,
    # money-changers, birth-pangs, empty-handed, passer-by, first-born,
    # high-priesthood, rock-hewn, ill-treat(ment)/ill-clad, stiff-necked,
    # God-fearing, storm-tossed, well-doing, simple-minded, life-giving,
    # eye-service, evil-worker, men-pleasers, double-tongued,
    # hard-working, quick-tempered, two-edged, full-grown, moth-eaten,
    # loud-mouthed, hundred-weight.
    my $prefixes = join '|', qw(
        self money birth empty passer first high rock ill stiff God
        storm well simple life eye evil men double hard quick two full
        moth loud hundred
    );
    $text =~ s/($prefixes)-/$1#_#/g;

    # Words that keep the hyphen PRECEDING them: so-called,
    # mischief-maker.
    $text =~ s/-(called|maker)/#_#$1/g;

    return $text;
}


1;    

__END__

