# License Start:
#                    Carnegie Mellon University
#                      Copyright (c) 2004
#                       All Rights Reserved.
#
# Permission is hereby granted, free of charge, to use and distribute
# this software and its documentation without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this work, and to
# permit persons to whom this work is furnished to do so, subject to
# the following conditions:
#  1. The code must retain the above copyright notice, this list of
#     conditions and the following disclaimer.
#  2. Any modifications must be clearly marked as such.
#  3. Original authors' names are not deleted.
#  4. The authors' names are not used to endorse or promote products
#     derived from this software without specific prior written
#     permission.
#
# CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK
# DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
# SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE
# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
#
# Author: Satanjeev "Bano" Banerjee satanjeev@cmu.edu
# Author: Alon Lavie alavie@cs.cmu.edu
# Author: Abhaya Agarwal abhayaa@cs.cmu.edu
#
# License End.

package wn_stem;

require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw(setUpDataStructures);

my $wn;                 # will contain the wordnet object
my $wnValidFormsRef;    # cache for validForms lookups

# this will create the alignment ready data structures by matching
# only words that are identical after stemming (but not identical
# before stemming)
sub setUpDataStructures {
    my $firstStringWordsRef           = shift;
    my $secondStringWordsRef          = shift;
    my $string2OriginalPosRef         = shift;
    my $string2MatchedPosRef          = shift;
    my $multiChoiceWordIndexesRef     = shift;
    my $posChoicesRef                 = shift;
    my $alreadyAlignedFirstStringRef  = shift;
    my $alreadyAlignedSecondStringRef = shift;
    my $lang = shift;
    $wn              = shift;
    $wnValidFormsRef = shift;

    my @firstStringStems  = ();
    my @secondStringStems = ();

    my $i;

    # Take the first string, stem the words and create a hash that
    # maps words to their (list of) positions in the string. Do this
    # only for words that are not already aligned in the previous
    # stages

    my %string1Pos = ();
    for ( $i = 0 ; $i <= $#{$firstStringWordsRef} ; $i++ ) {
        next if ( defined ${$alreadyAlignedFirstStringRef}{$i} );

        my $wordStem = wn_stem( ${$firstStringWordsRef}[$i] );
        $firstStringStems[$i] = $wordStem;
        push @{ $string1Pos{$wordStem} }, $i;
    }

    # Do the same for the second string
    my %string2Pos = ();
    for ( $i = 0 ; $i <= $#{$secondStringWordsRef} ; $i++ ) {
        next if ( defined ${$alreadyAlignedSecondStringRef}{$i} );

        my $wordStem = wn_stem( ${$secondStringWordsRef}[$i] );
        $secondStringStems[$i] = $wordStem;
        push @{ $string2Pos{$wordStem} }, $i;
    }

    # Now to construct the data structures for the alignment module
    my $index = 0;
    for ( $i = 0 ; $i <= $#secondStringStems ; $i++ ) {

        # skip if already aligned
        next if ( defined ${$alreadyAlignedSecondStringRef}{$i} );

        # skip if this stem doesn't occur in first string
        next unless ( defined $string1Pos{ $secondStringStems[$i] } );

        ${$string2OriginalPosRef}[$index] = $i;    # position in original 2nd string

        # check if occurs only once in both first and second string
        if (   ( $#{ $string1Pos{ $secondStringStems[$i] } } == 0 )
            && ( $#{ $string2Pos{ $secondStringStems[$i] } } == 0 ) )
        {
            ${$string2MatchedPosRef}[$index] = ${ $string1Pos{ $secondStringStems[$i] } }[0];
        }

        else {

            # okay, so this word has multiple pos choices
            ${$multiChoiceWordIndexesRef}{$index} = 1;

            ${$string2MatchedPosRef}[$index] =
              $secondStringStems[$i];    # this will be the key into the $posChoicesRef hash

            # create pos choices hash element for this stem, if not already created!
            unless ( defined ${$posChoicesRef}{ ${$string2MatchedPosRef}[$index] } ) {

                # put in all the first string positions for this word
                @{ ${$posChoicesRef}{ ${$string2MatchedPosRef}[$index] } } =
                  @{ $string1Pos{ ${$string2MatchedPosRef}[$index] } };

                # let d = number of occurrences in second string minus num
                # occ in second string for this word. Need d instances of
                # "-1" (skip) choices in the choices array.
                my $k;
                for (
                    $k = 0 ;
                    $k < (
                        $#{ $string2Pos{ ${$string2MatchedPosRef}[$index] } } -
                          $#{ $string1Pos{ ${$string2MatchedPosRef}[$index] } }
                    ) ;
                    $k++
                  )
                {
                    push @{ ${$posChoicesRef}{ ${$string2MatchedPosRef}[$index] } }, "-1";
                }
            }
        }

        $index++;
    }
}

sub wn_stem {
    my $input = shift;

    if($input eq ""){
     return $input;
    } 
    
    # get all the possible stems

    # Since we are making changes to the forms,
    # we make a copy.
    my @forms = @{ valid_forms_lookup($input) };

    # find the shortest string and return that as the stem
    my $output = "";
    foreach (@forms) {
        s/#.*//;    # get rid of trailing pos information
        $output = $_ if ( ( $output eq "" ) || ( length($output) > length($_) ) );
    }

    # if nothing found, return the input itself
    $output = $input if ( $output eq "" );

    return ($output);
}

# for doing the validForms lookup, this routine first checks in the cache
# and if not available performs a lookup from wordnet.
sub valid_forms_lookup {
    my $w = shift;

    # Let's do the lookup and put that in the cache
    ${$wnValidFormsRef}{$w} = [ $wn->validForms($w) ] unless ( exists ${$wnValidFormsRef}{$w} );

    return ${$wnValidFormsRef}{$w};
}

1;
