# License Start:
#                    Carnegie Mellon University
#                      Copyright (c) 2004
#                       All Rights Reserved.
#
# Permission is hereby granted, free of charge, to use and distribute
# this software and its documentation without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this work, and to
# permit persons to whom this work is furnished to do so, subject to
# the following conditions:
#  1. The code must retain the above copyright notice, this list of
#     conditions and the following disclaimer.
#  2. Any modifications must be clearly marked as such.
#  3. Original authors' names are not deleted.
#  4. The authors' names are not used to endorse or promote products
#     derived from this software without specific prior written
#     permission.
#
# CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK
# DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
# SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE
# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
#
# Author: Satanjeev "Bano" Banerjee satanjeev@cmu.edu
# Author: Alon Lavie alavie@cs.cmu.edu
# Author: Abhaya Agarwal abhayaa@cs.cmu.edu
#
# License End.

# Module that takes two strings of space separated words as input and
# aligns matching words between the two strings over 1 or more
# stages. Each stage uses different matching criteria (exact match,
# stemmed match, synonymy match, relatedness match, etc) to find match
# candidates for the alignment process to align.

# this function takes as input a hash, containing the following structure:
# firstString -> the first string of words
# secondString -> the second string of words
# modules -> array of module names (1 or more)
# wn -> WordNet object / undefined
# maxComputations -> integer/undefined
# details -> 0/1/undefined
# debug -> 0/1/undefined
# prune -> 0/undefined=off, 1=on
# stop -> hash containing words to be stopped
# matchScore -> double dimensioned array with as many rows as modules. For row i, col 0 = number of matches, col 1 = score, for module i
# alignment -> This is the final alignment produced by the matcher.
# sizeOfFirstString -> # words in first string (after stopping, if stopping was requested)
# sizeOfSecondString -> # words in second string (after stopping, if stopping was requested)
# numChunks -> number of chunks in final match
# avgChunkLength -> average words in each chunk
# detailString -> string containing all text output due to "details" field being set to 1
# debugString -> string containing all text output due to "debug" field being set to 1

package mStageMatcher;

require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw(match);

sub match {
    ############################################################
    # First get all the inputs 	################################
    ############################################################

    my $inputHashRef = shift;

    # Check what language we are working with
    my $lang = lc($$inputHashRef{"language"}) || "en";
    
    # get the two strings
    my $firstString  = lc( $$inputHashRef{"firstString"} );
    my $secondString = lc( $$inputHashRef{"secondString"} );

    # check if the two strings need to be stopped
    if ( defined $$inputHashRef{"stop"} ) {
        %stopHash = %{ ${$inputHashRef}{"stop"} };
        removeStopWords( \$firstString );
        removeStopWords( \$secondString );
    }

    # split the strings into words
    @firstStringWords  = split /\s+/, $firstString;
    @secondStringWords = split /\s+/, $secondString;

    # put in the sizes of the strings
    $$inputHashRef{"sizeOfFirstString"}  = $#firstStringWords + 1;
    $$inputHashRef{"sizeOfSecondString"} = $#secondStringWords + 1;

    # get the modules
    my @modules = @{ $$inputHashRef{"modules"} };

    # intialize WordNet object if provided
    my $wn                      = "";
    my $wnValidFormsRef         = "";
    my $wnSynsetOffsetHashesRef = "";

    if ( defined ${$inputHashRef}{"wn"} ) {
        $wn                      = $$inputHashRef{"wn"};
        $wnValidFormsRef         = $$inputHashRef{"wnValidForms"};
        $wnSynsetOffsetHashesRef = $$inputHashRef{"wnSynsetOffsetHashes"};
    }

    # initialize maxComputations
    $maxComputations = ( defined $$inputHashRef{"maxComputations"} ) ? $$inputHashRef{"maxComputations"} : -1;
    $numComputations = 0;

    # switch on/off detailed view
    if ( ( defined $$inputHashRef{"details"} ) && ( $$inputHashRef{"details"} == 1 ) ) {
        $DETAILS      = 1;
        $detailString = "";
    }
    else {
        $DETAILS = 0;
    }

    # switch on/off debug mode
    if ( ( defined $$inputHashRef{"debug"} ) && ( $$inputHashRef{"debug"} == 1 ) ) {
        $DEBUG       = 1;
        $debugString = "";
    }
    else {
        $DEBUG = 0;
    }

    # switch on/off PRUNE_OFF (opposite of "prune")
    $PRUNE_OFF = ( ( defined $$inputHashRef{"prune"} ) && ( $$inputHashRef{"prune"} == 1 ) ) ? 0 : 1;

    ############################################################
    # Next, align and get scores module by module ##############
    ############################################################

    @finalAlignment = ();    # Array containing the final alignment that will get built across all the alignment stages.
    @tokenAlignmentStage = ();    # Array to remember which stage each token's alignment came from.
    foreach (@secondStringWords) {
        push @finalAlignment,      "-1";
        push @tokenAlignmentStage, "";
    }
    %alreadyAlignedFirstString  = ();
    %alreadyAlignedSecondString = ();

    # do this loop as many times as there are modules
    my $alignmentStage = 0;
    for ( $alignmentStage = 0 ; $alignmentStage <= $#modules ; $alignmentStage++ ) {

        # We need to set up the following four data structures for each alignment stage

        @string2OriginalPos = ();    # array containing pointers to the
                                     # original second string - needed since some of the words in the
                                     # original second string may get dropped from the computation since
                                     # they dont occur in the first string

        @string2MatchedPos = ();     # array containing pointers to the words
                                     # in the original first string that the words in the second string
                                     # got alligned to. At the end of the computation, this will
                                     # represent the "optimal" match

        %multiChoiceWordIndexes = ();    # indexes of words in the second
                                         # string array that have 2 or more position choices

        %posChoices = ();                # hash that maps each multi choice token to the
                                         # list of target position choices

        # Load in current module
        require "$modules[$alignmentStage].pm";
        my $temp = $modules[$alignmentStage];
        import $temp;

        # this function will be in the module just loaded
        setUpDataStructures(
            \@firstStringWords,          \@secondStringWords,          \@string2OriginalPos,
            \@string2MatchedPos,         \%multiChoiceWordIndexes,     \%posChoices,
            \%alreadyAlignedFirstString, \%alreadyAlignedSecondString, $lang, $wn,
            $wnValidFormsRef,            $wnSynsetOffsetHashesRef
        );    # if the module needs the wn object,
              # here it is. If the module doesn't
              # need it, it can just ignore the last
              # argument.

        if ($DEBUG) {
            $debugString .= "String to matched pos: \n";
            for ( $i = 0 ; $i <= $#string2MatchedPos ; $i++ ) {
                $debugString .=
                  "[$secondStringWords[$string2OriginalPos[$i]]] $string2OriginalPos[$i] $string2MatchedPos[$i]\n";
            }
            foreach ( sort keys %posChoices ) {
                $debugString .= "Position choices for $_: @{$posChoices{$_}}\n";
            }
            $debugString .= "\n";
        }

        # print "Entries in validForms cache: ".keys(%{$wnSynsetOffsetHashesRef})."\n";
        # now to find the alignment with the lowest score

        %usedUpPos = ();    # data structure to keep track of which pos's have been used up
        foreach ( keys %posChoices ) {
            my $temp;
            foreach $temp ( @{ $posChoices{$_} } ) {
                push @{ $usedUpPos{$_} }, 0;
            }
        }

        %fixedScoresCache = ();
        $bestScoreSoFar   = -1;
        @bestMatchSoFar   = ();
        $scoreSoFar       = 0;

        @currentMatch = @string2MatchedPos;

        # Call the recursive matching algorithm, and initialize it
        # with the first multi choice word
        getBestMatch(0);

        if ($DEBUG) {
            $debugString .= "Indexes:       ";
            $iter = 0;
            foreach (@firstStringWords) { $debugString .= " $iter  "; $iter++; }
            $debugString .= "\n";
            $debugString .= "First string:  ";
            foreach (@firstStringWords) {
                $debugString .= "[$_] ";
            }
            $debugString .= "\n";
            $debugString .= "Second string: ";
            foreach (@secondStringWords) {
                $debugString .= "[$_] ";
            }
            $debugString .= "\n";
            $debugString .= "Best match:    ";
            foreach (@bestMatchSoFar) {
                if ( $_ == -1 ) {
                    $debugString .= "-1  ";
                }
                else {
                    $debugString .= " $_  ";
                }
            }
            $debugString .= "\n";
            $debugString .= "Best score: $bestScoreSoFar\n";
            $debugString .= "Total matches: $totalMatches\n";
            $debugString .= "Fixed scores cache:\n";
            foreach ( sort keys %fixedScoresCache ) {
                $debugString .= "  $_: $fixedScoresCache{$_}\n";
            }
        }

        # Find number of non negative pos's in bestMatchSoFar
        my $numMatches = 0;
        foreach (@bestMatchSoFar) {
            $numMatches++ if ( $_ ne "-1" );
        }

        # Get base fixed score
        my $baseFixedScore = computeBaseFixedScore();
        my $totalScore     = $baseFixedScore + $bestScoreSoFar;

        if ($DEBUG) {
            $debugString .= "Base fixed score: $baseFixedScore\n";
            $debugString .= "Total score: $totalScore\n";
        }

        # push score into input hash
        ${ $$inputHashRef{"matchScore"} }[$alignmentStage][0] = $numMatches;
        ${ $$inputHashRef{"matchScore"} }[$alignmentStage][1] = $totalScore;    # This score is the number of flips

        # copy the best match over to the final alignment
        for ( $i = 0 ; $i <= $#string2OriginalPos ; $i++ ) {
            $finalAlignment[ $string2OriginalPos[$i] ]      = $bestMatchSoFar[$i];
            $tokenAlignmentStage[ $string2OriginalPos[$i] ] = $modules[$alignmentStage];
        }

        # from the finalAlignment, create hash of first and second string
        # words that are matched, and should not be used in the following
        # stages
        for ( $i = 0 ; $i <= $#finalAlignment ; $i++ ) {
            next if ( $finalAlignment[$i] == -1 );
            $alreadyAlignedFirstString{ $finalAlignment[$i] } = 1;
            $alreadyAlignedSecondString{$i} = 1;
        }
    }

    # Assign the final alignment to inputHash
    $$inputHashRef{"alignment"} = [@finalAlignment];
    $$inputHashRef{"alignmentStages"} = [@tokenAlignmentStage];

    # compute number of chunks and average chunk length
    my $numMatches        = 0;    # need to count *overall* num matches across all iterations
    my $numChunks         = 0;
    my $avgChunkLength    = 0;
    my $chunkStartPointer = -1;

    for ( $i = 0 ; $i <= $#finalAlignment ; $i++ ) {
        $numMatches++ if ( $finalAlignment[$i] != -1 );
        if ( $chunkStartPointer != -1 ) {    # there is an open chunk
            if ( $finalAlignment[$i] == -1 ) {
                $chunkStartPointer = -1;
                $numChunks++;
            }
            elsif ( $finalAlignment[$i] != ( $finalAlignment[ $i - 1 ] + 1 ) ) {
                $chunkStartPointer = $i;
                $numChunks++;
            }
        }
        else {                               # there aren't any open chunks
            $chunkStartPointer = $i if ( $finalAlignment[$i] != -1 );
        }
    }

    $numChunks++ if ( $chunkStartPointer != -1 );
    $avgChunkLength = ($numChunks) ? ( $numMatches / $numChunks ) : 0;

    $$inputHashRef{"numChunks"} = $numChunks;
    $$inputHashRef{"avgChunkLength"} = sprintf "%0.5f", $avgChunkLength;

    if ($DETAILS) {

        # print the actual alignment
        $detailString .= "First string indices:\n";
        $iter = 0;
        foreach (@firstStringWords) { $detailString .= " $iter  [$_]\n"; $iter++; }

        $detailString .= "Second string indices, alignments and alignment stages:\n";
        $iter = 0;
        my $index = 0;

        for ( $i = 0 ; $i <= $#secondStringWords ; $i++ ) {
            $detailString .= " $i  [$secondStringWords[$i]] ";
	    $detailString .= "[$secondStringWords[$i]] ";
            $detailString .= "[$finalAlignment[$i] $tokenAlignmentStage[$i]" if ( $finalAlignment[$i] != -1 );
            $detailString .= "\n";
        }

        $detailString .= "Total computations: $numComputations\n";
    }

    $$inputHashRef{"detailString"} = $detailString if ($DETAILS);
    $$inputHashRef{"debugString"}  = $debugString  if ($DEBUG);
}

sub getBestMatch {

    # count the number of times this function is called
    $numComputations++;

    return if ( ( $maxComputations != -1 ) && ( $bestScoreSoFar != -1 ) && ( $numComputations > $maxComputations ) );

    my $multiChoiceIndex = shift;

    while ( $multiChoiceIndex <= $#string2MatchedPos ) {
        last if ( defined $multiChoiceWordIndexes{$multiChoiceIndex} );
        $multiChoiceIndex++;
    }

    if ( $multiChoiceIndex > $#string2MatchedPos )    # no more multi choice words
    {
        $totalMatches++;

        if ($DEBUG) {
            print "Current match: @currentMatch\n";
            print "Score: $scoreSoFar\n";
        }

        if ( ( $bestScoreSoFar == -1 ) || ( $scoreSoFar < $bestScoreSoFar ) ) {
            @bestMatchSoFar = @currentMatch;
            $bestScoreSoFar = $scoreSoFar;
        }
        return;
    }

    # get the multi choice word
    my $multiChoiceWord = $string2MatchedPos[$multiChoiceIndex];

    # get the starting position for this word
    my $startingPos = -1;
    my $i;
    for ( $i = $#{ $usedUpPos{$multiChoiceWord} } ; $i >= 0 ; $i-- ) {
        if ( ${ $usedUpPos{$multiChoiceWord} }[$i] == 0 ) {
            $startingPos = $i;
            next;
        }
        last if ( ( ${ $usedUpPos{$multiChoiceWord} }[$i] == 1 ) && ( ${ $posChoices{$multiChoiceWord} }[$i] != -1 ) );
    }

    return if ( $startingPos == -1 );    # couldn't find a starting pos!

    my $previousScoreSoFar = $scoreSoFar;

    for ( $i = $startingPos ; $i <= $#{ $posChoices{$multiChoiceWord} } ; $i++ ) {
        next if ( ${ $usedUpPos{$multiChoiceWord} }[$i] == 1 );

        if ($DEBUG) {
            print
"Recursion level: $multiChoiceIndex. Word: $multiChoiceWord. Position choice being tested: $i=>${$posChoices{$multiChoiceWord}}[$i]\n";
        }

        ${ $usedUpPos{$multiChoiceWord} }[$i] = 1;

        $currentMatch[$multiChoiceIndex] = ${ $posChoices{$multiChoiceWord} }[$i];

        $scoreSoFar = $previousScoreSoFar + getScore($multiChoiceIndex);

        getBestMatch( $multiChoiceIndex + 1 )
          if ( ( ( $bestScoreSoFar == -1 ) || ( $scoreSoFar < $bestScoreSoFar ) ) || $PRUNE_OFF );

        return
          if ( ( $maxComputations != -1 ) && ( $bestScoreSoFar != -1 ) && ( $numComputations > $maxComputations ) );

        ${ $usedUpPos{$multiChoiceWord} }[$i] = 0;

        # if current chosen pos is -1, no need to go on since the rest
        # of the position choices after this will also be -1's too.
        last if ( $currentMatch[$multiChoiceIndex] == -1 );
    }

    $scoreSoFar = $previousScoreSoFar;
}

sub computeBaseFixedScore {
    my $i;
    my $j;
    my $baseFixedScore = 0;

    for ( $i = 0 ; $i <= $#currentMatch ; $i++ ) {
        next if ( defined $multiChoiceWordIndexes{$i} );

        for ( $j = 0 ; $j < $i ; $j++ ) {
            next              if ( defined $multiChoiceWordIndexes{$j} );
            $baseFixedScore++ if ( $currentMatch[$j] > $currentMatch[$i] );
        }
    }
    return $baseFixedScore;
}

sub getScore {
    my $index = shift;

    return 0 if ( $currentMatch[$index] == -1 );

    my $fixedScoreKey = "$index $currentMatch[$index]";
    my $i;

    unless ( defined $fixedScoresCache{$fixedScoreKey} ) {

        # compute fixed score
        $fixedScoresCache{$fixedScoreKey} = 0;
        for ( $i = 0 ; $i < $index ; $i++ ) {
            next                                if ( defined $multiChoiceWordIndexes{$i} );
            $fixedScoresCache{$fixedScoreKey}++ if ( $currentMatch[$i] > $currentMatch[$index] );
        }
        for ( $i = $index + 1 ; $i <= $#currentMatch ; $i++ ) {
            next                                if ( defined $multiChoiceWordIndexes{$i} );
            $fixedScoresCache{$fixedScoreKey}++ if ( $currentMatch[$i] < $currentMatch[$index] );
        }
    }

    # compute the score for the multi choice words
    my $score = $fixedScoresCache{$fixedScoreKey};

    for ( $i = 0 ; $i < $index ; $i++ ) {
        next unless ( defined $multiChoiceWordIndexes{$i} );
        $score++ if ( $currentMatch[$i] > $currentMatch[$index] );
    }

    return ($score);
}

# subroutine to remove stop words from the given sentence
sub removeStopWords {
    my $strPtr = shift;

    my @inWords = split /\s+/, $$strPtr;
    my @outWords = ();

    foreach (@inWords) {
        push @outWords, $_ unless ( defined $stopHash{$_} );
    }

    $$strPtr = join " ", @outWords;
}

1;
