#!/usr/bin/perl -w

#Usage:
#	sequence_aligner.pl {seq1 fasta file} {seq2 fasta file} {parameter file}

# Description:
#	reads two DNA sequences of possibly different lengths,
#		then computes an optimal global alignment between them

use strict;


if(!exists($ARGV[2])) {
	print "Usage: sequence_aligner.pl {seq1 fasta file} {seq2 fasta file} {parameter file}\n";
	exit;
}

my $seq1 = "";
my $seq2 = "";

$seq1 = loadSequence($ARGV[0]);
$seq2 = loadSequence($ARGV[1]);

my $match_score ;
my $mismatch_score ;
my $gap_score ;

loadParameters($ARGV[2]);

align($seq1,$seq2);


exit;


# define a new function to load and return the first sequence in a fasta file
sub loadSequence {

	my $filename = shift;  

	my $seq_name = "";

	my $seq = "";
	open(FP,$filename);
	while(<FP>) {
		chomp;

		if(/^>/) {
			$seq_name = $_;
		}
		else {
			my $uc_seq = uc($_);
			$seq = $seq . "$uc_seq";
		}
	} # end of while loop
	close(FP);
	print "Sequence $seq_name	$seq\n";
	return $seq;
} #end sub loadSequence

# define a new function to load and return the first sequence in a fasta file
sub loadParameters {

	my $filename = shift;  

	open(FP,$filename);
	while(<FP>) {
		chomp;

		my @fields = split;

		if(/^match/) {
			$match_score = $fields[1];
			next;
		}
		if(/^mismatch/) {
			$mismatch_score = $fields[1];
			next;
		}
		if(/^gap/) {
			$gap_score = $fields[1];
			next;
		}

	} # end of while loop
	close(FP);
	print "Alignment parameters: match $match_score; mismatch $mismatch_score; gap $gap_score\n";
} #end sub loadParameters

# define a new function to optimally align two sequences 
# this code implements the Needleman-Wunsch algorithm

sub align {

	my $s1 = shift;
	my $s2 = shift;

	my $i;
	my $j;

	my $l1 = length ($s1);
	my $l2 = length ($s2);

	# first declare & initialize the 2D dynamic programming matrix
	#	size: (l1+1) x (l2+1)


	my @matrix = ();

	#in addition store a path matrix which we will use later for traceback
	#	Path entry at [i][j] will be one of the following options:
	#		"left" means a deletion case
	#		"up" means an insertion case
	#		"cross_match" means a substitution case with a match
	#		"cross_mismatch" means a substitution case with a mismatch
	#		"X" means the cell has no predecessor (true only for [0][0] cell)

	my @path = ();


	for($i=0;$i<=$l1;$i++) {
		for($j=0;$j<=$l2;$j++) {
			$matrix[$i][$j] = 0;
			$path[$i][$j] = "X";
		}
	}

	#initialize the first row and 1st column

	$matrix[0][0] = 0;
	$path[0][0] = "X";

	for($i=1;$i<=$l1;$i++) {
		$matrix[$i][0] = $matrix[$i-1][0] + $gap_score;
		$path[$i][0] = "up";

	}
	for($j=1;$j<=$l2;$j++) {
		$matrix[0][$j] = $matrix[0][$j-1] + $gap_score;
		$path[0][$j] = "left";
	}
	
	#forward computing of the dynamic programming matrix

	for($i=1; $i<=$l1; $i++) {
		print "Row $i:	";
		for($j=1; $j<=$l2; $j++) {
			
			#capture all three cases: substitution, insertion & deletion

			#Case: Substitution
			my $S_val = 0;

			my $base1 = substr($s1,$i-1,1);
			my $base2 = substr($s2,$j-1,1);
			if($base1 eq $base2) {
				$S_val = $matrix[$i-1][$j-1] + $match_score;
			}
			else {
				$S_val = $matrix[$i-1][$j-1] + $mismatch_score;
			}

			#Case: Insertion
			my $I_val = 0;
			$I_val = $matrix[$i][$j-1] + $gap_score;

			#Case: Deletion
			my $D_val = 0;
			$D_val = $matrix[$i-1][$j] + $gap_score;

			#Now, compare all three values and store the maximum


			my $max=$S_val;
			my $direction = "";
			if($base1 eq $base2) {
				$direction = "cross_match";
			}
			else {
				$direction = "cross_mismatch";
			}
		
			if($max<$I_val) {
				$max = $I_val;
				$direction = "left";
			}
			if($max<$D_val) {
				$max = $D_val;
				$direction = "up";
			}

			$matrix[$i][$j] = $max;
			$path[$i][$j] = $direction;

			print "	$max";

		}#end for j
		print "\n";
	}#end for i



	#Now let us traceback to display the optimal path

	my $path1 = "";
	my $path2 = "";
	my $type = "";

	#plan is to display the alignment like this:
	#	$path1:	A A C A T G - A G C A T 
	#	$type:	| | |   | |   | |     |
	#	$path2:	A A C C T G T A G G - T 

	my $opt_path_len=0;

	$i = $l1;
	$j = $l2;

	my $n_matches=0;
	my $n_mismatches=0;
	my $n_insertions=0;
	my $n_deletions=0;
	

	while($i!=0 || $j!=0) {

		my $base1 = substr($s1,$i-1,1);
		my $base2 = substr($s2,$j-1,1);

		#check who gave the maximum at cell [i][j]
		my $dir = $path[$i][$j];
		if($dir eq "cross_match") {
			$path1 = $path1 . "$base1";
			$path2 = $path2 . "$base2";
			$type = $type . "|";
			$n_matches++;
			#update row/column pointers for next iteration
			$i--;
			$j--;
		}
		if($dir eq "cross_mismatch") {
			$path1 = $path1 . "$base1";
			$path2 = $path2 . "$base2";
			$type = $type . " ";
			$n_mismatches++;
			#update row/column pointers for next iteration
			$i--;
			$j--;
		}
		if($dir eq "left") {
			$path1 = $path1 . "-";
			$path2 = $path2 . "$base2";
			$type = $type . " ";
			$n_insertions++;
			#update row/column pointers for next iteration
			$j--;
		}
		if($dir eq "up") {
			$path1 = $path1 . "$base1";
			$path2 = $path2 . "-";
			$type = $type . " ";
			$n_deletions++;
			#update row/column pointers for next iteration
			$i--;
		}
		$opt_path_len++;
	}#end while loop

	#Now print the optimal path (by reversing the path arrays)
	$path1 = reverse($path1);
	$path2 = reverse($path2);
	$type = reverse($type);

	print "String 1:	";
	for($i=0; $i<$opt_path_len;$i++) {
		print " " .  substr($path1,$i,1);
	}
	print "\n";

	print "aligntyp:	";
	for($i=0; $i<$opt_path_len;$i++) {
		print " " .  substr($type,$i,1);
	}
	print "\n";

	print "String 2:	";
	for($i=0; $i<$opt_path_len;$i++) {
		print " " .  substr($path2,$i,1);
	}
	print "\n";


	print "Optimal score = " . $matrix[$l1][$l2];
	print "  #matches = $n_matches";
	print "  #mismatches = $n_mismatches";
	print "  #insertions = $n_insertions";
	print "  #deletions= $n_deletions";
	print "\n";

	my $percent_identity = $n_matches*100/$opt_path_len;
	print "percent identity = $percent_identity %\n";

}#end sub align 


