#!/usr/bin/perl

############################################################################
# script for starting knetfoldsingle.pl
# this scripts deals with problem of large number of gaps
# it runs first RNAfold to obtain on probability matrix
# then it finds a set of n representatives of the alignment
# for each representative, the alignment and prob-matrix is collapsed
# and a prediction compututed.
# the prediction of a collapsed matrix is expanded to the original size
# the average of the collapsed prediction matrices is the final prediction
# Eckart Bindewald 2006
# SAIC-Frederick
# NCI-Frederick
# Frederick, MD, 21702
############################################################################

use Cwd;
use Env;
use File::Basename;
use Getopt::Std;

use strict;

if (scalar(@ARGV)  < 1) {
    die "Usage: knetfold.pl -i fastafile [-n iterations][-d debug-mode][-m min_stem_length][-q queue-mode][-o outnamebase][-r seed][-f filtermode][-h knetfold_home]\n";
}

my %opts;
getopt('ifbhqmnsvodrty', \%opts);    # -o, -D & -I take arg.  Values in %opts

########## INTERPRET ENVIRONMENT VARIABLES ########

my $KNETFOLD_HOME = $opts{"h"};
if (length($KNETFOLD_HOME) == 0) {
    $KNETFOLD_HOME = $ENV{"KNETFOLD_HOME"};
}

if (substr($KNETFOLD_HOME, 0,1) ne "/") {
    # workaround for relative file names: convert to absolute file names
    $KNETFOLD_HOME = `cd $KNETFOLD_HOME; pwd`;
    chomp($KNETFOLD_HOME);
}

if (length($KNETFOLD_HOME) == 0) {
    die "Environment variable KNETFOLD_HOME or option -h has to be specified!\n";
}
my $KNETFOLD_TMP = $ENV{"KNETFOLD_TMP"};
if (length($KNETFOLD_TMP) == 0) {
    $KNETFOLD_TMP = "/tmp"; # default tmp directory
}

print "KNETFOLD_HOME: $KNETFOLD_HOME KNETFOLD_TMP: $KNETFOLD_TMP\n";

my $sleepintervall = 30;

if (&checkExistence("RNAfold") == 0) {
    die "The RNAfold program from the Vienna package could not be found. Please install and add location of binary to you path variable\n";
}

######## INTERPRET COMMAND LINE ################

my $debugMode = $opts{"d"};
if (length($debugMode) == 0) {
    $debugMode = 0; # if set to 1, do not delete scratch directory
}

my $filterMode = $opts{"f"};
if (length($filterMode) == 0) {
    $filterMode = 1; # by default use winner takes all filter
}

my $fastafile = $opts{"i"};
print "Alignment file: $fastafile\n";

if (! -e $fastafile) {
    die "Could not find fasta file! Specify with option -a filename\n";
}

my $outnameBase = $opts{"o"};

if (length($outnameBase) < 1) {
    $outnameBase = basename($fastafile, ".fasta");
    $outnameBase = $outnameBase . "_knet";
}

print "Prefix for output files: $outnameBase\n";

my $numiter = $opts{"n"};
if ($debugMode) {
    print "Number iterations: $numiter\n";
}

my $qsubMode = $opts{"q"};
if (length($qsubMode) == 0) {
    $qsubMode = 0;
}

my $queue = $opts{"b"};

print "Qsub mode: $qsubMode\n";

# resetting random number seed:
my $rseed = $opts{"r"};
if (length($rseed) > 0) {
  srand($rseed); #   
}

my $secondary =  $opts{"s"};

my $stemMin = $opts{"m"};
if (length($stemMin) == 0) {
    $stemMin = 2; # minimum stem length
}

my $tarMode = $opts{"t"};
if (length($tarMode) == 0) {
    $tarMode = 0; # if set to 1, generate tar file
}

# used for potential speedup: do not call knetfoldsingle if the RNAfold non-linear average matrix is bad
my $rnafoldsum =  $opts{"y"};
if (length($rnafoldsum) == 0) {
    $rnafoldsum = 0.0;
}

# avoid key word. Only works for RFAM entries like "RF00233" etc
my $avoid =  $opts{"v"};

my $qsubscratchdir = "$KNETFOLD_TMP/knnlaunchcollagepar_scratch.$$";

`mkdir -p $qsubscratchdir`;

my $qsubscratchbase = "$qsubscratchdir/knnlaunchcollagepar.$$.tmp";

###### IMPORTANT CONSTANTS ##################
my $BIN = "$KNETFOLD_HOME/bin";
my $EDITMATRIX = "$BIN/editmatrix";
my $FASTA2SIZE = "$BIN/fasta2sizes.pl";
my $COMPASS = "$BIN/compass -p $BIN/compass.prm"; #
my $RESCALE = "$BIN/rescalescores -p $KNETFOLD_HOME/prm/likelihoodratios_v1.hist3d --verbose 0";
my $STEMCONVERT = "$BIN/stemconvert";

my $fastabase = basename($fastafile, ".fasta");

my $fastaoutbase = "$fastabase.test";
my $exceptionFileName = "exception.msg";
my $scratch = $qsubscratchdir; # "$fastabase.collage.scratch";
my $probmatrixorig = "$scratch/$fastaoutbase" . ".prob.matrix";
my $probmatrixorigdir = "$scratch/$fastabase.dir";

################## MAIN PROGRAM ###############

`mkdir -p $scratch`;
`mkdir -p $probmatrixorigdir`;

my $pass1filebase = "$fastabase.pass1";
my $pass1file = "$scratch/$pass1filebase";

my $cpcom = "cp $fastafile $pass1file";

if ($debugMode) {
    print "$cpcom\n";
}
`$cpcom`;

my $nrep = 10; # maximum number of collapsing steps

if (length($numiter) > 0) {
    $nrep = $numiter;
}

# read alignment:
my @aliSequences = &readFasta($fastafile);
my $numAliSeq = scalar(@aliSequences);
my $aliLen = length($aliSequences[0]);
print "Alignment with $numAliSeq sequences with length $aliLen read!\n";

# find out number of sequences:
my $nseqcom = "$FASTA2SIZE < $fastafile";
my $nseq = `$nseqcom`;
chomp($nseq);
if ($nseq != scalar(@aliSequences)) {
    die "Internal error in line 213!\n";
}

if ($nrep > $nseq) {
    $nrep = $nseq;
}

my $thresh = 0.05; # TODO : make threshold variable

# compute RNAfold matrix once:

if (! -e $probmatrixorig) {
    my $probcom = "$BIN/fasta2rnaprob.pl $fastafile $probmatrixorig $probmatrixorigdir";
    if ($debugMode) {
	print "Computing RNAfold probability matrix: \n$probcom\n";
    }
    `$probcom`;
}
if (! -e $probmatrixorig) {
    die "Could not generate probability file: $probmatrixorig\n";
}
if ($debugMode) {
    print "Finished computing probability matrix!\n";
}

# expand alignment with respect to randomly chosen representatives:
my $matrixavgcomm = "$BIN/addmatrices --average 1 -i";

# print "Exiting before calling qsub\n";
# exit(0);

my @sofarchosen;

my @sofargaps;
my @qsubids;

for (my $nrandseq = 0; $nrandseq < $nrep; $nrandseq++) {
    if ($debugMode) {
	print "Loop iteration: $nrandseq\n";
    }
    # choose random sequence or iterative sequence. Plus one because collapse option starts counting from one.
    # TODO: not very elegant algorithm, much better would be to use C++ style random_shuffle
    my $ns;
    if ($nseq > $nrep) {
	my $found = 0;
	do {
	    $ns = int(rand($nseq)) + 1;
	    # check if found in sofarchosen:
	    $found = 0;
	    for (my $k = 0; $k < scalar(@sofarchosen); $k++) {
		if ($sofarchosen[$k] == $ns) {
		    $found = 1;
		    last;
		}
	    }
	}
	while ($found == 1); #  get new random number if sequence was already chosen
	push(@sofarchosen, $ns);
    }
    else {
	$ns = ($nrandseq + 1);
    }
    my $currSeq = $aliSequences[$ns];
    if ($debugMode) {
	print "Chosen sequence: $ns $currSeq\n";
    }
    my $gapPattern = &getGapPattern($currSeq);
    my $gapFoundId = -1;
    if (scalar(@sofargaps) > 0) {
	$gapFoundId = &findGapId($gapPattern, @sofargaps);
    }
    if ($gapFoundId < 0) {
	if ($debugMode) {
	    print "Gap pattern not found: $gapPattern\n";
	}
	push(@sofargaps, $gapPattern);
    }
    else {
	push(@sofargaps, "dummy"); # add dummy value
	# gap pattern already found!
	# skip submitting of jobs, only add to matrixavgcomm:
	if ($debugMode) {
	    print "Skipping because alignment pattern already found: $nrandseq $gapFoundId $sofargaps[$gapFoundId]\n";
	}
	my $alioutfilebase = "$pass1filebase.coll.$gapFoundId";
	my $matrixoutfile = "$scratch/$alioutfilebase.test.expanded.matrix";
	$matrixavgcomm = $matrixavgcomm . " $matrixoutfile";
	next;
    }
    if ($debugMode) {
	print "Collapsing with respect to sequence $ns\n";
    }
    my $alioutfilebase = "$pass1filebase.coll.$nrandseq";
    my $alioutfilelocal = "$alioutfilebase.fasta";
    my $alioutfile = "$scratch/$alioutfilelocal";
    my $probmatrixlocal = "$alioutfilebase.prob.matrix";
    my $probmatrix = "$scratch/$probmatrixlocal";
    my $stemoutfile = "$scratch/$alioutfilebase.reg";
    my $matrixtmpfile = "$scratch/$alioutfilebase.test.matrix";
    my $matrixoutfile = "$scratch/$alioutfilebase.test.expanded.matrix";

    if (-e $matrixoutfile) {
	if ($debugMode) {
	    print "$matrixoutfile exists, skipping to next iteration!\n";
	}
	if (! -z $matrixoutfile) {
	    $matrixavgcomm = $matrixavgcomm . " $matrixoutfile";
	}
	else {
	    print "Warning: $matrixavgcomm has zero size!\n";
	}
	next;
    }
    my $qsubscratchfilename = "/dev/null"; # do not really use qsub file
    if ($qsubMode == 1) {
	$qsubscratchfilename = "$qsubscratchbase" . "$nrandseq.qsub";
    }

    open (SCRATCH, ">$qsubscratchfilename") or die "Could not open scratch qsub file: $qsubscratchfilename";

    # collapse secondary structure, alignment and probability matrix from RNAfold superposition:
    if (! -e $pass1file) {
	die "Could not find $pass1file!\n";
    }
    my $comcoll = "$COMPASS -i $pass1file --collapse $ns --ali-out $alioutfile --prob-matrix $probmatrixorig --prob-matrix-format 3 --prob-matrix-out $probmatrix --overwrite 1";
    if (length($secondary) > 0) {
	$comcoll = $comcoll . " --stem-file $secondary --stem-format 2 --stem-outfile $stemoutfile --stem-outfile-format 1";
    }
    if ($debugMode) {
	print "$comcoll\n";
    }
    if ($qsubMode == 0) {
	`$comcoll`;
	if (! -e $alioutfile) {
	    die "Error: did not create $alioutfile\n"; 
	}
    }
    print SCRATCH "$comcoll\n";
    
# collapse with respect to this sequence:
    
    my $com1 = "cd $scratch; $BIN/knetfoldsingle.pl -i $alioutfilelocal -p $probmatrixlocal -h $KNETFOLD_HOME";
    
    if (length($secondary) > 0) {
	$com1 = "$com1" . " -s $stemoutfile";
    }
    if (length($avoid) > 0) {
	$com1 = "$com1" . " -v  $avoid";
    }
    if ($debugMode) {
	print "$com1\n";
    }
    my $result;
    if ($qsubMode == 0) {
	$result = `$com1`;
	chomp($result);
    }

    print SCRATCH "$com1\n";
    
    if ($debugMode) {
	print "Results of command: $result\n";
    }

    # expand matrix:
    my $matrixconvertcom = "$STEMCONVERT -i $matrixtmpfile --if 3 --of 3 --expand $ns -a $pass1file > $matrixoutfile";
    if ($debugMode) {
	print "$matrixconvertcom\n";
    }
    if ($qsubMode == 0) {
	`$matrixconvertcom`;
    }
    
    print SCRATCH "$matrixconvertcom\n";

    $matrixavgcomm = $matrixavgcomm . " $matrixoutfile";

    close(SCRATCH);

    # central qsub command: run script
    if ($qsubMode == 1) {
	my $qsubcom;
	if (length($queue) > 0) {
	    $qsubcom = "qsub -q $queue $qsubscratchfilename";
	}
	else {
	    $qsubcom = "qsub $qsubscratchfilename";
	}
	if ($debugMode) {
	    print "$qsubcom\n";
	}
	my $qsubid = `$qsubcom`;
	chomp($qsubid);
# add to list of qsub ids:
	
	push(@qsubids, $qsubid);

    }
}

if ($qsubMode == 1) {
    print "Known qsub ids:\n";
    for (my $i = 0; $i < scalar(@qsubids); $i++) {
	print "$qsubids[$i]\n";
    }
    print "Waiting until all processes are finished...\n";
# wait until all processes are finished:
    &wait_until_finished(@qsubids);
    print "All processes finished! Combining results!\n";
}

# combine results
my $matrixRnafoldfile = "$outnameBase.mx0"; # constains NL-RNAfold scores
my $matrixavgfile = "$outnameBase.mx1"; # constains raw scores
my $matrixavgfile2 = "$outnameBase.mx2"; # after projecting scores to "best" structure
my $matrixavgfile3 = "$outnameBase.mx3"; # contains probabilities
my $ctoutfile = "$outnameBase.ct";
my $collapsefile = "$outnameBase.coll";
my $secoutfile = "$outnameBase.sec";
my $pdfoutfile = "$outnameBase.pdf";
my $jpgoutfile = "$outnameBase.jpg";
my $psoutfile = "$outnameBase.ps";
my $fastaoutfile = "$outnameBase.fasta";
if (length($outnameBase) == 0) {
    die "Internal error: variable outnameBase undefined!\n";
    $matrixavgfile = "$fastabase.test.avg.matrix";
    $matrixavgfile2 = "$fastabase.test.avg.winner.matrix";
    $matrixavgfile3 = "$fastabase.test.avg.probabilities.matrix";
    $ctoutfile = "$fastabase.test.avg.winner.ct";
    $secoutfile = "$fastabase.test.avg.winner.sec";
    $collapsefile = "$fastabase.test.avg.winner.coll";
    $pdfoutfile = "$fastabase.test.avg.winner.pdf";
    $jpgoutfile = "$fastabase.test.avg.winner.jpg";
    $psoutfile = "$fastabase.test.avg.winner.ps";
}
else {
    `cp $fastafile $fastaoutfile`;
}

$matrixavgcomm = $matrixavgcomm . " > $matrixavgfile";

if ($debugMode) {
    print "$matrixavgcomm\n";
}
`$matrixavgcomm`;


my $matrixavgcomm2;

# rescale to matrix with confidence values:
my $matrixavgcomm3 = "$RESCALE -i $fastafile -m $matrixavgfile > $matrixavgfile3";
if ($debugMode) {
    print "$matrixavgcomm3\n";
}
`$matrixavgcomm3`;

if ($filterMode == 1) {
# use winner-takes-all filter:
    if ($debugMode) {
	print "Applying winner-takes all filter to score matrix.\n";
    }
    $matrixavgcomm2 = "$EDITMATRIX -i $matrixavgfile --winner 1 --if 3 --of 3 --verbose 0 > $matrixavgfile2";
}
else {
    die "Unknown filter mode: $filterMode\n";
}

if ($debugMode) {
    print "$matrixavgcomm2\n";
}
`$matrixavgcomm2`;

# get dimensions of final matrix:
my $matLengthComm = "head -n1 $matrixavgfile | wc | tr \" \" -s | cut -f3 -d\" \"";
if ($debugMode) {
    print "$matLengthComm\n";
}
my $matLength = `$matLengthComm`;
chomp($matLength);
if ($debugMode) {
    print "Detected matrix dimension: $matLength\n";
}
my $finalstdout = "$fastaoutbase.stdout";

my $finalcomm = "$COMPASS -i $fastafile --matrix $matrixavgfile2 --matrix-format 3  -o $fastaoutbase --overwrite 1";
if (length($secondary) > 0) {
    $finalcomm = $finalcomm . " --stem-file $secondary --stem-format 2";
}
$finalcomm = $finalcomm . " > $finalstdout";
if ($debugMode) {
    print "$finalcomm\n";
}
`$finalcomm`;

my $finalcomm2 = "$STEMCONVERT -a $fastafile -i $matrixavgfile2 -l $thresh --if 3 --of 4 --stem-min $stemMin > $ctoutfile";
if ($debugMode) {
    print "$finalcomm2\n";
}
`$finalcomm2`;
my $finalcomm3 = "$STEMCONVERT -a $fastafile -i $matrixavgfile2 -l $thresh --if 3 --of 2 --stem-min $stemMin > $secoutfile";
if ($debugMode) {
    print "$finalcomm3\n";
}
`$finalcomm3`;
my $finalcomm4 = "$STEMCONVERT -a $fastafile -i $matrixavgfile2 -l $thresh --if 3 --of 10 --stem-min $stemMin > $collapsefile";
if ($debugMode) {
    print "$finalcomm4\n";
}
`$finalcomm4`;

# copy NL-RNAfold file to standard location:
`cp $probmatrixorig $matrixRnafoldfile`;

# generate pdf file if statistics program "R" exists:
my $rExist = &checkExistence("R");
if ($rExist == 1) {
    my $pdfcomm1 = "cp $matrixavgfile3 mx3.dat"; # version BEFORE winner-takes-all filter but after prob rescaling
    if ($debugMode) {
	print "$pdfcomm1\n";
    }
    `$pdfcomm1`; 
# pass length as command line parameter:
    my $pdfcomm2 = "R --silent --no-save --no-restore --vanilla --args $matLength < $BIN/matrix2pdf.R";
    if ($debugMode) {
	print "$pdfcomm2\n";
    }
    `$pdfcomm2`;
    my $pdfcomm3 = "cp mx3.pdf $pdfoutfile\n";
    if ($debugMode) {
	print "$pdfcomm3\n";
    }
    `$pdfcomm3`;
}

# removing scratch directory:
if ($debugMode == 0) {
    if (length($qsubscratchdir) > 4) { # just to be save not to have a dangerous name like ". or "/" "
	my $rmcom = "rm -rf $qsubscratchdir";
	if ($debugMode) {
	    print "$rmcom\n";
	}
	`$rmcom`;
    }
    # delete redundant output from compass program and R script:
    `rm $fastabase.test.predict.reg`;
    `rm $fastabase.test.predictlow.reg`;
    `rm $fastabase.test.predictmediumhigh.reg`;
    `rm $fastabase.test.predicthigh.reg`;
    `rm $fastabase.test.matrix`;
    `rm $fastabase.test.stdout`;
    `rm $fastabase.test.individual.reg`;
    if (-e "mx3.dat") {
	`rm mx3.dat`;
    }
    if (-e "mx3.pdf") {
	`rm mx3.pdf`;
    }
}

# generate tar file for whole directory
if ($tarMode) {
    my $cwd = getcwd;
    my $tardirname = basename($cwd);
    my $tarfilename = "$tardirname.tgz";
    my $tarcom = "tar cvfz $tarfilename .";
    `$tarcom`;
}

print "KNetFold prediction finished.\n";

################## SUBROUTINES ##################

# returns 1 if given program name exists in path, zero otherwise
sub checkExistence
{
    my $name = $_[0];
    if (length($name) < 1) {
	return 0;
    }
    my $rExist = `which $name`;
    chomp($rExist);
    $rExist =~ tr/ //s; # squeeze repeats of space character
    my @rWords = split/ /, $rExist;
    if (scalar(@rWords) == 1) {
	return 1;
    }
    return 0;
}

# returns 1 if called with line beginning with ">", 0 otherwise
sub isFastaHeader
{
    my $seq = @_[0];
    if (substr($seq, 0, 1) eq ">") {
	return 1;
    }
    return 0;
}

# readFasta($fileName) : returns array with sequences (names are ignored)
sub readFasta
{
    my $fileName = @_[0];
    open(INPUT, "$fileName") or die "Error opening file: $fileName";
    my $tmp;
    my @sequences;
    while ($tmp = <INPUT>) {
	chomp($tmp);
	if (&isFastaHeader($tmp)) {
	    last;
	}
    }
    my $count = 0;
    while (1)
    {
	my $currSeq = "";
	while ($tmp = <INPUT>) {
	    chomp($tmp);
	    $tmp =~ s/\s+//g; # tr/\t //d; # deletes space and tabe
	    if (!&isFastaHeader($tmp)) {
		$currSeq = $currSeq . $tmp;
	    }
	    else {
		last;
	    }
	};
# upper case:
	$currSeq = uc($currSeq);
	$currSeq =~ tr/T/U/; # translate T to U residues
	push(@sequences, $currSeq);
	$count++;
	if (!&isFastaHeader($tmp)) {
	    last; # end of file
	}
    }
    close(INPUT);
    return @sequences;
}

# waits until all processes are finished
sub wait_until_finished
{
    my $unknown = "Unknown"; # look for this word as result of qstat command
    my @qsubids = @_;
    my $all_finished = 1;
    my $result;
    do {
	$all_finished = 1;
	for (my $i = 0; $i < scalar(@qsubids); $i++) {
	    $result = `qstat $qsubids[$i]`;
	    chomp($result);
	    $_ = $result;
	    # if process exists: wait and quit for loop:
	    if (length($result) > 0) {
		# print "Process $qsubids[$i] exists, keep waiting...\n";
		sleep ($sleepintervall);
		$all_finished = 0;
		last; # quit for loop
	    }
	}
    }
    while ($all_finished == 0);
}

# returns if given character is gap character */
sub isGap
{
    my $char = substr($_[0], 0, 1);
    if (($char eq "_") || ($char eq ".")) {
	return 1;
    }
    return 0;
}

sub gapConvert
{
    my $char = $_[0];
    if (&isGap($char) != 0) {
	return "-";
    }
    return "X"; # non-gap character
}

# converts nongap characters to "X", gap characters to "-"
sub getGapPattern
{
    my $seq = $_[0];
    my $result = "";
    for (my $i = 0; $i < length($seq); $i++) {
	my $c = substr($seq, $i, 1);
	if (($c eq "-") || ($c eq ".")) {
	    $result = $result . "-";
	}
	else {
	    $result = $result . "X";
	}
    }
    # print "Gap pattern of $seq is: $result\n";
    return $result;
}

# checks which gap pattern has been seen before, returns corresponding id
sub findGapId
{
    my $gapSeq = $_[0];
    my @gapSoFar = @_;
    my $num = scalar(@gapSoFar);
    for (my $i = 1; $i < scalar(@gapSoFar); $i++) {
	if ($gapSeq eq $gapSoFar[$i]) {
	    return ($i-1);
	}
    }
    return -1;
}

################## END OF SUBROUTINES ##################
