#!/usr/bin/perl -w
#This is the script to rule them all (blastranpetit.pl)
#Discovering selenoproteins in Chlorocebus sabaeus
#Ricard Argelaguet, Estel Collado, Claudia Fontserè, Silvia Galan

#It requires a previous export!
#export PATH=/cursos/BI/bin/ncbiblast/bin:$PATH       
#cp /cursos/BI/bin/ncbiblast/.ncbirc ~/
#export PATH=/cursos/BI/soft/exonerate/i386/bin:$PATH 
#export PATH=/cursos/BI/bin:$PATH 
#export WISECONFIGDIR=/cursos/BI/soft/wise-2.2.0/wisecfg

use strict;
use List::Util qw(max);

#The number of arguments that the program gets must be equal to 1
my $num_args = $#ARGV + 1;
if ($num_args != 1) {
    print "\n Wrong usage: Blastranpetit.pl multifasta.fa \n";
    exit;
}
my $multifasta = $ARGV[0]; #Queries, multifasta file
my $genome_file="/cursos/BI/genomes/project_2014/Chlorocebus_sabaeus/genome.fa"; #Genome

open MULTIFASTA, $multifasta or die "\nFail at opening the multifasta file \n";

my $seq;
my %sequence;
my $header;

#Extraction of the headers and the sequences from the multifastafile 
while () {
    chomp($_);
    if ($_=~/^>(.*)/) 
    {
	if ($seq) 
	{ $sequence{$header}=$seq;}
	$header = $1;
	$seq = '';
    } 
    else 
    { $seq.=$_;}
}
$sequence{$header}=$seq;
close (MULTIFASTA); 

while ((my $query_name_string, my $query_seq_string) = each %sequence) 
{
    my $query_file = $query_name_string;
    $query_name_string = '>'.$query_name_string;

    #Substitute U for X in the query sequence
    open (QUERY_MODIFIED, ">", $query_file);
    print QUERY_MODIFIED "$query_name_string \n";
    
    for (my $i = 1; $i < length($query_seq_string); $i++)
    {
	if (substr($query_seq_string,$i,1) eq "U")
	{
	    print QUERY_MODIFIED "X";
	}
	else
	{
	    print QUERY_MODIFIED substr($query_seq_string,$i,1);
	}
    }
    close (QUERY_MODIFIED); 
     
     
    #Execute BLAST
    my $blast = "";
    print "Doing blast.. \n\n";
    $blast = "blastall -p tblastn -i '$query_file' -d $genome_file -o $query_file.blast -m9 -e 0.0001";
    system($blast);


    #Analyze the BLAST output
    my @scaffolds = ();
    my @blast_output_array = ();
    my @hit_output_array = ();
    my @hit_output_array_sorted = ();
    my @splited_blast = ();
    my @blast_line = ();
    my @hit_line = ();

    open BLAST_OUTPUT, "$query_file.blast";
    while ()
    { 
	if (substr($_,0,1) ne "#")
	{
	    @blast_line = split("\t", $_);
	     #   for (my $j = 0; $j < scalar(@blast_line); $j++) {
	      #    $blast_output_array[$index][$j] = $blast_line[$j];
	    push(@blast_output_array,[@blast_line]); 
	}
    }
    close(BLAST_OUTPUT);

    #Extract all the non-repeated scaffolds from the blast output 
    for(my $i = 0; $i <= $#blast_output_array; $i++)
    {
	push(@scaffolds,$blast_output_array[$i][1]);
    }
    my %hash = map { $_, 1 } @scaffolds; 
    my @unique = keys %hash;  #The unique array contain all the different scaffolds



    #Processing the BLAST output in order to create an input for fastasubseq
    for(my $i = 0; $i <= $#unique; $i++)
    {
	system("grep  '$unique[$i]' '$query_file.blast' > Hit_$i.txt");
    }

    my %hash_inici = ();
    my %hash_fi = (); #%hash_fi contains the final position + 50.000
    my %hash_initial_pos = (); #%hash_initial_pos contains the initial position - 50.000
    my %hash_length = (); #%hash_length contains the difference between the final position and the initial position
    my %hash_sentit = (); #%hash_sentit contains the value for positive sense chain (+1) and for negative sense chain (-1)

    for(my $i = 0; $i <= $#unique; $i++)
    {
	my $hit_file = "Hit_$i.txt";
	open HIT, $hit_file;
	while ()
	{ 
	    my @hit_line = split("\t", $_);
	    push(@hit_output_array,[@hit_line]);
	}
	for(my $j = 0; $j <= $#hit_output_array; $j++)
	{
	    #The variable $hash_inici contains the lowest value of q.start (the beginning of the protein)
	    @hit_output_array_sorted = sort { $a->[6] <=> $b->[6] } @hit_output_array;
	    $hash_inici{"$hit_output_array[$j][1]"} = $hit_output_array_sorted[0][8];
	    
	    #The variable $hash_fi contains the highest value of q.end (the end of the protein)
	    @hit_output_array_sorted = sort { $b->[7] <=> $a->[7] } @hit_output_array;
	    $hash_fi{"$hit_output_array[$j][1]"} = $hit_output_array_sorted[0][9];
	    
	    #The gene is in the reverse strand if s.start > s.end. Otherwise, it is in the forward strand
	    if ($hash_inici{"$hit_output_array[$j][1]"} >  $hash_fi{"$hit_output_array[$j][1]"})
	    {
		#If we are in the negative chain, the $hash_initial_pos is in q.end (-50.000) and the $hash_fi is in q.start (+50.000)
		$hash_sentit{"$hit_output_array[$j][1]"} = -1;
		$hash_initial_pos{"$hit_output_array[$j][1]"} = $hash_fi{"$hit_output_array[$j][1]"} - 100000;
		$hash_fi{"$hit_output_array[$j][1]"} = $hash_inici{"$hit_output_array[$j][1]"} + 100000;
		$hash_length{"$hit_output_array[$j][1]"} = $hash_fi{"$hit_output_array[$j][1]"} - $hash_initial_pos{"$hit_output_array[$j][1]"};
	    }
	    else
	    {
		#If we are in the positive chain, the $hash_initial_pos is in q.start (-50.000) and the $hash_fi is in q.end (+50.000)
		$hash_sentit{"$hit_output_array[$j][1]"} = 1;
		$hash_initial_pos{"$hit_output_array[$j][1]"} = $hash_inici{"$hit_output_array[$j][1]"} - 100000;
		$hash_fi{"$hit_output_array[$j][1]"} = $hash_fi{"$hit_output_array[$j][1]"} + 100000;
		$hash_length{"$hit_output_array[$j][1]"} = $hash_fi{"$hit_output_array[$j][1]"} - $hash_initial_pos{"$hit_output_array[$j][1]"}
	    }
	    undef @hit_output_array_sorted;
	    undef @hit_output_array;
	    
	}
	close(HIT);
    }

    #Indexing the genome
    unless (-e "genome.index")
    {
	print "Indexing the genome \n \n";
	system("fastaindex '$genome_file' genome.index");
    }
    print keys %hash_initial_pos;

    #Multistep process to perform the gene prediction: 
    foreach (keys %hash_initial_pos) 
    {
	print "$_ \n";
	print "$genome_file \n";
	print "Doing fastafetch \n \n";
	system("fastafetch '$genome_file' 'genome.index' '$_' > '$_.fastafetch'");
	print "Doing fastasubseq \n \n";
	system("fastasubseq '$_.fastafetch' '$hash_initial_pos{$_}' '$hash_length{$_}' > '$_.fastasubseq'");
	print "Doing exonerate \n \n";
	system("exonerate -m 'p2g' --showtargetgff -q '$query_file' -t '$_.fastasubseq' > '$_.gff'");
	print "Doing egrep \n \n";
	system("egrep -w exon '$_.gff' > '$_.exonerate.gff'");
	print "Doing fastaseqfromGFF.pl \n \n";
	system("fastaseqfromGFF.pl '$_.fastasubseq' '$_.exonerate.gff' > '$_.fastaseqfromgff'");
	print "Doing fastatranslate \n \n";
	system("fastatranslate -F 1 '$_.fastaseqfromgff' > '$_.translated.fa'"); 
	print "Doing genewise \n \n";
	if ($hash_sentit{$_} == 1) #Forward strand
	{
	    system("genewise -pep -pretty -cdna -gff '$query_file' '$_.fastasubseq' > '$_.genewise'");
	}
	elsif ($hash_sentit{$_} == -1) #Reverse strand
	{
	    system("genewise -pep -pretty -cdna -gff -trev '$query_file' '$_.fastasubseq' > '$_.genewise'");
	}
	print "Doing t_coffee \n \n";
	system("t_coffee '$_.translated.fa' '$query_file' > '$_.tcoffee'");
	system("mkdir '$query_file':'$_'");
	system("mv '$_'* '$query_file':'$_'");
	
    }
}