view annotate.pl @ 2:d1197d8ee0bb draft default tip

Uploaded
author elixir-it
date Tue, 16 Feb 2021 09:05:48 +0000
parents
children
line wrap: on
line source

$fss=13468;

unless (-e "GCF_009858895.2_ASM985889v3_genomic.fna")
{
	
	system("wget -i https://raw.githubusercontent.com/matteo14c/CorGAT_galaxy/master/ann.txt");
	system("gzip -d GCF_009858895.2_ASM985889v3_genomic.fna.gz");
}

$gen_code="genetic_code";
die("need genetic code file in the current folder\n") unless -e "genetic_code";
open(IN,$gen_code);
while(<IN>)
{
	($triplet,$oneL)=(split());
	$code{$triplet}=$oneL;
}

$genome="GCF_009858895.2_ASM985889v3_genomic.fna ";
die("need reference genome file in the current folder\n") unless -e "GCF_009858895.2_ASM985889v3_genomic.fna";
open(IN,$genome);
while(<IN>)
{
	next if $_=~/^>/;
	chomp;
	$seq.=$_;
}
$table="annot_table.pl";#"simple_annot_mirror";#"annot_table.pl";#sl5     29728   29768   reg
die("need detailed annotation file for SARS-CoV-2 in the current folder") unless -e "annot_table.pl";
open(IN,$table);


while(<IN>)
{
	chomp();
	($gene,$b1,$b2,$e,$annot,$notes)=(split(/\t/));
	$annot{$b1}{$b2}=[$gene,$e,$annot,$notes];
	#print "$gene $b1 $b2 $e $annot $notes\n";
	$len=$b2-$b1+1;
	if ($gene ne "nsp12" && $gene ne "orf1ab")
	{
		$seqgene=substr($seq,$b1-1,$len+3); #un codone inizio in più e un codone fine in più
	}else{
		$len1=$fss-$b1+1;
                $part1=substr($seq,$b1-1,$len1);
                $len2=$b2-$fss+1;
                $part2=substr($seq,$fss-1,$len2+3);
                $seqgene="$part1$part2";

	}
	$annot_seq{$gene}=$seqgene;
	$Lgenes{$gene}=length($seqgene);
	($NSTOP_G,$Tseq,$pos_Stop_R)=translate($seqgene,\%code);
	@seq_res=split('',$Tseq);
	for ($i=0;$i<=$#seq_res;$i++)
	{
		$pos=$i+1;
		$res=$seq_res[$i];
	}
}
%AF_data=%{read_simple_table("AF_current.csv")};
%MFE_data=%{read_simple_table("MFE_annot.csv")};
%epi_data=%{read_epitopes("epitopes_annot.csv")};
%hyphy_data=%{read_hyphy("hyphy_current.csv")};



$var_File=shift;#""cl7.csv";#"phenetic_indels_sars_cov2.csv";
$out_File=shift;
open(OUT,">$out_File");
die("no input\n") unless -e $var_File;
open(IN,$var_File);
$header=<IN>;
@header=(split(/\s+/,$header));
print OUT "POS\tREF\tALT\tannot\tAF\tEpitopes\tHyphy\tMFE\n";
while(<IN>)
{
	($change,@pos)=(split());
	next unless $change=~/\|/;
	($pos,$allele)=(split(/_/,$change))[0,1];
	($ref,$alt)=(split(/\|/,$allele))[0,1];
	$AF=$AF_data{"$pos$ref$alt"} ? $AF_data{"$pos$ref$alt"} : 0;
	next if $alt=~/N/;
	$annot_string="";
	$contained=0;
	$epitope_string="";
	$hyphy_string="";
	$MFE_string= $MFE_data{"$pos$ref$alt"} ? $MFE_data{"$pos$ref$alt"} : "NA";
	#next if $ref eq"." || $alt eq ".";
	foreach $b1 (sort{$a<=>$b} keys %annot)
	{
		foreach $b2 (sort{$a<=>$b} keys %{$annot{$b1}})
		{
			if ($pos<=$b2 && $pos>=$b1)
			{
				$contained=1;
				$type=$annot{$b1}{$b2}[1];
				$namegene=$annot{$b1}{$b2}[0];
				if ($type eq "cds")
				{
					#print "cds";
					@res=annot_CDS($pos,$ref,$alt,$namegene);
					$annot_string.=$res[0];
					if ($namegene ne "orf1ab")
					{
						$hyphy_string.=$res[1];
						$epitope_string.=$res[2];
					}
				}else{
					$rel_pos=$pos-$b1+1;
					$annot_string.="$namegene:nc.$ref$rel_pos$alt,NA,NA;";
					$epitope_string="NA" if $epitope_string eq "";
                                	$hyphy_string="NA" if $hyphy_string eq "";
				}
			}
		}
	}
	$epitope_string=~s/\s+/;/g;
	#$epitope_string="EpiT:$epitope_string" unless $epitope_string eq "NA";
	print OUT "$pos\t$ref\t$alt\t$annot_string\t$AF\t$epitope_string\t$hyphy_string\t$MFE_string\n" #if $contained==1;
}

sub translate
{
	@orig_seq=split('',$_[0]);
	%gen_code=%{$_[1]};
	$type="";
	$NSTOP=0;
	$Tseq="";
	$pos_Stop=0;
	for ($i=0;$i<=$#orig_seq;$i+=3)
	{
		$AA=join('',@orig_seq[$i..$i+2]);
		$res=$gen_code{$AA};
		$pos_Stop=$i if $pos_Stop ==0 && $res eq "*";
		$NSTOP++ if $res eq "*";
		$Tseq.=$res;
	}
	#print "$seq\n";
	return($NSTOP,$Tseq,$pos_Stop);
}

sub annot_CDS
{
	$pos=$_[0]; 
	$ref=$_[1]; 
	$alt=$_[2];
	$namegene=$_[3];
	my $hyphy_string="NA";
	my $epitopes_string="NA";
	#print "$pos $ref $alt $namegene\n";
	$pos_inG=$pos-$b1+1;
        $pos_inG++ if $pos >$fss && ($annot{$b1}{$b2}[0] eq "nsp12" || $annot{$b1}{$b2}[0] eq "orf1ab");
	$mod=$pos_inG%3;
	$rel_pos=int($pos_inG/3);
	$rel_pos++ if $mod !=0;
	
	if (length($ref)==1 && $ref ne "."&& $alt ne ".")
	{
	
		if ($mod ==1)
		{
			$triplet=substr($seq,$pos-1,3);
			@Bs=split('',$triplet);
			die("1\n $triplet b:$Bs[0] r:$ref") unless ($Bs[0] eq $ref);
			$Bs[0]=$alt;
		}elsif ($mod==2){
			$triplet=substr($seq,$pos-2,3);
			@Bs=split('',$triplet);
			die("2\n $triplet b:$Bs[1] r:$ref")unless ($Bs[1] eq $ref);
			$Bs[1]=$alt;
		}elsif ($mod==0){
			$triplet=substr($seq,$pos-3,3);
			@Bs=split('',$triplet);
			die("$_ 3\n $triplet b:$Bs[2] r:$ref")unless ($Bs[2] eq $ref);
			$Bs[2]=$alt;
		}
		#print "$pos_inG $relpos $mod @Bs\n";
		$Atriplet=join("",@Bs);
		$A1=$code{$triplet};
		$A2=$code{$Atriplet};
		$eff=$A1 eq $A2 ? "synonymous" : "missense";
		$eff="start_lost" if $eff eq "missense" && $rel_pos ==1;
		$eff="stopGain" if $A2 eq "*" && $A1 ne "*"; #stopG
		$eff="stopLoss" if $A1 eq "*" && $A2 ne "*"; #stopL
		$eff="S" if $A2 eq "*" && $A1 eq "*";
		$hyphy_string=$hyphy_data{"$namegene$rel_pos"} if $hyphy_data{"$namegene$rel_pos"};
		$epitopes_string=join(" ",@{$epi_data{"$namegene$rel_pos$A1"}}) if $epi_data{"$namegene$rel_pos$A1"};
		#return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$mod,$eff;",$hyphy_string,$epitopes_string);
		return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$eff;",$hyphy_string,$epitopes_string);
	}else{
		$eff="";
		$gene=$annot_seq{$namegene};
		$lgene=length($gene);
		$upstream=substr($gene,0,$pos_inG-1);
		$change=substr($gene,$pos_inG,length($ref));
		$downstream=substr($gene,$pos_inG+length($ref)-1);
		$len=length($alt);
		unless ($ref=~/\./)
		{
			$modSeq="$upstream$alt$downstream";
		}else{
			$modSeq="$upstream$alt$change$downstream";
		}
		$modSeq=~s/\.//g;

		($NSTOP_G,$Tseq_R,$pos_Stop_R)=translate($gene,\%code);
		($NSTOP_R,$Tseq_alt,$pos_Stop_T)=translate($modSeq,\%code);
		$CDS_annot_string=".";
		#print "$namegene,aa: sr:$NSTOP_G sa:$NSTOP_R psr:$pos_Stop_R psa:$pos_Stop_T\n";
		if ($NSTOP_R>$NSTOP_G && ($ref=~/\./ || $alt=~/\./))
		{
			
			$eff="frameshift";
			$eff.="Ins" if $ref=~/\./;
			$eff.="Del" if $alt=~/\./;
			$truncation=$pos_Stop_T/3;
			$ref_Seq=substr($Tseq_R,$truncation-1,1);
			return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string);
			#return("$namegene:$pos_inG,$rel_pos,$mod,Frameshift,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string");
		}elsif($NSTOP_R==$NSTOP_G && $pos_Stop_T<($pos_Stop_R-$len) ){
		#	print "2\n";
			$truncation=$pos_Stop_T/3;
			$ref_Seq=substr($Tseq_R,$truncation-1,1);	
			$eff="Truncating";
			$eff.="Ins" if $ref=~/\./;
			$eff.="Del" if $alt=~/\./;
			return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string);
			#return("$namegene:$pos_inG,$rel_pos,$mod,Trunc:$truncation,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string=");
		}else{
			%used_epitopes=();
			$hyphy_string="";
			$epitopes_string="";
			$pre="";
			$Cref=$ref;
                        $Calt=$alt;
			if ($mod==0)
			{
				$pre=substr($gene,$pos_inG-3,2);
			}elsif($mod==2){
				$pre=substr($gene,$pos_inG-2,1);
			}
			$Cref="$pre$Cref";
			$Calt="$pre$Calt";
			$post=$pos_inG+length($ref)-1;
			while (length($Cref) %3!=0)
			{
				$Cref.=substr($gene,$post,1);
				$Calt.=substr($gene,$post,1);
				$post++;
				if ($post>length($gene))
				{
					$local_gene=substr($gene,0,length($gene)-3);
					$eff="TruncatingDel";
					#print "$post\n";
					$copy_pos=$pos_inG;
					#print "$pos_inG\n";
					#print length($gene)."\n";
					#die("muoro");
					$Loc_Ref=substr($local_gene,$copy_pos);
					$Talt="-";
					while(length($Loc_Ref)%3!=0)
					{
						$copy_pos--;
						$Loc_Ref=substr($local_gene,$copy_pos);
					}
					$rel_pos=int($copy_pos/3);
        				$rel_pos++ if $mod !=0;
					$Tref=(translate($Loc_Ref,\%code))[1];
					return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string);
					#return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string)
				}
			}
			if ($alt=~/\./ || $ref=~/\./)
			{
				$Tref="-";
				$Talt="-";
				$eff="inframe";
				if ($ref=~/\./)
				{
					$eff.="Ins";
					$Talt=(translate($Calt,\%code))[1];
					if ($Cref=~/[ACTG]{1,}/)
                                        {
                                                $Cref=~s/\.//g;
                                                $Tref=(translate($Cref,\%code))[1];
                                        }

				}
                                if ($alt=~/\./)
				{
					$eff.="Del";
					$Tref=(translate($Cref,\%code))[1];
					if ($Calt=~/[ACTG]{1,}/)
                                        {
                                                $Calt=~s/\.//g;
                                                $Talt=(translate($Calt,\%code))[1];
                                        }
				}
				if ($eff=~/Del/)
				{
					@Tref=split('',$Ttref);
					for ($i=0;$i<=$#Tref;$i++)
					{
						$cur_res=$Tref[$i];
						$cur_pos=$rel_pos+$i;
						if ($epi_data{"$namegene$cur_res$cur_pos"})
						{
							$used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1;
						}
						if ($hyphy_data{"$namegene$curpos"})
						{
							$hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";";
						}
					}
					$Nkeys=keys %used_epitopes;
					$epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1;
				}
				$hyphy_string="NA" if $hyphy_string eq "";
				$epitopes_string="NA" if $epitopes_string eq "";
				#print "$Talt\n";
				return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string);
				#return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string");
			}else{
				$Tref=(translate($Cref,\%code))[1];
				$Talt=(translate($Calt,\%code))[1];
				$eff="synonymous"  if $Tref eq $Talt;
				$eff="missense" if $Tref ne $Talt;
				$eff="stopGain" if $Talt=~/\*/;
				$eff="stopLoss" if $Tref=~/\*/ && !$Talt=~/\*/;
				@Tref=split('',$Ttref);
                                for ($i=0;$i<=$#Tref;$i++)
                                {
                                	$cur_res=$Tref[$i];
                                        $cur_pos=$rel_pos+$i;
                                        if ($epi_data{"$namegene$cur_res$cur_pos"})
                                        {
                                        	$used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1;
                                        }
                                        if ($hyphy_data{"$namegene$curpos"})
                                      	{
                                         	$hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";";
                                        }
                      		}
                                $Nkeys=keys %used_epitopes;
                                $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1;
				$hyphy_string="NA" if $hyphy_string eq "";
                                $epitopes_string="NA" if $epitopes_string eq "";
				return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string);
				#return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string");
			}
		}
	}	
}


sub read_simple_table
{
	$file=$_[0];
	die ("$file does not exist") unless -e $file; 
	my %data=();
	open(IN,$file);
	while(<IN>)
	{
		($pos,$ref,$alt,$annot)=(split());
		if ($annot=~/;/)
		{
		#	$annot=(split(/\;/,$annot))[0];
		}
		$data{"$pos$ref$alt"}=$annot;
	}
	return \%data;
}

sub read_epitopes
{
	$file=$_[0];
	die("$file does not exist") unless -e $file;
	my %data=();
	open(IN,$file);
        while(<IN>)
	{
		($gene,$pos,$res,$EPIseq,$num,$HLA)=(split);
		push(@{$data{"$gene$pos$res"}},"$EPIseq,$num,$HLA");
	}
	return \%data;	
}

sub read_hyphy
{
	$file=$_[0];
	die("$file does not exist") unless -e $file;
	my %data=();
        open(IN,$file);
	%kw=("fel"=>1,"meme"=>1,"kind"=>1);#"betas"=>1);
        while(<IN>)
	{
		chomp();
		($gene,$pos,@annot)=(split(/\,/));
		foreach $a (@annot)
		{
			$a=~s/{//g;
			$a=~s/}//g;
			($ref,$key)=(split(/\:/,$a));
			$ref=~s/"//g;
			$key=~s/"//g;
			#print "$gene $pos $ref $key\n";
			if ($kw{$ref})
			{
				$data{"$gene$pos"}.="$ref:$key;";
			}
		}
			
	}
	return \%data;
}