Mercurial > repos > elixir-it > corgat_funann
comparison annotate.pl @ 2:d1197d8ee0bb draft default tip
Uploaded
| author | elixir-it | 
|---|---|
| date | Tue, 16 Feb 2021 09:05:48 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 1:9b8afa0b6da4 | 2:d1197d8ee0bb | 
|---|---|
| 1 $fss=13468; | |
| 2 | |
| 3 unless (-e "GCF_009858895.2_ASM985889v3_genomic.fna") | |
| 4 { | |
| 5 | |
| 6 system("wget -i https://raw.githubusercontent.com/matteo14c/CorGAT_galaxy/master/ann.txt"); | |
| 7 system("gzip -d GCF_009858895.2_ASM985889v3_genomic.fna.gz"); | |
| 8 } | |
| 9 | |
| 10 $gen_code="genetic_code"; | |
| 11 die("need genetic code file in the current folder\n") unless -e "genetic_code"; | |
| 12 open(IN,$gen_code); | |
| 13 while(<IN>) | |
| 14 { | |
| 15 ($triplet,$oneL)=(split()); | |
| 16 $code{$triplet}=$oneL; | |
| 17 } | |
| 18 | |
| 19 $genome="GCF_009858895.2_ASM985889v3_genomic.fna "; | |
| 20 die("need reference genome file in the current folder\n") unless -e "GCF_009858895.2_ASM985889v3_genomic.fna"; | |
| 21 open(IN,$genome); | |
| 22 while(<IN>) | |
| 23 { | |
| 24 next if $_=~/^>/; | |
| 25 chomp; | |
| 26 $seq.=$_; | |
| 27 } | |
| 28 $table="annot_table.pl";#"simple_annot_mirror";#"annot_table.pl";#sl5 29728 29768 reg | |
| 29 die("need detailed annotation file for SARS-CoV-2 in the current folder") unless -e "annot_table.pl"; | |
| 30 open(IN,$table); | |
| 31 | |
| 32 | |
| 33 while(<IN>) | |
| 34 { | |
| 35 chomp(); | |
| 36 ($gene,$b1,$b2,$e,$annot,$notes)=(split(/\t/)); | |
| 37 $annot{$b1}{$b2}=[$gene,$e,$annot,$notes]; | |
| 38 #print "$gene $b1 $b2 $e $annot $notes\n"; | |
| 39 $len=$b2-$b1+1; | |
| 40 if ($gene ne "nsp12" && $gene ne "orf1ab") | |
| 41 { | |
| 42 $seqgene=substr($seq,$b1-1,$len+3); #un codone inizio in più e un codone fine in più | |
| 43 }else{ | |
| 44 $len1=$fss-$b1+1; | |
| 45 $part1=substr($seq,$b1-1,$len1); | |
| 46 $len2=$b2-$fss+1; | |
| 47 $part2=substr($seq,$fss-1,$len2+3); | |
| 48 $seqgene="$part1$part2"; | |
| 49 | |
| 50 } | |
| 51 $annot_seq{$gene}=$seqgene; | |
| 52 $Lgenes{$gene}=length($seqgene); | |
| 53 ($NSTOP_G,$Tseq,$pos_Stop_R)=translate($seqgene,\%code); | |
| 54 @seq_res=split('',$Tseq); | |
| 55 for ($i=0;$i<=$#seq_res;$i++) | |
| 56 { | |
| 57 $pos=$i+1; | |
| 58 $res=$seq_res[$i]; | |
| 59 } | |
| 60 } | |
| 61 %AF_data=%{read_simple_table("AF_current.csv")}; | |
| 62 %MFE_data=%{read_simple_table("MFE_annot.csv")}; | |
| 63 %epi_data=%{read_epitopes("epitopes_annot.csv")}; | |
| 64 %hyphy_data=%{read_hyphy("hyphy_current.csv")}; | |
| 65 | |
| 66 | |
| 67 | |
| 68 $var_File=shift;#""cl7.csv";#"phenetic_indels_sars_cov2.csv"; | |
| 69 $out_File=shift; | |
| 70 open(OUT,">$out_File"); | |
| 71 die("no input\n") unless -e $var_File; | |
| 72 open(IN,$var_File); | |
| 73 $header=<IN>; | |
| 74 @header=(split(/\s+/,$header)); | |
| 75 print OUT "POS\tREF\tALT\tannot\tAF\tEpitopes\tHyphy\tMFE\n"; | |
| 76 while(<IN>) | |
| 77 { | |
| 78 ($change,@pos)=(split()); | |
| 79 next unless $change=~/\|/; | |
| 80 ($pos,$allele)=(split(/_/,$change))[0,1]; | |
| 81 ($ref,$alt)=(split(/\|/,$allele))[0,1]; | |
| 82 $AF=$AF_data{"$pos$ref$alt"} ? $AF_data{"$pos$ref$alt"} : 0; | |
| 83 next if $alt=~/N/; | |
| 84 $annot_string=""; | |
| 85 $contained=0; | |
| 86 $epitope_string=""; | |
| 87 $hyphy_string=""; | |
| 88 $MFE_string= $MFE_data{"$pos$ref$alt"} ? $MFE_data{"$pos$ref$alt"} : "NA"; | |
| 89 #next if $ref eq"." || $alt eq "."; | |
| 90 foreach $b1 (sort{$a<=>$b} keys %annot) | |
| 91 { | |
| 92 foreach $b2 (sort{$a<=>$b} keys %{$annot{$b1}}) | |
| 93 { | |
| 94 if ($pos<=$b2 && $pos>=$b1) | |
| 95 { | |
| 96 $contained=1; | |
| 97 $type=$annot{$b1}{$b2}[1]; | |
| 98 $namegene=$annot{$b1}{$b2}[0]; | |
| 99 if ($type eq "cds") | |
| 100 { | |
| 101 #print "cds"; | |
| 102 @res=annot_CDS($pos,$ref,$alt,$namegene); | |
| 103 $annot_string.=$res[0]; | |
| 104 if ($namegene ne "orf1ab") | |
| 105 { | |
| 106 $hyphy_string.=$res[1]; | |
| 107 $epitope_string.=$res[2]; | |
| 108 } | |
| 109 }else{ | |
| 110 $rel_pos=$pos-$b1+1; | |
| 111 $annot_string.="$namegene:nc.$ref$rel_pos$alt,NA,NA;"; | |
| 112 $epitope_string="NA" if $epitope_string eq ""; | |
| 113 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 114 } | |
| 115 } | |
| 116 } | |
| 117 } | |
| 118 $epitope_string=~s/\s+/;/g; | |
| 119 #$epitope_string="EpiT:$epitope_string" unless $epitope_string eq "NA"; | |
| 120 print OUT "$pos\t$ref\t$alt\t$annot_string\t$AF\t$epitope_string\t$hyphy_string\t$MFE_string\n" #if $contained==1; | |
| 121 } | |
| 122 | |
| 123 sub translate | |
| 124 { | |
| 125 @orig_seq=split('',$_[0]); | |
| 126 %gen_code=%{$_[1]}; | |
| 127 $type=""; | |
| 128 $NSTOP=0; | |
| 129 $Tseq=""; | |
| 130 $pos_Stop=0; | |
| 131 for ($i=0;$i<=$#orig_seq;$i+=3) | |
| 132 { | |
| 133 $AA=join('',@orig_seq[$i..$i+2]); | |
| 134 $res=$gen_code{$AA}; | |
| 135 $pos_Stop=$i if $pos_Stop ==0 && $res eq "*"; | |
| 136 $NSTOP++ if $res eq "*"; | |
| 137 $Tseq.=$res; | |
| 138 } | |
| 139 #print "$seq\n"; | |
| 140 return($NSTOP,$Tseq,$pos_Stop); | |
| 141 } | |
| 142 | |
| 143 sub annot_CDS | |
| 144 { | |
| 145 $pos=$_[0]; | |
| 146 $ref=$_[1]; | |
| 147 $alt=$_[2]; | |
| 148 $namegene=$_[3]; | |
| 149 my $hyphy_string="NA"; | |
| 150 my $epitopes_string="NA"; | |
| 151 #print "$pos $ref $alt $namegene\n"; | |
| 152 $pos_inG=$pos-$b1+1; | |
| 153 $pos_inG++ if $pos >$fss && ($annot{$b1}{$b2}[0] eq "nsp12" || $annot{$b1}{$b2}[0] eq "orf1ab"); | |
| 154 $mod=$pos_inG%3; | |
| 155 $rel_pos=int($pos_inG/3); | |
| 156 $rel_pos++ if $mod !=0; | |
| 157 | |
| 158 if (length($ref)==1 && $ref ne "."&& $alt ne ".") | |
| 159 { | |
| 160 | |
| 161 if ($mod ==1) | |
| 162 { | |
| 163 $triplet=substr($seq,$pos-1,3); | |
| 164 @Bs=split('',$triplet); | |
| 165 die("1\n $triplet b:$Bs[0] r:$ref") unless ($Bs[0] eq $ref); | |
| 166 $Bs[0]=$alt; | |
| 167 }elsif ($mod==2){ | |
| 168 $triplet=substr($seq,$pos-2,3); | |
| 169 @Bs=split('',$triplet); | |
| 170 die("2\n $triplet b:$Bs[1] r:$ref")unless ($Bs[1] eq $ref); | |
| 171 $Bs[1]=$alt; | |
| 172 }elsif ($mod==0){ | |
| 173 $triplet=substr($seq,$pos-3,3); | |
| 174 @Bs=split('',$triplet); | |
| 175 die("$_ 3\n $triplet b:$Bs[2] r:$ref")unless ($Bs[2] eq $ref); | |
| 176 $Bs[2]=$alt; | |
| 177 } | |
| 178 #print "$pos_inG $relpos $mod @Bs\n"; | |
| 179 $Atriplet=join("",@Bs); | |
| 180 $A1=$code{$triplet}; | |
| 181 $A2=$code{$Atriplet}; | |
| 182 $eff=$A1 eq $A2 ? "synonymous" : "missense"; | |
| 183 $eff="start_lost" if $eff eq "missense" && $rel_pos ==1; | |
| 184 $eff="stopGain" if $A2 eq "*" && $A1 ne "*"; #stopG | |
| 185 $eff="stopLoss" if $A1 eq "*" && $A2 ne "*"; #stopL | |
| 186 $eff="S" if $A2 eq "*" && $A1 eq "*"; | |
| 187 $hyphy_string=$hyphy_data{"$namegene$rel_pos"} if $hyphy_data{"$namegene$rel_pos"}; | |
| 188 $epitopes_string=join(" ",@{$epi_data{"$namegene$rel_pos$A1"}}) if $epi_data{"$namegene$rel_pos$A1"}; | |
| 189 #return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$mod,$eff;",$hyphy_string,$epitopes_string); | |
| 190 return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$eff;",$hyphy_string,$epitopes_string); | |
| 191 }else{ | |
| 192 $eff=""; | |
| 193 $gene=$annot_seq{$namegene}; | |
| 194 $lgene=length($gene); | |
| 195 $upstream=substr($gene,0,$pos_inG-1); | |
| 196 $change=substr($gene,$pos_inG,length($ref)); | |
| 197 $downstream=substr($gene,$pos_inG+length($ref)-1); | |
| 198 $len=length($alt); | |
| 199 unless ($ref=~/\./) | |
| 200 { | |
| 201 $modSeq="$upstream$alt$downstream"; | |
| 202 }else{ | |
| 203 $modSeq="$upstream$alt$change$downstream"; | |
| 204 } | |
| 205 $modSeq=~s/\.//g; | |
| 206 | |
| 207 ($NSTOP_G,$Tseq_R,$pos_Stop_R)=translate($gene,\%code); | |
| 208 ($NSTOP_R,$Tseq_alt,$pos_Stop_T)=translate($modSeq,\%code); | |
| 209 $CDS_annot_string="."; | |
| 210 #print "$namegene,aa: sr:$NSTOP_G sa:$NSTOP_R psr:$pos_Stop_R psa:$pos_Stop_T\n"; | |
| 211 if ($NSTOP_R>$NSTOP_G && ($ref=~/\./ || $alt=~/\./)) | |
| 212 { | |
| 213 | |
| 214 $eff="frameshift"; | |
| 215 $eff.="Ins" if $ref=~/\./; | |
| 216 $eff.="Del" if $alt=~/\./; | |
| 217 $truncation=$pos_Stop_T/3; | |
| 218 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
| 219 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
| 220 #return("$namegene:$pos_inG,$rel_pos,$mod,Frameshift,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 221 }elsif($NSTOP_R==$NSTOP_G && $pos_Stop_T<($pos_Stop_R-$len) ){ | |
| 222 # print "2\n"; | |
| 223 $truncation=$pos_Stop_T/3; | |
| 224 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
| 225 $eff="Truncating"; | |
| 226 $eff.="Ins" if $ref=~/\./; | |
| 227 $eff.="Del" if $alt=~/\./; | |
| 228 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
| 229 #return("$namegene:$pos_inG,$rel_pos,$mod,Trunc:$truncation,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string="); | |
| 230 }else{ | |
| 231 %used_epitopes=(); | |
| 232 $hyphy_string=""; | |
| 233 $epitopes_string=""; | |
| 234 $pre=""; | |
| 235 $Cref=$ref; | |
| 236 $Calt=$alt; | |
| 237 if ($mod==0) | |
| 238 { | |
| 239 $pre=substr($gene,$pos_inG-3,2); | |
| 240 }elsif($mod==2){ | |
| 241 $pre=substr($gene,$pos_inG-2,1); | |
| 242 } | |
| 243 $Cref="$pre$Cref"; | |
| 244 $Calt="$pre$Calt"; | |
| 245 $post=$pos_inG+length($ref)-1; | |
| 246 while (length($Cref) %3!=0) | |
| 247 { | |
| 248 $Cref.=substr($gene,$post,1); | |
| 249 $Calt.=substr($gene,$post,1); | |
| 250 $post++; | |
| 251 if ($post>length($gene)) | |
| 252 { | |
| 253 $local_gene=substr($gene,0,length($gene)-3); | |
| 254 $eff="TruncatingDel"; | |
| 255 #print "$post\n"; | |
| 256 $copy_pos=$pos_inG; | |
| 257 #print "$pos_inG\n"; | |
| 258 #print length($gene)."\n"; | |
| 259 #die("muoro"); | |
| 260 $Loc_Ref=substr($local_gene,$copy_pos); | |
| 261 $Talt="-"; | |
| 262 while(length($Loc_Ref)%3!=0) | |
| 263 { | |
| 264 $copy_pos--; | |
| 265 $Loc_Ref=substr($local_gene,$copy_pos); | |
| 266 } | |
| 267 $rel_pos=int($copy_pos/3); | |
| 268 $rel_pos++ if $mod !=0; | |
| 269 $Tref=(translate($Loc_Ref,\%code))[1]; | |
| 270 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 271 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string) | |
| 272 } | |
| 273 } | |
| 274 if ($alt=~/\./ || $ref=~/\./) | |
| 275 { | |
| 276 $Tref="-"; | |
| 277 $Talt="-"; | |
| 278 $eff="inframe"; | |
| 279 if ($ref=~/\./) | |
| 280 { | |
| 281 $eff.="Ins"; | |
| 282 $Talt=(translate($Calt,\%code))[1]; | |
| 283 if ($Cref=~/[ACTG]{1,}/) | |
| 284 { | |
| 285 $Cref=~s/\.//g; | |
| 286 $Tref=(translate($Cref,\%code))[1]; | |
| 287 } | |
| 288 | |
| 289 } | |
| 290 if ($alt=~/\./) | |
| 291 { | |
| 292 $eff.="Del"; | |
| 293 $Tref=(translate($Cref,\%code))[1]; | |
| 294 if ($Calt=~/[ACTG]{1,}/) | |
| 295 { | |
| 296 $Calt=~s/\.//g; | |
| 297 $Talt=(translate($Calt,\%code))[1]; | |
| 298 } | |
| 299 } | |
| 300 if ($eff=~/Del/) | |
| 301 { | |
| 302 @Tref=split('',$Ttref); | |
| 303 for ($i=0;$i<=$#Tref;$i++) | |
| 304 { | |
| 305 $cur_res=$Tref[$i]; | |
| 306 $cur_pos=$rel_pos+$i; | |
| 307 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
| 308 { | |
| 309 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
| 310 } | |
| 311 if ($hyphy_data{"$namegene$curpos"}) | |
| 312 { | |
| 313 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
| 314 } | |
| 315 } | |
| 316 $Nkeys=keys %used_epitopes; | |
| 317 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
| 318 } | |
| 319 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 320 $epitopes_string="NA" if $epitopes_string eq ""; | |
| 321 #print "$Talt\n"; | |
| 322 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 323 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 324 }else{ | |
| 325 $Tref=(translate($Cref,\%code))[1]; | |
| 326 $Talt=(translate($Calt,\%code))[1]; | |
| 327 $eff="synonymous" if $Tref eq $Talt; | |
| 328 $eff="missense" if $Tref ne $Talt; | |
| 329 $eff="stopGain" if $Talt=~/\*/; | |
| 330 $eff="stopLoss" if $Tref=~/\*/ && !$Talt=~/\*/; | |
| 331 @Tref=split('',$Ttref); | |
| 332 for ($i=0;$i<=$#Tref;$i++) | |
| 333 { | |
| 334 $cur_res=$Tref[$i]; | |
| 335 $cur_pos=$rel_pos+$i; | |
| 336 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
| 337 { | |
| 338 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
| 339 } | |
| 340 if ($hyphy_data{"$namegene$curpos"}) | |
| 341 { | |
| 342 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
| 343 } | |
| 344 } | |
| 345 $Nkeys=keys %used_epitopes; | |
| 346 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
| 347 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 348 $epitopes_string="NA" if $epitopes_string eq ""; | |
| 349 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 350 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 351 } | |
| 352 } | |
| 353 } | |
| 354 } | |
| 355 | |
| 356 | |
| 357 sub read_simple_table | |
| 358 { | |
| 359 $file=$_[0]; | |
| 360 die ("$file does not exist") unless -e $file; | |
| 361 my %data=(); | |
| 362 open(IN,$file); | |
| 363 while(<IN>) | |
| 364 { | |
| 365 ($pos,$ref,$alt,$annot)=(split()); | |
| 366 if ($annot=~/;/) | |
| 367 { | |
| 368 # $annot=(split(/\;/,$annot))[0]; | |
| 369 } | |
| 370 $data{"$pos$ref$alt"}=$annot; | |
| 371 } | |
| 372 return \%data; | |
| 373 } | |
| 374 | |
| 375 sub read_epitopes | |
| 376 { | |
| 377 $file=$_[0]; | |
| 378 die("$file does not exist") unless -e $file; | |
| 379 my %data=(); | |
| 380 open(IN,$file); | |
| 381 while(<IN>) | |
| 382 { | |
| 383 ($gene,$pos,$res,$EPIseq,$num,$HLA)=(split); | |
| 384 push(@{$data{"$gene$pos$res"}},"$EPIseq,$num,$HLA"); | |
| 385 } | |
| 386 return \%data; | |
| 387 } | |
| 388 | |
| 389 sub read_hyphy | |
| 390 { | |
| 391 $file=$_[0]; | |
| 392 die("$file does not exist") unless -e $file; | |
| 393 my %data=(); | |
| 394 open(IN,$file); | |
| 395 %kw=("fel"=>1,"meme"=>1,"kind"=>1);#"betas"=>1); | |
| 396 while(<IN>) | |
| 397 { | |
| 398 chomp(); | |
| 399 ($gene,$pos,@annot)=(split(/\,/)); | |
| 400 foreach $a (@annot) | |
| 401 { | |
| 402 $a=~s/{//g; | |
| 403 $a=~s/}//g; | |
| 404 ($ref,$key)=(split(/\:/,$a)); | |
| 405 $ref=~s/"//g; | |
| 406 $key=~s/"//g; | |
| 407 #print "$gene $pos $ref $key\n"; | |
| 408 if ($kw{$ref}) | |
| 409 { | |
| 410 $data{"$gene$pos"}.="$ref:$key;"; | |
| 411 } | |
| 412 } | |
| 413 | |
| 414 } | |
| 415 return \%data; | |
| 416 } | 
