Mercurial > repos > elixir-it > corgat_funann
comparison annotate.pl @ 2:d1197d8ee0bb draft default tip
Uploaded
author | elixir-it |
---|---|
date | Tue, 16 Feb 2021 09:05:48 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:9b8afa0b6da4 | 2:d1197d8ee0bb |
---|---|
1 $fss=13468; | |
2 | |
3 unless (-e "GCF_009858895.2_ASM985889v3_genomic.fna") | |
4 { | |
5 | |
6 system("wget -i https://raw.githubusercontent.com/matteo14c/CorGAT_galaxy/master/ann.txt"); | |
7 system("gzip -d GCF_009858895.2_ASM985889v3_genomic.fna.gz"); | |
8 } | |
9 | |
10 $gen_code="genetic_code"; | |
11 die("need genetic code file in the current folder\n") unless -e "genetic_code"; | |
12 open(IN,$gen_code); | |
13 while(<IN>) | |
14 { | |
15 ($triplet,$oneL)=(split()); | |
16 $code{$triplet}=$oneL; | |
17 } | |
18 | |
19 $genome="GCF_009858895.2_ASM985889v3_genomic.fna "; | |
20 die("need reference genome file in the current folder\n") unless -e "GCF_009858895.2_ASM985889v3_genomic.fna"; | |
21 open(IN,$genome); | |
22 while(<IN>) | |
23 { | |
24 next if $_=~/^>/; | |
25 chomp; | |
26 $seq.=$_; | |
27 } | |
28 $table="annot_table.pl";#"simple_annot_mirror";#"annot_table.pl";#sl5 29728 29768 reg | |
29 die("need detailed annotation file for SARS-CoV-2 in the current folder") unless -e "annot_table.pl"; | |
30 open(IN,$table); | |
31 | |
32 | |
33 while(<IN>) | |
34 { | |
35 chomp(); | |
36 ($gene,$b1,$b2,$e,$annot,$notes)=(split(/\t/)); | |
37 $annot{$b1}{$b2}=[$gene,$e,$annot,$notes]; | |
38 #print "$gene $b1 $b2 $e $annot $notes\n"; | |
39 $len=$b2-$b1+1; | |
40 if ($gene ne "nsp12" && $gene ne "orf1ab") | |
41 { | |
42 $seqgene=substr($seq,$b1-1,$len+3); #un codone inizio in più e un codone fine in più | |
43 }else{ | |
44 $len1=$fss-$b1+1; | |
45 $part1=substr($seq,$b1-1,$len1); | |
46 $len2=$b2-$fss+1; | |
47 $part2=substr($seq,$fss-1,$len2+3); | |
48 $seqgene="$part1$part2"; | |
49 | |
50 } | |
51 $annot_seq{$gene}=$seqgene; | |
52 $Lgenes{$gene}=length($seqgene); | |
53 ($NSTOP_G,$Tseq,$pos_Stop_R)=translate($seqgene,\%code); | |
54 @seq_res=split('',$Tseq); | |
55 for ($i=0;$i<=$#seq_res;$i++) | |
56 { | |
57 $pos=$i+1; | |
58 $res=$seq_res[$i]; | |
59 } | |
60 } | |
61 %AF_data=%{read_simple_table("AF_current.csv")}; | |
62 %MFE_data=%{read_simple_table("MFE_annot.csv")}; | |
63 %epi_data=%{read_epitopes("epitopes_annot.csv")}; | |
64 %hyphy_data=%{read_hyphy("hyphy_current.csv")}; | |
65 | |
66 | |
67 | |
68 $var_File=shift;#""cl7.csv";#"phenetic_indels_sars_cov2.csv"; | |
69 $out_File=shift; | |
70 open(OUT,">$out_File"); | |
71 die("no input\n") unless -e $var_File; | |
72 open(IN,$var_File); | |
73 $header=<IN>; | |
74 @header=(split(/\s+/,$header)); | |
75 print OUT "POS\tREF\tALT\tannot\tAF\tEpitopes\tHyphy\tMFE\n"; | |
76 while(<IN>) | |
77 { | |
78 ($change,@pos)=(split()); | |
79 next unless $change=~/\|/; | |
80 ($pos,$allele)=(split(/_/,$change))[0,1]; | |
81 ($ref,$alt)=(split(/\|/,$allele))[0,1]; | |
82 $AF=$AF_data{"$pos$ref$alt"} ? $AF_data{"$pos$ref$alt"} : 0; | |
83 next if $alt=~/N/; | |
84 $annot_string=""; | |
85 $contained=0; | |
86 $epitope_string=""; | |
87 $hyphy_string=""; | |
88 $MFE_string= $MFE_data{"$pos$ref$alt"} ? $MFE_data{"$pos$ref$alt"} : "NA"; | |
89 #next if $ref eq"." || $alt eq "."; | |
90 foreach $b1 (sort{$a<=>$b} keys %annot) | |
91 { | |
92 foreach $b2 (sort{$a<=>$b} keys %{$annot{$b1}}) | |
93 { | |
94 if ($pos<=$b2 && $pos>=$b1) | |
95 { | |
96 $contained=1; | |
97 $type=$annot{$b1}{$b2}[1]; | |
98 $namegene=$annot{$b1}{$b2}[0]; | |
99 if ($type eq "cds") | |
100 { | |
101 #print "cds"; | |
102 @res=annot_CDS($pos,$ref,$alt,$namegene); | |
103 $annot_string.=$res[0]; | |
104 if ($namegene ne "orf1ab") | |
105 { | |
106 $hyphy_string.=$res[1]; | |
107 $epitope_string.=$res[2]; | |
108 } | |
109 }else{ | |
110 $rel_pos=$pos-$b1+1; | |
111 $annot_string.="$namegene:nc.$ref$rel_pos$alt,NA,NA;"; | |
112 $epitope_string="NA" if $epitope_string eq ""; | |
113 $hyphy_string="NA" if $hyphy_string eq ""; | |
114 } | |
115 } | |
116 } | |
117 } | |
118 $epitope_string=~s/\s+/;/g; | |
119 #$epitope_string="EpiT:$epitope_string" unless $epitope_string eq "NA"; | |
120 print OUT "$pos\t$ref\t$alt\t$annot_string\t$AF\t$epitope_string\t$hyphy_string\t$MFE_string\n" #if $contained==1; | |
121 } | |
122 | |
123 sub translate | |
124 { | |
125 @orig_seq=split('',$_[0]); | |
126 %gen_code=%{$_[1]}; | |
127 $type=""; | |
128 $NSTOP=0; | |
129 $Tseq=""; | |
130 $pos_Stop=0; | |
131 for ($i=0;$i<=$#orig_seq;$i+=3) | |
132 { | |
133 $AA=join('',@orig_seq[$i..$i+2]); | |
134 $res=$gen_code{$AA}; | |
135 $pos_Stop=$i if $pos_Stop ==0 && $res eq "*"; | |
136 $NSTOP++ if $res eq "*"; | |
137 $Tseq.=$res; | |
138 } | |
139 #print "$seq\n"; | |
140 return($NSTOP,$Tseq,$pos_Stop); | |
141 } | |
142 | |
143 sub annot_CDS | |
144 { | |
145 $pos=$_[0]; | |
146 $ref=$_[1]; | |
147 $alt=$_[2]; | |
148 $namegene=$_[3]; | |
149 my $hyphy_string="NA"; | |
150 my $epitopes_string="NA"; | |
151 #print "$pos $ref $alt $namegene\n"; | |
152 $pos_inG=$pos-$b1+1; | |
153 $pos_inG++ if $pos >$fss && ($annot{$b1}{$b2}[0] eq "nsp12" || $annot{$b1}{$b2}[0] eq "orf1ab"); | |
154 $mod=$pos_inG%3; | |
155 $rel_pos=int($pos_inG/3); | |
156 $rel_pos++ if $mod !=0; | |
157 | |
158 if (length($ref)==1 && $ref ne "."&& $alt ne ".") | |
159 { | |
160 | |
161 if ($mod ==1) | |
162 { | |
163 $triplet=substr($seq,$pos-1,3); | |
164 @Bs=split('',$triplet); | |
165 die("1\n $triplet b:$Bs[0] r:$ref") unless ($Bs[0] eq $ref); | |
166 $Bs[0]=$alt; | |
167 }elsif ($mod==2){ | |
168 $triplet=substr($seq,$pos-2,3); | |
169 @Bs=split('',$triplet); | |
170 die("2\n $triplet b:$Bs[1] r:$ref")unless ($Bs[1] eq $ref); | |
171 $Bs[1]=$alt; | |
172 }elsif ($mod==0){ | |
173 $triplet=substr($seq,$pos-3,3); | |
174 @Bs=split('',$triplet); | |
175 die("$_ 3\n $triplet b:$Bs[2] r:$ref")unless ($Bs[2] eq $ref); | |
176 $Bs[2]=$alt; | |
177 } | |
178 #print "$pos_inG $relpos $mod @Bs\n"; | |
179 $Atriplet=join("",@Bs); | |
180 $A1=$code{$triplet}; | |
181 $A2=$code{$Atriplet}; | |
182 $eff=$A1 eq $A2 ? "synonymous" : "missense"; | |
183 $eff="start_lost" if $eff eq "missense" && $rel_pos ==1; | |
184 $eff="stopGain" if $A2 eq "*" && $A1 ne "*"; #stopG | |
185 $eff="stopLoss" if $A1 eq "*" && $A2 ne "*"; #stopL | |
186 $eff="S" if $A2 eq "*" && $A1 eq "*"; | |
187 $hyphy_string=$hyphy_data{"$namegene$rel_pos"} if $hyphy_data{"$namegene$rel_pos"}; | |
188 $epitopes_string=join(" ",@{$epi_data{"$namegene$rel_pos$A1"}}) if $epi_data{"$namegene$rel_pos$A1"}; | |
189 #return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$mod,$eff;",$hyphy_string,$epitopes_string); | |
190 return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$eff;",$hyphy_string,$epitopes_string); | |
191 }else{ | |
192 $eff=""; | |
193 $gene=$annot_seq{$namegene}; | |
194 $lgene=length($gene); | |
195 $upstream=substr($gene,0,$pos_inG-1); | |
196 $change=substr($gene,$pos_inG,length($ref)); | |
197 $downstream=substr($gene,$pos_inG+length($ref)-1); | |
198 $len=length($alt); | |
199 unless ($ref=~/\./) | |
200 { | |
201 $modSeq="$upstream$alt$downstream"; | |
202 }else{ | |
203 $modSeq="$upstream$alt$change$downstream"; | |
204 } | |
205 $modSeq=~s/\.//g; | |
206 | |
207 ($NSTOP_G,$Tseq_R,$pos_Stop_R)=translate($gene,\%code); | |
208 ($NSTOP_R,$Tseq_alt,$pos_Stop_T)=translate($modSeq,\%code); | |
209 $CDS_annot_string="."; | |
210 #print "$namegene,aa: sr:$NSTOP_G sa:$NSTOP_R psr:$pos_Stop_R psa:$pos_Stop_T\n"; | |
211 if ($NSTOP_R>$NSTOP_G && ($ref=~/\./ || $alt=~/\./)) | |
212 { | |
213 | |
214 $eff="frameshift"; | |
215 $eff.="Ins" if $ref=~/\./; | |
216 $eff.="Del" if $alt=~/\./; | |
217 $truncation=$pos_Stop_T/3; | |
218 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
219 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
220 #return("$namegene:$pos_inG,$rel_pos,$mod,Frameshift,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
221 }elsif($NSTOP_R==$NSTOP_G && $pos_Stop_T<($pos_Stop_R-$len) ){ | |
222 # print "2\n"; | |
223 $truncation=$pos_Stop_T/3; | |
224 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
225 $eff="Truncating"; | |
226 $eff.="Ins" if $ref=~/\./; | |
227 $eff.="Del" if $alt=~/\./; | |
228 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
229 #return("$namegene:$pos_inG,$rel_pos,$mod,Trunc:$truncation,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string="); | |
230 }else{ | |
231 %used_epitopes=(); | |
232 $hyphy_string=""; | |
233 $epitopes_string=""; | |
234 $pre=""; | |
235 $Cref=$ref; | |
236 $Calt=$alt; | |
237 if ($mod==0) | |
238 { | |
239 $pre=substr($gene,$pos_inG-3,2); | |
240 }elsif($mod==2){ | |
241 $pre=substr($gene,$pos_inG-2,1); | |
242 } | |
243 $Cref="$pre$Cref"; | |
244 $Calt="$pre$Calt"; | |
245 $post=$pos_inG+length($ref)-1; | |
246 while (length($Cref) %3!=0) | |
247 { | |
248 $Cref.=substr($gene,$post,1); | |
249 $Calt.=substr($gene,$post,1); | |
250 $post++; | |
251 if ($post>length($gene)) | |
252 { | |
253 $local_gene=substr($gene,0,length($gene)-3); | |
254 $eff="TruncatingDel"; | |
255 #print "$post\n"; | |
256 $copy_pos=$pos_inG; | |
257 #print "$pos_inG\n"; | |
258 #print length($gene)."\n"; | |
259 #die("muoro"); | |
260 $Loc_Ref=substr($local_gene,$copy_pos); | |
261 $Talt="-"; | |
262 while(length($Loc_Ref)%3!=0) | |
263 { | |
264 $copy_pos--; | |
265 $Loc_Ref=substr($local_gene,$copy_pos); | |
266 } | |
267 $rel_pos=int($copy_pos/3); | |
268 $rel_pos++ if $mod !=0; | |
269 $Tref=(translate($Loc_Ref,\%code))[1]; | |
270 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
271 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string) | |
272 } | |
273 } | |
274 if ($alt=~/\./ || $ref=~/\./) | |
275 { | |
276 $Tref="-"; | |
277 $Talt="-"; | |
278 $eff="inframe"; | |
279 if ($ref=~/\./) | |
280 { | |
281 $eff.="Ins"; | |
282 $Talt=(translate($Calt,\%code))[1]; | |
283 if ($Cref=~/[ACTG]{1,}/) | |
284 { | |
285 $Cref=~s/\.//g; | |
286 $Tref=(translate($Cref,\%code))[1]; | |
287 } | |
288 | |
289 } | |
290 if ($alt=~/\./) | |
291 { | |
292 $eff.="Del"; | |
293 $Tref=(translate($Cref,\%code))[1]; | |
294 if ($Calt=~/[ACTG]{1,}/) | |
295 { | |
296 $Calt=~s/\.//g; | |
297 $Talt=(translate($Calt,\%code))[1]; | |
298 } | |
299 } | |
300 if ($eff=~/Del/) | |
301 { | |
302 @Tref=split('',$Ttref); | |
303 for ($i=0;$i<=$#Tref;$i++) | |
304 { | |
305 $cur_res=$Tref[$i]; | |
306 $cur_pos=$rel_pos+$i; | |
307 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
308 { | |
309 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
310 } | |
311 if ($hyphy_data{"$namegene$curpos"}) | |
312 { | |
313 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
314 } | |
315 } | |
316 $Nkeys=keys %used_epitopes; | |
317 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
318 } | |
319 $hyphy_string="NA" if $hyphy_string eq ""; | |
320 $epitopes_string="NA" if $epitopes_string eq ""; | |
321 #print "$Talt\n"; | |
322 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
323 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
324 }else{ | |
325 $Tref=(translate($Cref,\%code))[1]; | |
326 $Talt=(translate($Calt,\%code))[1]; | |
327 $eff="synonymous" if $Tref eq $Talt; | |
328 $eff="missense" if $Tref ne $Talt; | |
329 $eff="stopGain" if $Talt=~/\*/; | |
330 $eff="stopLoss" if $Tref=~/\*/ && !$Talt=~/\*/; | |
331 @Tref=split('',$Ttref); | |
332 for ($i=0;$i<=$#Tref;$i++) | |
333 { | |
334 $cur_res=$Tref[$i]; | |
335 $cur_pos=$rel_pos+$i; | |
336 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
337 { | |
338 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
339 } | |
340 if ($hyphy_data{"$namegene$curpos"}) | |
341 { | |
342 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
343 } | |
344 } | |
345 $Nkeys=keys %used_epitopes; | |
346 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
347 $hyphy_string="NA" if $hyphy_string eq ""; | |
348 $epitopes_string="NA" if $epitopes_string eq ""; | |
349 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
350 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
351 } | |
352 } | |
353 } | |
354 } | |
355 | |
356 | |
357 sub read_simple_table | |
358 { | |
359 $file=$_[0]; | |
360 die ("$file does not exist") unless -e $file; | |
361 my %data=(); | |
362 open(IN,$file); | |
363 while(<IN>) | |
364 { | |
365 ($pos,$ref,$alt,$annot)=(split()); | |
366 if ($annot=~/;/) | |
367 { | |
368 # $annot=(split(/\;/,$annot))[0]; | |
369 } | |
370 $data{"$pos$ref$alt"}=$annot; | |
371 } | |
372 return \%data; | |
373 } | |
374 | |
375 sub read_epitopes | |
376 { | |
377 $file=$_[0]; | |
378 die("$file does not exist") unless -e $file; | |
379 my %data=(); | |
380 open(IN,$file); | |
381 while(<IN>) | |
382 { | |
383 ($gene,$pos,$res,$EPIseq,$num,$HLA)=(split); | |
384 push(@{$data{"$gene$pos$res"}},"$EPIseq,$num,$HLA"); | |
385 } | |
386 return \%data; | |
387 } | |
388 | |
389 sub read_hyphy | |
390 { | |
391 $file=$_[0]; | |
392 die("$file does not exist") unless -e $file; | |
393 my %data=(); | |
394 open(IN,$file); | |
395 %kw=("fel"=>1,"meme"=>1,"kind"=>1);#"betas"=>1); | |
396 while(<IN>) | |
397 { | |
398 chomp(); | |
399 ($gene,$pos,@annot)=(split(/\,/)); | |
400 foreach $a (@annot) | |
401 { | |
402 $a=~s/{//g; | |
403 $a=~s/}//g; | |
404 ($ref,$key)=(split(/\:/,$a)); | |
405 $ref=~s/"//g; | |
406 $key=~s/"//g; | |
407 #print "$gene $pos $ref $key\n"; | |
408 if ($kw{$ref}) | |
409 { | |
410 $data{"$gene$pos"}.="$ref:$key;"; | |
411 } | |
412 } | |
413 | |
414 } | |
415 return \%data; | |
416 } |