Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $ | |
2 # | |
3 # bioperl module for Bio::LiveSeq::IO::Loader | |
4 # | |
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | |
6 # | |
7 # Copyright Joseph Insana | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 #documentation needed | |
20 | |
21 =head1 DESCRIPTION | |
22 | |
23 This package holds common methods used by BioPerl, SRS and file loaders. | |
24 It contains methods to create LiveSeq objects out of entire entries or from a | |
25 localized sequence region surrounding a particular gene. | |
26 | |
27 =head1 AUTHOR - Joseph A.L. Insana | |
28 | |
29 Email: Insana@ebi.ac.uk, jinsana@gmx.net | |
30 | |
31 Address: | |
32 | |
33 EMBL Outstation, European Bioinformatics Institute | |
34 Wellcome Trust Genome Campus, Hinxton | |
35 Cambs. CB10 1SD, United Kingdom | |
36 | |
37 =head1 APPENDIX | |
38 | |
39 The rest of the documentation details each of the object | |
40 methods. Internal methods are usually preceded with a _ | |
41 | |
42 =cut | |
43 | |
44 # Let the code begin... | |
45 | |
46 package Bio::LiveSeq::IO::Loader; | |
47 $VERSION=4.44; | |
48 | |
49 # Version history: | |
50 # Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS | |
51 # Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader | |
52 # Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl | |
53 # Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package | |
54 # Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping | |
55 # Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl | |
56 # Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is | |
57 # included as valid qualifier name -> investigate | |
58 # Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl() | |
59 # Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader | |
60 # Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more | |
61 # the only default feature possibly asked for | |
62 # Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene | |
63 # Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten | |
64 # cdslocation2transcript | |
65 # Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene | |
66 # Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :) | |
67 # Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene | |
68 # Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance | |
69 # Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range | |
70 # Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry! | |
71 # Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created | |
72 # Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is | |
73 # just 1 CDS in entry it will use all | |
74 # features of the entry as Gene features | |
75 # Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added | |
76 # Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created | |
77 # Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced | |
78 # Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option | |
79 # Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object | |
80 # Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene | |
81 # Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq | |
82 # Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity | |
83 # Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged | |
84 # Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP | |
85 # Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW | |
86 # Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones | |
87 # Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET | |
88 # Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on | |
89 # Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions | |
90 # Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation() | |
91 # Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene | |
92 | |
93 # Note: test_transl has been left as deprecated and is not really supported now | |
94 | |
95 use strict; | |
96 use Carp qw(cluck croak carp); | |
97 use vars qw($VERSION @ISA); | |
98 use Bio::LiveSeq::DNA 1.2; | |
99 use Bio::LiveSeq::Exon 1.0; | |
100 use Bio::LiveSeq::Transcript 2.4; | |
101 use Bio::LiveSeq::Translation 1.4; | |
102 use Bio::LiveSeq::Gene 1.1; | |
103 use Bio::LiveSeq::Intron 1.0; | |
104 use Bio::LiveSeq::Prim_Transcript 1.0; | |
105 use Bio::LiveSeq::Repeat_Region 1.0; | |
106 use Bio::LiveSeq::Repeat_Unit 1.0; | |
107 use Bio::LiveSeq::AARange 1.4; | |
108 use Bio::Tools::CodonTable; | |
109 | |
110 #@ISA=qw(Bio::LiveSeq::); # not useful now | |
111 | |
112 =head2 entry2liveseq | |
113 | |
114 Title : entry2liveseq | |
115 Usage : @translationobjects=$loader->entry2liveseq(); | |
116 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); | |
117 Function: creates LiveSeq objects from an entry previously loaded | |
118 Returns : array of references to objects of class Translation | |
119 Errorcode 0 | |
120 Args : optional boolean flag to avoid the retrieval of SwissProt | |
121 informations for all Transcripts containing SwissProt x-reference | |
122 default is 1 (to retrieve those informations and create AARange | |
123 LiveSeq objects) | |
124 Note : this method can get really slow for big entries. The lightweight | |
125 gene2liveseq method is recommended | |
126 | |
127 =cut | |
128 | |
129 sub entry2liveseq { | |
130 my ($self, %args) = @_; | |
131 my ($getswissprotinfo)=($args{-getswissprotinfo}); | |
132 if (defined($getswissprotinfo)) { | |
133 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { | |
134 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; | |
135 $getswissprotinfo=0; | |
136 } | |
137 } else { | |
138 $getswissprotinfo=1; | |
139 } | |
140 my $hashref=$self->{'hash'}; | |
141 unless ($hashref) { return (0); } | |
142 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); | |
143 my $test_transl=0; | |
144 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} | |
145 return @translationobjects; | |
146 } | |
147 | |
148 =head2 novelaasequence2gene | |
149 | |
150 Title : novelaasequence2gene | |
151 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
152 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
153 -taxon => 9606, | |
154 -gene_name => "tyr-kinase"); | |
155 | |
156 Function: creates LiveSeq objects from a novel amino acid sequence, | |
157 using codon usage database to choose codons according to | |
158 relative frequencies. | |
159 If a taxon ID is not specified, the default is to use the human | |
160 one (taxonomy ID 9606). | |
161 Returns : reference to a Gene object containing references to LiveSeq objects | |
162 Errorcode 0 | |
163 Args : string containing an amino acid sequence | |
164 integer (optional) with a taxonomy ID | |
165 string specifying a gene name | |
166 | |
167 =cut | |
168 | |
169 =head2 gene2liveseq | |
170 | |
171 Title : gene2liveseq | |
172 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); | |
173 : $gene=$loader->gene2liveseq(-gene_name => "gene name", | |
174 -flanking => 64); | |
175 : $gene=$loader->gene2liveseq(-gene_name => "gene name", | |
176 -getswissprotinfo => 0); | |
177 : $gene=$loader->gene2liveseq(-position => 4); | |
178 | |
179 Function: creates LiveSeq objects from an entry previously loaded | |
180 It is a "light weight" creation because it creates | |
181 a LiveSequence just for the interesting region in an entry | |
182 (instead than for the total entry, like in entry2liveseq) and for | |
183 the flanking regions up to 500 nucleotides (default) or up to | |
184 the specified amount of nucleotides (given as argument) around the | |
185 Gene. | |
186 Returns : reference to a Gene object containing possibly alternative | |
187 Transcripts. | |
188 Errorcode 0 | |
189 Args : string containing the gene name as in the EMBL feature qualifier | |
190 integer (optional) "flanking": amount of flanking bases to be kept | |
191 boolean (optional) "getswissprotinfo": if set to "0" it will avoid | |
192 trying to fetch information from a crossreference to a SwissProt | |
193 entry, avoding the process of creation of AARange objects | |
194 It is "1" (on) by default | |
195 | |
196 Alternative to a gene_name, a position can be given: an | |
197 integer (1-) containing the position of the desired CDS in the | |
198 loaded entry | |
199 | |
200 =cut | |
201 | |
202 sub gene2liveseq { | |
203 my ($self, %args) = @_; | |
204 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); | |
205 my $input; | |
206 unless (($gene_name)||($cds_position)) { | |
207 carp "Gene_Name or Position not specified for gene2liveseq loading function"; | |
208 return (0); | |
209 } | |
210 if (($gene_name)&&($cds_position)) { | |
211 carp "Gene_Name and Position cannot be given together, use one"; | |
212 return (0); | |
213 } elsif ($gene_name) { | |
214 $input=$gene_name; | |
215 } else { | |
216 $input="cds-position:".$cds_position; | |
217 } | |
218 | |
219 if (defined($getswissprotinfo)) { | |
220 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { | |
221 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; | |
222 $getswissprotinfo=0; | |
223 } | |
224 } else { | |
225 $getswissprotinfo=1; | |
226 } | |
227 | |
228 if (defined($flanking)) { | |
229 unless ($flanking >= 0) { | |
230 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; | |
231 return (0); | |
232 } | |
233 } else { | |
234 $flanking=500; # the default flanking length | |
235 } | |
236 my $hashref=$self->{'hash'}; | |
237 unless ($hashref) { return (0); } | |
238 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); | |
239 unless ($gene) { # if $gene == 0 it means problems in hash2gene | |
240 carp "gene2liveseq produced error"; | |
241 return (0); | |
242 } | |
243 return $gene; | |
244 } | |
245 | |
246 # TODO: update so that it will work even if CDS is not only accepted FEATURE!! | |
247 # this method is for now deprecated and not supported | |
248 sub test_transl { | |
249 my ($self,$entry)=@_; | |
250 my @features=@{$entry->{'Features'}}; | |
251 my @translationobjects=@{$_[1]}; | |
252 my ($i,$translation); | |
253 my ($obj_transl,$hash_transl); | |
254 my @cds=@{$entry->{'CDS'}}; | |
255 foreach $translation (@translationobjects) { | |
256 $obj_transl=$translation->seq; | |
257 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; | |
258 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" | |
259 unless ($obj_transl eq $hash_transl) { | |
260 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; | |
261 carp "\nEntry's transl: ",$hash_transl,"\n"; | |
262 carp "\nObject's transl: ",$obj_transl,"\n"; | |
263 exit; | |
264 } | |
265 $i++; | |
266 } | |
267 } | |
268 | |
269 # argument: hashref containing the EMBL entry datas, | |
270 # getswissprotinfo boolean flag | |
271 # creates the liveseq objects | |
272 # returns: an array of Translation object references | |
273 sub hash2liveseq { | |
274 my ($self,$entry,$getswissprotinfo)=@_; | |
275 my $i; | |
276 my @transcripts; | |
277 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); | |
278 $dna->alphabet(lc($entry->{'Molecule'})); | |
279 $dna->display_id($entry->{'ID'}); | |
280 $dna->accession_number($entry->{'AccNumber'}); | |
281 $dna->desc($entry->{'Description'}); | |
282 my @cds=@{$entry->{'CDS'}}; | |
283 my ($swissacc,$swisshash); my @swisshashes; | |
284 for $i (0..$#cds) { | |
285 #my @transcript=@{$cds[$i]->{'range'}}; | |
286 #$transcript=\@transcript; | |
287 #push (@transcripts,$transcript); | |
288 push (@transcripts,$cds[$i]->{'range'}); | |
289 if ($getswissprotinfo) { | |
290 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; | |
291 $swisshash=$self->get_swisshash($swissacc); | |
292 #$self->printswissprot($swisshash); # DEBUG | |
293 push (@swisshashes,$swisshash); | |
294 } | |
295 } | |
296 my @translations=($self->transexonscreation($dna,\@transcripts)); | |
297 my $translation; my $j=0; | |
298 foreach $translation (@translations) { | |
299 if ($swisshashes[$j]) { # if not 0 | |
300 $self->swisshash2liveseq($swisshashes[$j],$translation); | |
301 } | |
302 $j++; | |
303 } | |
304 return (@translations); | |
305 } | |
306 | |
307 # only features pertaining to a specified gene are created | |
308 # only the sequence of the gene and appropriate context flanking regions | |
309 # are created as chain | |
310 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag | |
311 # returns: reference to Gene object | |
312 # | |
313 # Note: if entry contains just one CDS, all the features get added | |
314 # this is useful because often the features in these entries do not | |
315 # carry the /gene qualifier | |
316 # | |
317 # errorcode: 0 | |
318 sub hash2gene { | |
319 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; | |
320 my $entryfeature; | |
321 my $genefeatureshash; | |
322 | |
323 my @cds=@{$entry->{'CDS'}}; | |
324 | |
325 # checking if a position has been given instead than a gene_name | |
326 if (index($input,"cds-position:") == 0 ) { | |
327 my $cds_position=substr($input,13); # extracting the cds position | |
328 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { | |
329 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); | |
330 } | |
331 } else { | |
332 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); | |
333 } | |
334 | |
335 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found | |
336 my @genes=$self->genes($entry); | |
337 my $cds_number=scalar(@cds); | |
338 warn "Warning! Not even one genefeature found for /$input/.... | |
339 The genes present in this entry are:\n\t@genes\n | |
340 The number of CDS in this entry is:\n\t$cds_number\n"; | |
341 return(0); | |
342 } | |
343 | |
344 # get max and min, check flankings | |
345 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" | |
346 my $seqlength=$entry->{'SeqLength'}; | |
347 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" | |
348 if ($min-$flanking < 1) { | |
349 $mindna=1; | |
350 } else { | |
351 $mindna=$min-$flanking; | |
352 } | |
353 if ($max+$flanking > $seqlength) { | |
354 $maxdna=$seqlength; | |
355 } else { | |
356 $maxdna=$max+$flanking; | |
357 } | |
358 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); | |
359 | |
360 # create LiveSeq objects | |
361 | |
362 # create DNA | |
363 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); | |
364 $dna->alphabet(lc($entry->{'Molecule'})); | |
365 $dna->source($entry->{'Organism'}); | |
366 $dna->display_id($entry->{'ID'}); | |
367 $dna->accession_number($entry->{'AccNumber'}); | |
368 $dna->desc($entry->{'Description'}); | |
369 | |
370 my @transcripts=@{$genefeatureshash->{'transcripts'}}; | |
371 # create Translations, Transcripts, Exons out of the CDS | |
372 unless (@transcripts) { | |
373 cluck "no CDS feature found for /$input/...."; | |
374 return(0); | |
375 } | |
376 my @translationobjs=$self->transexonscreation($dna,\@transcripts); | |
377 my @transcriptobjs; | |
378 | |
379 # get the Transcript obj_refs | |
380 my $translation; | |
381 my $j=0; | |
382 my @ttables=@{$genefeatureshash->{'ttables'}}; | |
383 my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; | |
384 foreach $translation (@translationobjs) { | |
385 push(@transcriptobjs,$translation->get_Transcript); | |
386 if ($ttables[$j]) { # if not undef | |
387 $translation->get_Transcript->translation_table($ttables[$j]); | |
388 #} else { # DEBUG | |
389 # print "\n\t\tno translation table information....\n"; | |
390 } | |
391 if ($swisshashes[$j]) { # if not 0 | |
392 $self->swisshash2liveseq($swisshashes[$j],$translation); | |
393 } | |
394 $j++; | |
395 } | |
396 | |
397 my %gene; # this is the hash to store created object references | |
398 $gene{DNA}=$dna; | |
399 $gene{Transcripts}=\@transcriptobjs; | |
400 $gene{Translations}=\@translationobjs; | |
401 | |
402 my @exonobjs; my @intronobjs; | |
403 my @repeatunitobjs; my @repeatregionobjs; | |
404 my @primtranscriptobjs; | |
405 | |
406 my ($object,$range,$start,$end,$strand); | |
407 | |
408 my @exons=@{$genefeatureshash->{'exons'}}; | |
409 my @exondescs=@{$genefeatureshash->{'exondescs'}}; | |
410 if (@exons) { | |
411 my $exoncount = 0; | |
412 foreach $range (@exons) { | |
413 ($start,$end,$strand)=@{$range}; | |
414 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
415 if ($object != -1) { | |
416 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; | |
417 $exoncount++; | |
418 push (@exonobjs,$object); | |
419 } else { | |
420 $exoncount++; | |
421 } | |
422 } | |
423 $gene{Exons}=\@exonobjs; | |
424 } | |
425 my @introns=@{$genefeatureshash->{'introns'}}; | |
426 my @introndescs=@{$genefeatureshash->{'introndescs'}}; | |
427 if (@introns) { | |
428 my $introncount = 0; | |
429 foreach $range (@introns) { | |
430 ($start,$end,$strand)=@{$range}; | |
431 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
432 if ($object != -1) { | |
433 $object->desc($introndescs[$introncount]); | |
434 $introncount++; | |
435 push (@intronobjs,$object); | |
436 } else { | |
437 $introncount++; | |
438 } | |
439 } | |
440 $gene{Introns}=\@intronobjs; | |
441 } | |
442 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; | |
443 if (@prim_transcripts) { | |
444 foreach $range (@prim_transcripts) { | |
445 ($start,$end,$strand)=@{$range}; | |
446 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
447 if ($object != -1) { push (@primtranscriptobjs,$object); } | |
448 } | |
449 $gene{Prim_Transcripts}=\@primtranscriptobjs; | |
450 } | |
451 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; | |
452 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; | |
453 if (@repeat_regions) { | |
454 my $k=0; | |
455 foreach $range (@repeat_regions) { | |
456 ($start,$end,$strand)=@{$range}; | |
457 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
458 if ($object != -1) { | |
459 $object->desc($repeat_regions_family[$k]); | |
460 $k++; | |
461 push (@repeatregionobjs,$object); | |
462 } else { | |
463 $k++; | |
464 } | |
465 } | |
466 $gene{Repeat_Regions}=\@repeatregionobjs; | |
467 } | |
468 my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; | |
469 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; | |
470 if (@repeat_units) { | |
471 my $k=0; | |
472 foreach $range (@repeat_units) { | |
473 ($start,$end,$strand)=@{$range}; | |
474 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
475 if ($object != -1) { | |
476 $object->desc($repeat_units_family[$k]); | |
477 $k++; | |
478 push (@repeatunitobjs,$object); | |
479 } else { | |
480 $k++; | |
481 } | |
482 } | |
483 $gene{Repeat_Units}=\@repeatunitobjs; | |
484 } | |
485 | |
486 # create the Gene | |
487 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos | |
488 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, | |
489 -upbound=>$min,-downbound=>$max)); | |
490 } | |
491 | |
492 # maybe this function will be moved to general utility package | |
493 # argument: array of numbers | |
494 # returns: (min,max) numbers in the array | |
495 sub rangeofarray { | |
496 my $self=shift; | |
497 my @array=@_; | |
498 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; | |
499 my ($max,$min,$element); | |
500 $min=$max=shift(@array); | |
501 foreach $element (@array) { | |
502 $element = 0 unless defined $element; | |
503 if ($element < $min) { | |
504 $min=$element; | |
505 } | |
506 if ($element > $max) { | |
507 $max=$element; | |
508 } | |
509 } | |
510 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; | |
511 return ($min,$max); | |
512 } | |
513 | |
514 | |
515 # argument: reference to DNA object, reference to array of transcripts | |
516 # returns: an array of Translation object references | |
517 sub transexonscreation { | |
518 my $self=shift; | |
519 my $dna=$_[0]; | |
520 my @transcripts=@{$_[1]}; | |
521 | |
522 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); | |
523 my $translationobj; | |
524 my @translationobjects; | |
525 foreach $transcript (@transcripts) { | |
526 foreach $exonref (@{$transcript}) { | |
527 ($start,$end,$strand)=@{$exonref}; | |
528 #print "Creating Exon: start $start end $end strand $strand\n"; | |
529 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
530 #push (@exonobjects,$exonobj); | |
531 push (@transexons,$exonobj); | |
532 } | |
533 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); | |
534 if ($transcriptobj != -1) { | |
535 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); | |
536 @transexons=(); # cleans it | |
537 #push (@transcriptobjects,$transcriptobj); | |
538 push (@translationobjects,$translationobj); | |
539 } | |
540 } | |
541 return (@translationobjects); | |
542 } | |
543 | |
544 #sub printgene { | |
545 # deleted. Some functionality placed in Gene->printfeaturesnum | |
546 | |
547 =head2 printswissprot | |
548 | |
549 Title : printswissprot | |
550 Usage : $loader->printswissprot($hashref); | |
551 Function: prints out all informations loaded from a database entry into the | |
552 loader. Mainly used for testing purposes. | |
553 Args : a hashref containing the SWISSPROT entry datas | |
554 Note : the hashref can be obtained with a call to the method | |
555 $loader->get_swisshash() (only with SRS loader or | |
556 BioPerl via Bio::DB::EMBL.pm) | |
557 that takes as argument a string like "SWISS-PROT:P10275" or | |
558 from $loader->swissprot2hash() that takes an SRS query string | |
559 as its argument (e.g. "swissprot-acc:P10275") | |
560 | |
561 =cut | |
562 | |
563 # argument: hashref containing the SWISSPROT entry datas | |
564 # prints out that hash, showing the informations loaded | |
565 sub printswissprot { | |
566 my ($self,$entry)=@_; | |
567 unless ($entry) { | |
568 return; | |
569 } | |
570 printf "ID: %s\n", | |
571 $entry->{'ID'}; | |
572 printf "ACC: %s\n", | |
573 $entry->{'AccNumber'}; | |
574 printf "GENE: %s\n", | |
575 $entry->{'Gene'}; | |
576 printf "DES: %s\n", | |
577 $entry->{'Description'}; | |
578 printf "ORG: %s\n", | |
579 $entry->{'Organism'}; | |
580 printf "SEQLN: %s\n", | |
581 $entry->{'SeqLength'}; | |
582 printf "SEQ: %s\n", | |
583 substr($entry->{'Sequence'},0,64); | |
584 if ($entry->{'Features'}) { | |
585 my @features=@{$entry->{'Features'}}; | |
586 my $i; | |
587 for $i (0..$#features) { | |
588 print "|",$features[$i]->{'name'},"|"; | |
589 print " at ",$features[$i]->{'location'},": "; | |
590 print "",$features[$i]->{'desc'},"\n"; | |
591 } | |
592 } | |
593 } | |
594 | |
595 =head2 printembl | |
596 | |
597 Title : printembl | |
598 Usage : $loader->printembl(); | |
599 Function: prints out all informations loaded from a database entry into the | |
600 loader. Mainly used for testing purposes. | |
601 Args : none | |
602 | |
603 =cut | |
604 | |
605 # argument: hashref containing the EMBL entry datas | |
606 # prints out that hash, showing the informations loaded | |
607 sub printembl { | |
608 my ($self,$entry)=@_; | |
609 unless ($entry) { | |
610 $entry=$self->{'hash'}; | |
611 } | |
612 my ($i,$featurename); | |
613 printf "ID: %s\n", | |
614 $entry->{'ID'}; | |
615 printf "ACC: %s\n", | |
616 $entry->{'AccNumber'}; | |
617 printf "MOL: %s\n", | |
618 $entry->{'Molecule'}; | |
619 printf "DIV: %s\n", | |
620 $entry->{'Division'}; | |
621 printf "DES: %s\n", | |
622 $entry->{'Description'}; | |
623 printf "ORG: %s\n", | |
624 $entry->{'Organism'}; | |
625 printf "SEQLN: %s\n", | |
626 $entry->{'SeqLength'}; | |
627 printf "SEQ: %s\n", | |
628 substr($entry->{'Sequence'},0,64); | |
629 my @features=@{$entry->{'Features'}}; | |
630 my @cds=@{$entry->{'CDS'}}; | |
631 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; | |
632 my ($exonref,$transcript); | |
633 my ($qualifiernumber,$qualifiers,$key); | |
634 my ($start,$end,$strand); | |
635 my $j=0; | |
636 for $i (0..$#features) { | |
637 $featurename=$features[$i]->{'name'}; | |
638 if ($featurename eq "CDS") { | |
639 print "|CDS| number $j at feature position: $i\n"; | |
640 #print $features[$i]->{'location'},"\n"; | |
641 $transcript=$features[$i]->{'range'}; | |
642 foreach $exonref (@{$transcript}) { | |
643 ($start,$end,$strand)=@{$exonref}; | |
644 print "\tExon: start $start end $end strand $strand\n"; | |
645 } | |
646 $j++; | |
647 } else { | |
648 print "|$featurename| at feature position: $i\n"; | |
649 print "\trange: "; | |
650 print join("\t",@{$features[$i]->{'range'}}),"\n"; | |
651 #print $features[$i]->{'location'},"\n"; | |
652 } | |
653 $qualifiernumber=$features[$i]->{'qual_number'}; | |
654 $qualifiers=$features[$i]->{'qualifiers'}; # hash | |
655 foreach $key (keys (%{$qualifiers})) { | |
656 print "\t\t",$key,": "; | |
657 print $qualifiers->{$key},"\n"; | |
658 } | |
659 } | |
660 } | |
661 | |
662 =head2 genes | |
663 | |
664 Title : genes | |
665 Usage : $loader->genes(); | |
666 Function: Returns an array of gene_names (strings) contained in the loaded | |
667 entry. | |
668 Args : none | |
669 | |
670 =cut | |
671 | |
672 # argument: entryhashref | |
673 # returns: array of genenames found in the entry | |
674 sub genes { | |
675 my ($self,$entry)=@_; | |
676 unless ($entry) { | |
677 $entry=$self->{'hash'}; | |
678 } | |
679 my @entryfeatures=@{$entry->{'Features'}}; | |
680 my ($genename,$genenames,$entryfeature); | |
681 for $entryfeature (@entryfeatures) { | |
682 $genename=$entryfeature->{'qualifiers'}->{'gene'}; | |
683 if ($genename) { | |
684 if (index($genenames,$genename) == -1) { # if name is new | |
685 $genenames .= $genename . " "; # add the name | |
686 } | |
687 } | |
688 } | |
689 return (split(/ /,$genenames)); # assumes no space inbetween each genename | |
690 } | |
691 | |
692 # arguments: swisshash, translation objref | |
693 # adds information to the Translation, creates AARange objects, sets the | |
694 # aa_range attribute on the Translation, pointing to those objects | |
695 sub swisshash2liveseq { | |
696 my ($self,$entry,$translation)=@_; | |
697 my $translength=$translation->length; | |
698 $translation->desc($translation->desc . $entry->{'Description'}); | |
699 $translation->display_id("SWISSPROT:" . $entry->{'ID'}); | |
700 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); | |
701 $translation->name($entry->{'Gene'}); | |
702 $translation->source($entry->{'Organism'}); | |
703 my @aarangeobjects; | |
704 my ($start,$end,$aarangeobj,$feature); | |
705 my @features; my @newfeatures; | |
706 if ($entry->{'Features'}) { | |
707 @features=@{$entry->{'Features'}}; | |
708 } | |
709 my $cleavedmet=0; | |
710 # check for cleaved Met | |
711 foreach $feature (@features) { | |
712 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { | |
713 $cleavedmet=1; | |
714 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence | |
715 } else { | |
716 push(@newfeatures,$feature); | |
717 } | |
718 } | |
719 | |
720 my $swissseq=$entry->{'Sequence'}; | |
721 my $liveseqtransl=$translation->seq; | |
722 chop $liveseqtransl; # to take away the trailing STOP "*" | |
723 my $translated=substr($liveseqtransl,$cleavedmet); | |
724 | |
725 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met | |
726 | |
727 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? | |
728 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; | |
729 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; | |
730 return; | |
731 } | |
732 | |
733 #my $i=0; # debug | |
734 @features=@newfeatures; | |
735 foreach $feature (@features) { | |
736 #print "Processing SwissProtFeature: $i\n"; # debug | |
737 ($start,$end)=split(/ /,$feature->{'location'}); | |
738 # Note: cleavedmet is taken in account for updating numbering | |
739 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength); | |
740 if ($aarangeobj != -1) { | |
741 push(@aarangeobjects,$aarangeobj); | |
742 } | |
743 # $i++; # debug | |
744 } | |
745 $translation->{'aa_ranges'}=\@aarangeobjects; | |
746 } | |
747 | |
748 # if there is no SRS support, the default will be to return 0 | |
749 # i.e. this function is overridden in SRS package | |
750 sub get_swisshash { | |
751 return (0); | |
752 } | |
753 | |
754 # Args: $entry hashref, gene_name OR cds_position (undef is used to | |
755 # choose between the two), getswissprotinfo boolean flag | |
756 # Returns: an hash holding various arrayref used in the hash2gene method | |
757 # Function: examines the nucleotide entry, identifying features belonging | |
758 # to the gene (defined either by its name or by the position of its CDS in | |
759 # the entry) | |
760 | |
761 sub _findgenefeatures { | |
762 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; | |
763 | |
764 my @entryfeatures=@{$entry->{'Features'}}; | |
765 my @exons; my @introns; my @prim_transcripts; my @transcripts; | |
766 my @repeat_units; my @repeat_regions; | |
767 my @repeat_units_family; my @repeat_regions_family; my $rpt_family; | |
768 my $entryfeature; my @genefeatures; | |
769 my $desc; my @exondescs; my @introndescs; | |
770 | |
771 # for swissprot xreference | |
772 my ($swissacc,$swisshash); my @swisshashes; | |
773 | |
774 # for translation_tables | |
775 my @ttables; | |
776 | |
777 # to create labels | |
778 my ($name,$exon); | |
779 my @range; my @cdsexons; my @labels; | |
780 | |
781 # maybe here also could be added special case when there is no CDS feature | |
782 # in the entry (e.g. tRNA entry -> TOCHECK). | |
783 # let's deal with the special case in which there is just one gene per entry | |
784 # usually without /gene qualifier | |
785 my @cds=@{$entry->{'CDS'}}; | |
786 | |
787 my $skipgenematch=0; | |
788 if (scalar(@cds) == 1) { | |
789 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; | |
790 $skipgenematch=1; | |
791 } | |
792 | |
793 my ($cds_begin,$cds_end,$proximity); | |
794 if ($cds_position) { # if a position has been requested | |
795 my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; | |
796 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS | |
797 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; | |
798 # DEBUG | |
799 unless ($skipgenematch) { | |
800 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; | |
801 } | |
802 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS | |
803 } | |
804 | |
805 for $entryfeature (@entryfeatures) { # get only features for the desired gene | |
806 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { | |
807 push(@genefeatures,$entryfeature); | |
808 | |
809 my @range=@{$entryfeature->{'range'}}; | |
810 $name=$entryfeature->{'name'}; | |
811 my %qualifierhash=%{$entryfeature->{'qualifiers'}}; | |
812 if ($name eq "CDS") { # that has range containing array of exons | |
813 | |
814 # swissprot crossindexing (if without SRS support it will fill array | |
815 # with zeros and do nothing | |
816 if ($getswissprotinfo) { | |
817 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; | |
818 $swisshash=$self->get_swisshash($swissacc); | |
819 #$self->printswissprot($swisshash); # DEBUG | |
820 push (@swisshashes,$swisshash); | |
821 } | |
822 | |
823 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified | |
824 | |
825 # create labels array | |
826 for $exon (@range) { | |
827 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS | |
828 } | |
829 push (@transcripts,$entryfeature->{'range'}); | |
830 } else { | |
831 # "simplifying" the joinedlocation features. I.e. changing them from | |
832 # multijoined ones to simple plain start-end features, taking only | |
833 # the start of the first "exon" and the end of the last "exon" as | |
834 # start and end of the entire feature | |
835 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location | |
836 @range=($range[0]->[0],$range[-1]->[1]); | |
837 } | |
838 push(@labels,$range[0],$range[1]); # start and end of every feature | |
839 if ($name eq "exon") { | |
840 $desc=$entryfeature->{'qualifiers'}->{'number'}; | |
841 if ($entryfeature->{'qualifiers'}->{'note'}) { | |
842 if ($desc) { | |
843 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; | |
844 } else { | |
845 $desc = $entryfeature->{'qualifiers'}->{'note'}; | |
846 } | |
847 } | |
848 push (@exondescs,$desc || "unknown"); | |
849 push(@exons,\@range); | |
850 } | |
851 if ($name eq "intron") { | |
852 $desc=$entryfeature->{'qualifiers'}->{'number'}; | |
853 if ($desc) { | |
854 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; | |
855 } else { | |
856 $desc = $entryfeature->{'qualifiers'}->{'note'}; | |
857 } | |
858 push (@introndescs,$desc || "unknown"); | |
859 push(@introns,\@range); | |
860 } | |
861 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } | |
862 if ($name eq "repeat_unit") { push(@repeat_units,\@range); | |
863 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; | |
864 push (@repeat_units_family,$rpt_family || "unknown"); | |
865 } | |
866 if ($name eq "repeat_region") { push(@repeat_regions,\@range); | |
867 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; | |
868 push (@repeat_regions_family,$rpt_family || "unknown"); | |
869 } | |
870 } | |
871 } | |
872 } | |
873 unless ($gene_name) { $gene_name="cds-position:".$cds_position; } | |
874 my %genefeatureshash; | |
875 $genefeatureshash{gene_name}=$gene_name; | |
876 $genefeatureshash{genefeatures}=\@genefeatures; | |
877 $genefeatureshash{labels}=\@labels; | |
878 $genefeatureshash{ttables}=\@ttables; | |
879 $genefeatureshash{swisshashes}=\@swisshashes; | |
880 $genefeatureshash{transcripts}=\@transcripts; | |
881 $genefeatureshash{exons}=\@exons; | |
882 $genefeatureshash{exondescs}=\@exondescs; | |
883 $genefeatureshash{introns}=\@introns; | |
884 $genefeatureshash{introndescs}=\@introndescs; | |
885 $genefeatureshash{prim_transcripts}=\@prim_transcripts; | |
886 $genefeatureshash{repeat_units}=\@repeat_units; | |
887 $genefeatureshash{repeat_regions}=\@repeat_regions; | |
888 $genefeatureshash{repeat_units_family}=\@repeat_units_family; | |
889 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family; | |
890 return (\%genefeatureshash); | |
891 } | |
892 | |
893 | |
894 # used by _findgenefeatures, when a CDS at a certain position is requested, | |
895 # to retrieve only features quite close to the wanted CDS. | |
896 # Args: range hashref, begin and end positions of the CDS, $proximity | |
897 # $proximity holds the maximum distance between the extremes of the CDS | |
898 # and of the feature under exam. | |
899 # Returns: boolean | |
900 sub _checkfeatureproximity { | |
901 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; | |
902 my @range=@{$range}; | |
903 my ($begin,$end,$strand); | |
904 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons | |
905 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); | |
906 } else { | |
907 ($begin,$end,$strand)=@range; | |
908 } | |
909 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS | |
910 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries | |
911 } | |
912 if ($strand == -1) { # reverse strand | |
913 ($begin,$end)=($end,$begin); # swap boundaries | |
914 } | |
915 if (($cds_begin-$end)>$proximity) { | |
916 carp "--DEBUG-- feature rejected: begin $begin end $end -------"; | |
917 return (0); | |
918 } | |
919 if (($begin-$cds_end)>$proximity) { | |
920 carp "--DEBUG-- feature rejected: begin $begin end $end -------"; | |
921 return (0); | |
922 } | |
923 carp "--DEBUG-- feature accepted: begin $begin end $end -------"; | |
924 return (1); # otherwise ok, feature considered next to CDS | |
925 } | |
926 | |
927 | |
928 # function that calls the external program "align" (on the fasta2 package) | |
929 # to create an alignment between two sequences, returning the aligned | |
930 # strings and the codes for the identity (:: ::::) | |
931 | |
932 sub _get_alignment { | |
933 my ($self,$seq1,$seq2)=@_; | |
934 my $fastafile1="/tmp/tmpfastafile1"; | |
935 my $fastafile2="/tmp/tmpfastafile2"; | |
936 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut | |
937 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN | |
938 open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment"; | |
939 open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment"; | |
940 print TMPFASTAFILE1 ">firstseq\n$seq1\n"; | |
941 print TMPFASTAFILE2 ">secondseq\n$seq2\n"; | |
942 close TMPFASTAFILE1; | |
943 close TMPFASTAFILE2; | |
944 my $alignment=`$alignprogram`; | |
945 my @alignlines=split(/\n/,$alignment); | |
946 my ($linecount,$seq1_aligned,$seq2_aligned,$codes); | |
947 for ($linecount=0; $linecount < @alignlines; $linecount+=3) { | |
948 $seq1_aligned .= $alignlines[$linecount]; | |
949 $codes .= $alignlines[$linecount+1]; | |
950 $seq2_aligned .= $alignlines[$linecount+2]; | |
951 } | |
952 return ($seq1_aligned,$seq2_aligned,$codes); | |
953 } | |
954 | |
955 # common part of the function to create a novel liveseq gene structure | |
956 # from an amino acid sequence, using codon usage frequencies | |
957 # args: codon_usage_array transltableid aasequence gene_name | |
958 sub _common_novelaasequence2gene { | |
959 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; | |
960 my @species_codon_usage=@{$species_codon_usage}; | |
961 my @codon_usage_label= | |
962 qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg | |
963 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga | |
964 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa | |
965 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag | |
966 tga); | |
967 my ($i,$j); | |
968 my %codon_usage_value; | |
969 my $aa_codon_total; | |
970 for ($i=0;$i<64;$i++) { | |
971 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; | |
972 } | |
973 | |
974 my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); | |
975 my @aminoacids = split(//,uc($aasequence)); | |
976 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); | |
977 for $i (@aminoacids) { | |
978 @alt_codons = $CodonTable->revtranslate($i); | |
979 unless (@alt_codons) { | |
980 carp "No reverse translation possible for aminoacid \'$i\'"; | |
981 $dnasequence .= "???"; | |
982 } else { | |
983 $aa_codon_total=0; | |
984 for $j (@alt_codons) { | |
985 $aa_codon_total+=$codon_usage_value{$j}; | |
986 } | |
987 # print "aminoacid $i, codonchoice: "; # verbose | |
988 #$partial=0; | |
989 #for $j (@alt_codons) { | |
990 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; | |
991 #$partial+=($codon_usage_value{$j}/$aa_codon_total); | |
992 #} | |
993 #print "\n"; | |
994 $dice=rand(1); | |
995 #print "roulette: $dice\n"; # verbose | |
996 $partial=0; | |
997 $chosen_codon=""; | |
998 CODONCHOICE: | |
999 for $j (0..@alt_codons) { # last one not accounted | |
1000 $thiscodon=$alt_codons[$j]; | |
1001 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); | |
1002 if ($dice < $relativeusage+$partial) { | |
1003 $chosen_codon=$thiscodon; | |
1004 last CODONCHOICE; | |
1005 } else { | |
1006 $partial += $relativeusage; | |
1007 } | |
1008 } | |
1009 unless ($chosen_codon) { | |
1010 $chosen_codon = $alt_codons[-1]; # the last one | |
1011 } | |
1012 # print ".....adding $chosen_codon\n"; # verbose | |
1013 $dnasequence .= $chosen_codon; | |
1014 } | |
1015 } | |
1016 | |
1017 my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); | |
1018 my $min=1; | |
1019 my $max=length($dnasequence); | |
1020 my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); | |
1021 my @exons=($exon); | |
1022 my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); | |
1023 $transcript->translation_table($ttabid); | |
1024 my @transcripts=($transcript); | |
1025 my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); | |
1026 my @translations=($translation); | |
1027 my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); | |
1028 my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); | |
1029 | |
1030 # creation of gene | |
1031 unless ($gene) { # if $gene == 0 it means problems in hash2gene | |
1032 carp "Error in Gene creation phase"; | |
1033 return (0); | |
1034 } | |
1035 return $gene; | |
1036 } | |
1037 | |
1038 1; |