0
|
1 # $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $
|
|
2 #
|
|
3 # bioperl module for Bio::LiveSeq::IO::Loader
|
|
4 #
|
|
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
|
|
6 #
|
|
7 # Copyright Joseph Insana
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 #documentation needed
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This package holds common methods used by BioPerl, SRS and file loaders.
|
|
24 It contains methods to create LiveSeq objects out of entire entries or from a
|
|
25 localized sequence region surrounding a particular gene.
|
|
26
|
|
27 =head1 AUTHOR - Joseph A.L. Insana
|
|
28
|
|
29 Email: Insana@ebi.ac.uk, jinsana@gmx.net
|
|
30
|
|
31 Address:
|
|
32
|
|
33 EMBL Outstation, European Bioinformatics Institute
|
|
34 Wellcome Trust Genome Campus, Hinxton
|
|
35 Cambs. CB10 1SD, United Kingdom
|
|
36
|
|
37 =head1 APPENDIX
|
|
38
|
|
39 The rest of the documentation details each of the object
|
|
40 methods. Internal methods are usually preceded with a _
|
|
41
|
|
42 =cut
|
|
43
|
|
44 # Let the code begin...
|
|
45
|
|
46 package Bio::LiveSeq::IO::Loader;
|
|
47 $VERSION=4.44;
|
|
48
|
|
49 # Version history:
|
|
50 # Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS
|
|
51 # Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader
|
|
52 # Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl
|
|
53 # Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package
|
|
54 # Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping
|
|
55 # Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl
|
|
56 # Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is
|
|
57 # included as valid qualifier name -> investigate
|
|
58 # Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl()
|
|
59 # Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader
|
|
60 # Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more
|
|
61 # the only default feature possibly asked for
|
|
62 # Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene
|
|
63 # Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten
|
|
64 # cdslocation2transcript
|
|
65 # Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene
|
|
66 # Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :)
|
|
67 # Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene
|
|
68 # Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance
|
|
69 # Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range
|
|
70 # Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry!
|
|
71 # Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created
|
|
72 # Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is
|
|
73 # just 1 CDS in entry it will use all
|
|
74 # features of the entry as Gene features
|
|
75 # Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added
|
|
76 # Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created
|
|
77 # Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced
|
|
78 # Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option
|
|
79 # Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object
|
|
80 # Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene
|
|
81 # Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq
|
|
82 # Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity
|
|
83 # Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged
|
|
84 # Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP
|
|
85 # Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW
|
|
86 # Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones
|
|
87 # Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET
|
|
88 # Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on
|
|
89 # Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions
|
|
90 # Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation()
|
|
91 # Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene
|
|
92
|
|
93 # Note: test_transl has been left as deprecated and is not really supported now
|
|
94
|
|
95 use strict;
|
|
96 use Carp qw(cluck croak carp);
|
|
97 use vars qw($VERSION @ISA);
|
|
98 use Bio::LiveSeq::DNA 1.2;
|
|
99 use Bio::LiveSeq::Exon 1.0;
|
|
100 use Bio::LiveSeq::Transcript 2.4;
|
|
101 use Bio::LiveSeq::Translation 1.4;
|
|
102 use Bio::LiveSeq::Gene 1.1;
|
|
103 use Bio::LiveSeq::Intron 1.0;
|
|
104 use Bio::LiveSeq::Prim_Transcript 1.0;
|
|
105 use Bio::LiveSeq::Repeat_Region 1.0;
|
|
106 use Bio::LiveSeq::Repeat_Unit 1.0;
|
|
107 use Bio::LiveSeq::AARange 1.4;
|
|
108 use Bio::Tools::CodonTable;
|
|
109
|
|
110 #@ISA=qw(Bio::LiveSeq::); # not useful now
|
|
111
|
|
112 =head2 entry2liveseq
|
|
113
|
|
114 Title : entry2liveseq
|
|
115 Usage : @translationobjects=$loader->entry2liveseq();
|
|
116 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
|
|
117 Function: creates LiveSeq objects from an entry previously loaded
|
|
118 Returns : array of references to objects of class Translation
|
|
119 Errorcode 0
|
|
120 Args : optional boolean flag to avoid the retrieval of SwissProt
|
|
121 informations for all Transcripts containing SwissProt x-reference
|
|
122 default is 1 (to retrieve those informations and create AARange
|
|
123 LiveSeq objects)
|
|
124 Note : this method can get really slow for big entries. The lightweight
|
|
125 gene2liveseq method is recommended
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub entry2liveseq {
|
|
130 my ($self, %args) = @_;
|
|
131 my ($getswissprotinfo)=($args{-getswissprotinfo});
|
|
132 if (defined($getswissprotinfo)) {
|
|
133 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
|
|
134 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
|
|
135 $getswissprotinfo=0;
|
|
136 }
|
|
137 } else {
|
|
138 $getswissprotinfo=1;
|
|
139 }
|
|
140 my $hashref=$self->{'hash'};
|
|
141 unless ($hashref) { return (0); }
|
|
142 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
|
|
143 my $test_transl=0;
|
|
144 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
|
|
145 return @translationobjects;
|
|
146 }
|
|
147
|
|
148 =head2 novelaasequence2gene
|
|
149
|
|
150 Title : novelaasequence2gene
|
|
151 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
|
|
152 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
|
|
153 -taxon => 9606,
|
|
154 -gene_name => "tyr-kinase");
|
|
155
|
|
156 Function: creates LiveSeq objects from a novel amino acid sequence,
|
|
157 using codon usage database to choose codons according to
|
|
158 relative frequencies.
|
|
159 If a taxon ID is not specified, the default is to use the human
|
|
160 one (taxonomy ID 9606).
|
|
161 Returns : reference to a Gene object containing references to LiveSeq objects
|
|
162 Errorcode 0
|
|
163 Args : string containing an amino acid sequence
|
|
164 integer (optional) with a taxonomy ID
|
|
165 string specifying a gene name
|
|
166
|
|
167 =cut
|
|
168
|
|
169 =head2 gene2liveseq
|
|
170
|
|
171 Title : gene2liveseq
|
|
172 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
|
|
173 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
|
|
174 -flanking => 64);
|
|
175 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
|
|
176 -getswissprotinfo => 0);
|
|
177 : $gene=$loader->gene2liveseq(-position => 4);
|
|
178
|
|
179 Function: creates LiveSeq objects from an entry previously loaded
|
|
180 It is a "light weight" creation because it creates
|
|
181 a LiveSequence just for the interesting region in an entry
|
|
182 (instead than for the total entry, like in entry2liveseq) and for
|
|
183 the flanking regions up to 500 nucleotides (default) or up to
|
|
184 the specified amount of nucleotides (given as argument) around the
|
|
185 Gene.
|
|
186 Returns : reference to a Gene object containing possibly alternative
|
|
187 Transcripts.
|
|
188 Errorcode 0
|
|
189 Args : string containing the gene name as in the EMBL feature qualifier
|
|
190 integer (optional) "flanking": amount of flanking bases to be kept
|
|
191 boolean (optional) "getswissprotinfo": if set to "0" it will avoid
|
|
192 trying to fetch information from a crossreference to a SwissProt
|
|
193 entry, avoding the process of creation of AARange objects
|
|
194 It is "1" (on) by default
|
|
195
|
|
196 Alternative to a gene_name, a position can be given: an
|
|
197 integer (1-) containing the position of the desired CDS in the
|
|
198 loaded entry
|
|
199
|
|
200 =cut
|
|
201
|
|
202 sub gene2liveseq {
|
|
203 my ($self, %args) = @_;
|
|
204 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
|
|
205 my $input;
|
|
206 unless (($gene_name)||($cds_position)) {
|
|
207 carp "Gene_Name or Position not specified for gene2liveseq loading function";
|
|
208 return (0);
|
|
209 }
|
|
210 if (($gene_name)&&($cds_position)) {
|
|
211 carp "Gene_Name and Position cannot be given together, use one";
|
|
212 return (0);
|
|
213 } elsif ($gene_name) {
|
|
214 $input=$gene_name;
|
|
215 } else {
|
|
216 $input="cds-position:".$cds_position;
|
|
217 }
|
|
218
|
|
219 if (defined($getswissprotinfo)) {
|
|
220 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
|
|
221 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
|
|
222 $getswissprotinfo=0;
|
|
223 }
|
|
224 } else {
|
|
225 $getswissprotinfo=1;
|
|
226 }
|
|
227
|
|
228 if (defined($flanking)) {
|
|
229 unless ($flanking >= 0) {
|
|
230 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
|
|
231 return (0);
|
|
232 }
|
|
233 } else {
|
|
234 $flanking=500; # the default flanking length
|
|
235 }
|
|
236 my $hashref=$self->{'hash'};
|
|
237 unless ($hashref) { return (0); }
|
|
238 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
|
|
239 unless ($gene) { # if $gene == 0 it means problems in hash2gene
|
|
240 carp "gene2liveseq produced error";
|
|
241 return (0);
|
|
242 }
|
|
243 return $gene;
|
|
244 }
|
|
245
|
|
246 # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
|
|
247 # this method is for now deprecated and not supported
|
|
248 sub test_transl {
|
|
249 my ($self,$entry)=@_;
|
|
250 my @features=@{$entry->{'Features'}};
|
|
251 my @translationobjects=@{$_[1]};
|
|
252 my ($i,$translation);
|
|
253 my ($obj_transl,$hash_transl);
|
|
254 my @cds=@{$entry->{'CDS'}};
|
|
255 foreach $translation (@translationobjects) {
|
|
256 $obj_transl=$translation->seq;
|
|
257 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
|
|
258 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
|
|
259 unless ($obj_transl eq $hash_transl) {
|
|
260 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
|
|
261 carp "\nEntry's transl: ",$hash_transl,"\n";
|
|
262 carp "\nObject's transl: ",$obj_transl,"\n";
|
|
263 exit;
|
|
264 }
|
|
265 $i++;
|
|
266 }
|
|
267 }
|
|
268
|
|
269 # argument: hashref containing the EMBL entry datas,
|
|
270 # getswissprotinfo boolean flag
|
|
271 # creates the liveseq objects
|
|
272 # returns: an array of Translation object references
|
|
273 sub hash2liveseq {
|
|
274 my ($self,$entry,$getswissprotinfo)=@_;
|
|
275 my $i;
|
|
276 my @transcripts;
|
|
277 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
|
|
278 $dna->alphabet(lc($entry->{'Molecule'}));
|
|
279 $dna->display_id($entry->{'ID'});
|
|
280 $dna->accession_number($entry->{'AccNumber'});
|
|
281 $dna->desc($entry->{'Description'});
|
|
282 my @cds=@{$entry->{'CDS'}};
|
|
283 my ($swissacc,$swisshash); my @swisshashes;
|
|
284 for $i (0..$#cds) {
|
|
285 #my @transcript=@{$cds[$i]->{'range'}};
|
|
286 #$transcript=\@transcript;
|
|
287 #push (@transcripts,$transcript);
|
|
288 push (@transcripts,$cds[$i]->{'range'});
|
|
289 if ($getswissprotinfo) {
|
|
290 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
|
|
291 $swisshash=$self->get_swisshash($swissacc);
|
|
292 #$self->printswissprot($swisshash); # DEBUG
|
|
293 push (@swisshashes,$swisshash);
|
|
294 }
|
|
295 }
|
|
296 my @translations=($self->transexonscreation($dna,\@transcripts));
|
|
297 my $translation; my $j=0;
|
|
298 foreach $translation (@translations) {
|
|
299 if ($swisshashes[$j]) { # if not 0
|
|
300 $self->swisshash2liveseq($swisshashes[$j],$translation);
|
|
301 }
|
|
302 $j++;
|
|
303 }
|
|
304 return (@translations);
|
|
305 }
|
|
306
|
|
307 # only features pertaining to a specified gene are created
|
|
308 # only the sequence of the gene and appropriate context flanking regions
|
|
309 # are created as chain
|
|
310 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
|
|
311 # returns: reference to Gene object
|
|
312 #
|
|
313 # Note: if entry contains just one CDS, all the features get added
|
|
314 # this is useful because often the features in these entries do not
|
|
315 # carry the /gene qualifier
|
|
316 #
|
|
317 # errorcode: 0
|
|
318 sub hash2gene {
|
|
319 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
|
|
320 my $entryfeature;
|
|
321 my $genefeatureshash;
|
|
322
|
|
323 my @cds=@{$entry->{'CDS'}};
|
|
324
|
|
325 # checking if a position has been given instead than a gene_name
|
|
326 if (index($input,"cds-position:") == 0 ) {
|
|
327 my $cds_position=substr($input,13); # extracting the cds position
|
|
328 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
|
|
329 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
|
|
330 }
|
|
331 } else {
|
|
332 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
|
|
333 }
|
|
334
|
|
335 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
|
|
336 my @genes=$self->genes($entry);
|
|
337 my $cds_number=scalar(@cds);
|
|
338 warn "Warning! Not even one genefeature found for /$input/....
|
|
339 The genes present in this entry are:\n\t@genes\n
|
|
340 The number of CDS in this entry is:\n\t$cds_number\n";
|
|
341 return(0);
|
|
342 }
|
|
343
|
|
344 # get max and min, check flankings
|
|
345 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
|
|
346 my $seqlength=$entry->{'SeqLength'};
|
|
347 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
|
|
348 if ($min-$flanking < 1) {
|
|
349 $mindna=1;
|
|
350 } else {
|
|
351 $mindna=$min-$flanking;
|
|
352 }
|
|
353 if ($max+$flanking > $seqlength) {
|
|
354 $maxdna=$seqlength;
|
|
355 } else {
|
|
356 $maxdna=$max+$flanking;
|
|
357 }
|
|
358 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
|
|
359
|
|
360 # create LiveSeq objects
|
|
361
|
|
362 # create DNA
|
|
363 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
|
|
364 $dna->alphabet(lc($entry->{'Molecule'}));
|
|
365 $dna->source($entry->{'Organism'});
|
|
366 $dna->display_id($entry->{'ID'});
|
|
367 $dna->accession_number($entry->{'AccNumber'});
|
|
368 $dna->desc($entry->{'Description'});
|
|
369
|
|
370 my @transcripts=@{$genefeatureshash->{'transcripts'}};
|
|
371 # create Translations, Transcripts, Exons out of the CDS
|
|
372 unless (@transcripts) {
|
|
373 cluck "no CDS feature found for /$input/....";
|
|
374 return(0);
|
|
375 }
|
|
376 my @translationobjs=$self->transexonscreation($dna,\@transcripts);
|
|
377 my @transcriptobjs;
|
|
378
|
|
379 # get the Transcript obj_refs
|
|
380 my $translation;
|
|
381 my $j=0;
|
|
382 my @ttables=@{$genefeatureshash->{'ttables'}};
|
|
383 my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
|
|
384 foreach $translation (@translationobjs) {
|
|
385 push(@transcriptobjs,$translation->get_Transcript);
|
|
386 if ($ttables[$j]) { # if not undef
|
|
387 $translation->get_Transcript->translation_table($ttables[$j]);
|
|
388 #} else { # DEBUG
|
|
389 # print "\n\t\tno translation table information....\n";
|
|
390 }
|
|
391 if ($swisshashes[$j]) { # if not 0
|
|
392 $self->swisshash2liveseq($swisshashes[$j],$translation);
|
|
393 }
|
|
394 $j++;
|
|
395 }
|
|
396
|
|
397 my %gene; # this is the hash to store created object references
|
|
398 $gene{DNA}=$dna;
|
|
399 $gene{Transcripts}=\@transcriptobjs;
|
|
400 $gene{Translations}=\@translationobjs;
|
|
401
|
|
402 my @exonobjs; my @intronobjs;
|
|
403 my @repeatunitobjs; my @repeatregionobjs;
|
|
404 my @primtranscriptobjs;
|
|
405
|
|
406 my ($object,$range,$start,$end,$strand);
|
|
407
|
|
408 my @exons=@{$genefeatureshash->{'exons'}};
|
|
409 my @exondescs=@{$genefeatureshash->{'exondescs'}};
|
|
410 if (@exons) {
|
|
411 my $exoncount = 0;
|
|
412 foreach $range (@exons) {
|
|
413 ($start,$end,$strand)=@{$range};
|
|
414 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
415 if ($object != -1) {
|
|
416 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
|
|
417 $exoncount++;
|
|
418 push (@exonobjs,$object);
|
|
419 } else {
|
|
420 $exoncount++;
|
|
421 }
|
|
422 }
|
|
423 $gene{Exons}=\@exonobjs;
|
|
424 }
|
|
425 my @introns=@{$genefeatureshash->{'introns'}};
|
|
426 my @introndescs=@{$genefeatureshash->{'introndescs'}};
|
|
427 if (@introns) {
|
|
428 my $introncount = 0;
|
|
429 foreach $range (@introns) {
|
|
430 ($start,$end,$strand)=@{$range};
|
|
431 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
432 if ($object != -1) {
|
|
433 $object->desc($introndescs[$introncount]);
|
|
434 $introncount++;
|
|
435 push (@intronobjs,$object);
|
|
436 } else {
|
|
437 $introncount++;
|
|
438 }
|
|
439 }
|
|
440 $gene{Introns}=\@intronobjs;
|
|
441 }
|
|
442 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
|
|
443 if (@prim_transcripts) {
|
|
444 foreach $range (@prim_transcripts) {
|
|
445 ($start,$end,$strand)=@{$range};
|
|
446 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
447 if ($object != -1) { push (@primtranscriptobjs,$object); }
|
|
448 }
|
|
449 $gene{Prim_Transcripts}=\@primtranscriptobjs;
|
|
450 }
|
|
451 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
|
|
452 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
|
|
453 if (@repeat_regions) {
|
|
454 my $k=0;
|
|
455 foreach $range (@repeat_regions) {
|
|
456 ($start,$end,$strand)=@{$range};
|
|
457 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
458 if ($object != -1) {
|
|
459 $object->desc($repeat_regions_family[$k]);
|
|
460 $k++;
|
|
461 push (@repeatregionobjs,$object);
|
|
462 } else {
|
|
463 $k++;
|
|
464 }
|
|
465 }
|
|
466 $gene{Repeat_Regions}=\@repeatregionobjs;
|
|
467 }
|
|
468 my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
|
|
469 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
|
|
470 if (@repeat_units) {
|
|
471 my $k=0;
|
|
472 foreach $range (@repeat_units) {
|
|
473 ($start,$end,$strand)=@{$range};
|
|
474 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
475 if ($object != -1) {
|
|
476 $object->desc($repeat_units_family[$k]);
|
|
477 $k++;
|
|
478 push (@repeatunitobjs,$object);
|
|
479 } else {
|
|
480 $k++;
|
|
481 }
|
|
482 }
|
|
483 $gene{Repeat_Units}=\@repeatunitobjs;
|
|
484 }
|
|
485
|
|
486 # create the Gene
|
|
487 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
|
|
488 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
|
|
489 -upbound=>$min,-downbound=>$max));
|
|
490 }
|
|
491
|
|
492 # maybe this function will be moved to general utility package
|
|
493 # argument: array of numbers
|
|
494 # returns: (min,max) numbers in the array
|
|
495 sub rangeofarray {
|
|
496 my $self=shift;
|
|
497 my @array=@_;
|
|
498 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
|
|
499 my ($max,$min,$element);
|
|
500 $min=$max=shift(@array);
|
|
501 foreach $element (@array) {
|
|
502 $element = 0 unless defined $element;
|
|
503 if ($element < $min) {
|
|
504 $min=$element;
|
|
505 }
|
|
506 if ($element > $max) {
|
|
507 $max=$element;
|
|
508 }
|
|
509 }
|
|
510 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
|
|
511 return ($min,$max);
|
|
512 }
|
|
513
|
|
514
|
|
515 # argument: reference to DNA object, reference to array of transcripts
|
|
516 # returns: an array of Translation object references
|
|
517 sub transexonscreation {
|
|
518 my $self=shift;
|
|
519 my $dna=$_[0];
|
|
520 my @transcripts=@{$_[1]};
|
|
521
|
|
522 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
|
|
523 my $translationobj;
|
|
524 my @translationobjects;
|
|
525 foreach $transcript (@transcripts) {
|
|
526 foreach $exonref (@{$transcript}) {
|
|
527 ($start,$end,$strand)=@{$exonref};
|
|
528 #print "Creating Exon: start $start end $end strand $strand\n";
|
|
529 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
|
|
530 #push (@exonobjects,$exonobj);
|
|
531 push (@transexons,$exonobj);
|
|
532 }
|
|
533 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
|
|
534 if ($transcriptobj != -1) {
|
|
535 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
|
|
536 @transexons=(); # cleans it
|
|
537 #push (@transcriptobjects,$transcriptobj);
|
|
538 push (@translationobjects,$translationobj);
|
|
539 }
|
|
540 }
|
|
541 return (@translationobjects);
|
|
542 }
|
|
543
|
|
544 #sub printgene {
|
|
545 # deleted. Some functionality placed in Gene->printfeaturesnum
|
|
546
|
|
547 =head2 printswissprot
|
|
548
|
|
549 Title : printswissprot
|
|
550 Usage : $loader->printswissprot($hashref);
|
|
551 Function: prints out all informations loaded from a database entry into the
|
|
552 loader. Mainly used for testing purposes.
|
|
553 Args : a hashref containing the SWISSPROT entry datas
|
|
554 Note : the hashref can be obtained with a call to the method
|
|
555 $loader->get_swisshash() (only with SRS loader or
|
|
556 BioPerl via Bio::DB::EMBL.pm)
|
|
557 that takes as argument a string like "SWISS-PROT:P10275" or
|
|
558 from $loader->swissprot2hash() that takes an SRS query string
|
|
559 as its argument (e.g. "swissprot-acc:P10275")
|
|
560
|
|
561 =cut
|
|
562
|
|
563 # argument: hashref containing the SWISSPROT entry datas
|
|
564 # prints out that hash, showing the informations loaded
|
|
565 sub printswissprot {
|
|
566 my ($self,$entry)=@_;
|
|
567 unless ($entry) {
|
|
568 return;
|
|
569 }
|
|
570 printf "ID: %s\n",
|
|
571 $entry->{'ID'};
|
|
572 printf "ACC: %s\n",
|
|
573 $entry->{'AccNumber'};
|
|
574 printf "GENE: %s\n",
|
|
575 $entry->{'Gene'};
|
|
576 printf "DES: %s\n",
|
|
577 $entry->{'Description'};
|
|
578 printf "ORG: %s\n",
|
|
579 $entry->{'Organism'};
|
|
580 printf "SEQLN: %s\n",
|
|
581 $entry->{'SeqLength'};
|
|
582 printf "SEQ: %s\n",
|
|
583 substr($entry->{'Sequence'},0,64);
|
|
584 if ($entry->{'Features'}) {
|
|
585 my @features=@{$entry->{'Features'}};
|
|
586 my $i;
|
|
587 for $i (0..$#features) {
|
|
588 print "|",$features[$i]->{'name'},"|";
|
|
589 print " at ",$features[$i]->{'location'},": ";
|
|
590 print "",$features[$i]->{'desc'},"\n";
|
|
591 }
|
|
592 }
|
|
593 }
|
|
594
|
|
595 =head2 printembl
|
|
596
|
|
597 Title : printembl
|
|
598 Usage : $loader->printembl();
|
|
599 Function: prints out all informations loaded from a database entry into the
|
|
600 loader. Mainly used for testing purposes.
|
|
601 Args : none
|
|
602
|
|
603 =cut
|
|
604
|
|
605 # argument: hashref containing the EMBL entry datas
|
|
606 # prints out that hash, showing the informations loaded
|
|
607 sub printembl {
|
|
608 my ($self,$entry)=@_;
|
|
609 unless ($entry) {
|
|
610 $entry=$self->{'hash'};
|
|
611 }
|
|
612 my ($i,$featurename);
|
|
613 printf "ID: %s\n",
|
|
614 $entry->{'ID'};
|
|
615 printf "ACC: %s\n",
|
|
616 $entry->{'AccNumber'};
|
|
617 printf "MOL: %s\n",
|
|
618 $entry->{'Molecule'};
|
|
619 printf "DIV: %s\n",
|
|
620 $entry->{'Division'};
|
|
621 printf "DES: %s\n",
|
|
622 $entry->{'Description'};
|
|
623 printf "ORG: %s\n",
|
|
624 $entry->{'Organism'};
|
|
625 printf "SEQLN: %s\n",
|
|
626 $entry->{'SeqLength'};
|
|
627 printf "SEQ: %s\n",
|
|
628 substr($entry->{'Sequence'},0,64);
|
|
629 my @features=@{$entry->{'Features'}};
|
|
630 my @cds=@{$entry->{'CDS'}};
|
|
631 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
|
|
632 my ($exonref,$transcript);
|
|
633 my ($qualifiernumber,$qualifiers,$key);
|
|
634 my ($start,$end,$strand);
|
|
635 my $j=0;
|
|
636 for $i (0..$#features) {
|
|
637 $featurename=$features[$i]->{'name'};
|
|
638 if ($featurename eq "CDS") {
|
|
639 print "|CDS| number $j at feature position: $i\n";
|
|
640 #print $features[$i]->{'location'},"\n";
|
|
641 $transcript=$features[$i]->{'range'};
|
|
642 foreach $exonref (@{$transcript}) {
|
|
643 ($start,$end,$strand)=@{$exonref};
|
|
644 print "\tExon: start $start end $end strand $strand\n";
|
|
645 }
|
|
646 $j++;
|
|
647 } else {
|
|
648 print "|$featurename| at feature position: $i\n";
|
|
649 print "\trange: ";
|
|
650 print join("\t",@{$features[$i]->{'range'}}),"\n";
|
|
651 #print $features[$i]->{'location'},"\n";
|
|
652 }
|
|
653 $qualifiernumber=$features[$i]->{'qual_number'};
|
|
654 $qualifiers=$features[$i]->{'qualifiers'}; # hash
|
|
655 foreach $key (keys (%{$qualifiers})) {
|
|
656 print "\t\t",$key,": ";
|
|
657 print $qualifiers->{$key},"\n";
|
|
658 }
|
|
659 }
|
|
660 }
|
|
661
|
|
662 =head2 genes
|
|
663
|
|
664 Title : genes
|
|
665 Usage : $loader->genes();
|
|
666 Function: Returns an array of gene_names (strings) contained in the loaded
|
|
667 entry.
|
|
668 Args : none
|
|
669
|
|
670 =cut
|
|
671
|
|
672 # argument: entryhashref
|
|
673 # returns: array of genenames found in the entry
|
|
674 sub genes {
|
|
675 my ($self,$entry)=@_;
|
|
676 unless ($entry) {
|
|
677 $entry=$self->{'hash'};
|
|
678 }
|
|
679 my @entryfeatures=@{$entry->{'Features'}};
|
|
680 my ($genename,$genenames,$entryfeature);
|
|
681 for $entryfeature (@entryfeatures) {
|
|
682 $genename=$entryfeature->{'qualifiers'}->{'gene'};
|
|
683 if ($genename) {
|
|
684 if (index($genenames,$genename) == -1) { # if name is new
|
|
685 $genenames .= $genename . " "; # add the name
|
|
686 }
|
|
687 }
|
|
688 }
|
|
689 return (split(/ /,$genenames)); # assumes no space inbetween each genename
|
|
690 }
|
|
691
|
|
692 # arguments: swisshash, translation objref
|
|
693 # adds information to the Translation, creates AARange objects, sets the
|
|
694 # aa_range attribute on the Translation, pointing to those objects
|
|
695 sub swisshash2liveseq {
|
|
696 my ($self,$entry,$translation)=@_;
|
|
697 my $translength=$translation->length;
|
|
698 $translation->desc($translation->desc . $entry->{'Description'});
|
|
699 $translation->display_id("SWISSPROT:" . $entry->{'ID'});
|
|
700 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
|
|
701 $translation->name($entry->{'Gene'});
|
|
702 $translation->source($entry->{'Organism'});
|
|
703 my @aarangeobjects;
|
|
704 my ($start,$end,$aarangeobj,$feature);
|
|
705 my @features; my @newfeatures;
|
|
706 if ($entry->{'Features'}) {
|
|
707 @features=@{$entry->{'Features'}};
|
|
708 }
|
|
709 my $cleavedmet=0;
|
|
710 # check for cleaved Met
|
|
711 foreach $feature (@features) {
|
|
712 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
|
|
713 $cleavedmet=1;
|
|
714 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
|
|
715 } else {
|
|
716 push(@newfeatures,$feature);
|
|
717 }
|
|
718 }
|
|
719
|
|
720 my $swissseq=$entry->{'Sequence'};
|
|
721 my $liveseqtransl=$translation->seq;
|
|
722 chop $liveseqtransl; # to take away the trailing STOP "*"
|
|
723 my $translated=substr($liveseqtransl,$cleavedmet);
|
|
724
|
|
725 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
|
|
726
|
|
727 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
|
|
728 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
|
|
729 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
|
|
730 return;
|
|
731 }
|
|
732
|
|
733 #my $i=0; # debug
|
|
734 @features=@newfeatures;
|
|
735 foreach $feature (@features) {
|
|
736 #print "Processing SwissProtFeature: $i\n"; # debug
|
|
737 ($start,$end)=split(/ /,$feature->{'location'});
|
|
738 # Note: cleavedmet is taken in account for updating numbering
|
|
739 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength);
|
|
740 if ($aarangeobj != -1) {
|
|
741 push(@aarangeobjects,$aarangeobj);
|
|
742 }
|
|
743 # $i++; # debug
|
|
744 }
|
|
745 $translation->{'aa_ranges'}=\@aarangeobjects;
|
|
746 }
|
|
747
|
|
748 # if there is no SRS support, the default will be to return 0
|
|
749 # i.e. this function is overridden in SRS package
|
|
750 sub get_swisshash {
|
|
751 return (0);
|
|
752 }
|
|
753
|
|
754 # Args: $entry hashref, gene_name OR cds_position (undef is used to
|
|
755 # choose between the two), getswissprotinfo boolean flag
|
|
756 # Returns: an hash holding various arrayref used in the hash2gene method
|
|
757 # Function: examines the nucleotide entry, identifying features belonging
|
|
758 # to the gene (defined either by its name or by the position of its CDS in
|
|
759 # the entry)
|
|
760
|
|
761 sub _findgenefeatures {
|
|
762 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
|
|
763
|
|
764 my @entryfeatures=@{$entry->{'Features'}};
|
|
765 my @exons; my @introns; my @prim_transcripts; my @transcripts;
|
|
766 my @repeat_units; my @repeat_regions;
|
|
767 my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
|
|
768 my $entryfeature; my @genefeatures;
|
|
769 my $desc; my @exondescs; my @introndescs;
|
|
770
|
|
771 # for swissprot xreference
|
|
772 my ($swissacc,$swisshash); my @swisshashes;
|
|
773
|
|
774 # for translation_tables
|
|
775 my @ttables;
|
|
776
|
|
777 # to create labels
|
|
778 my ($name,$exon);
|
|
779 my @range; my @cdsexons; my @labels;
|
|
780
|
|
781 # maybe here also could be added special case when there is no CDS feature
|
|
782 # in the entry (e.g. tRNA entry -> TOCHECK).
|
|
783 # let's deal with the special case in which there is just one gene per entry
|
|
784 # usually without /gene qualifier
|
|
785 my @cds=@{$entry->{'CDS'}};
|
|
786
|
|
787 my $skipgenematch=0;
|
|
788 if (scalar(@cds) == 1) {
|
|
789 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
|
|
790 $skipgenematch=1;
|
|
791 }
|
|
792
|
|
793 my ($cds_begin,$cds_end,$proximity);
|
|
794 if ($cds_position) { # if a position has been requested
|
|
795 my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
|
|
796 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
|
|
797 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
|
|
798 # DEBUG
|
|
799 unless ($skipgenematch) {
|
|
800 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
|
|
801 }
|
|
802 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
|
|
803 }
|
|
804
|
|
805 for $entryfeature (@entryfeatures) { # get only features for the desired gene
|
|
806 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
|
|
807 push(@genefeatures,$entryfeature);
|
|
808
|
|
809 my @range=@{$entryfeature->{'range'}};
|
|
810 $name=$entryfeature->{'name'};
|
|
811 my %qualifierhash=%{$entryfeature->{'qualifiers'}};
|
|
812 if ($name eq "CDS") { # that has range containing array of exons
|
|
813
|
|
814 # swissprot crossindexing (if without SRS support it will fill array
|
|
815 # with zeros and do nothing
|
|
816 if ($getswissprotinfo) {
|
|
817 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
|
|
818 $swisshash=$self->get_swisshash($swissacc);
|
|
819 #$self->printswissprot($swisshash); # DEBUG
|
|
820 push (@swisshashes,$swisshash);
|
|
821 }
|
|
822
|
|
823 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
|
|
824
|
|
825 # create labels array
|
|
826 for $exon (@range) {
|
|
827 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
|
|
828 }
|
|
829 push (@transcripts,$entryfeature->{'range'});
|
|
830 } else {
|
|
831 # "simplifying" the joinedlocation features. I.e. changing them from
|
|
832 # multijoined ones to simple plain start-end features, taking only
|
|
833 # the start of the first "exon" and the end of the last "exon" as
|
|
834 # start and end of the entire feature
|
|
835 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
|
|
836 @range=($range[0]->[0],$range[-1]->[1]);
|
|
837 }
|
|
838 push(@labels,$range[0],$range[1]); # start and end of every feature
|
|
839 if ($name eq "exon") {
|
|
840 $desc=$entryfeature->{'qualifiers'}->{'number'};
|
|
841 if ($entryfeature->{'qualifiers'}->{'note'}) {
|
|
842 if ($desc) {
|
|
843 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
|
|
844 } else {
|
|
845 $desc = $entryfeature->{'qualifiers'}->{'note'};
|
|
846 }
|
|
847 }
|
|
848 push (@exondescs,$desc || "unknown");
|
|
849 push(@exons,\@range);
|
|
850 }
|
|
851 if ($name eq "intron") {
|
|
852 $desc=$entryfeature->{'qualifiers'}->{'number'};
|
|
853 if ($desc) {
|
|
854 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
|
|
855 } else {
|
|
856 $desc = $entryfeature->{'qualifiers'}->{'note'};
|
|
857 }
|
|
858 push (@introndescs,$desc || "unknown");
|
|
859 push(@introns,\@range);
|
|
860 }
|
|
861 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); }
|
|
862 if ($name eq "repeat_unit") { push(@repeat_units,\@range);
|
|
863 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
|
|
864 push (@repeat_units_family,$rpt_family || "unknown");
|
|
865 }
|
|
866 if ($name eq "repeat_region") { push(@repeat_regions,\@range);
|
|
867 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
|
|
868 push (@repeat_regions_family,$rpt_family || "unknown");
|
|
869 }
|
|
870 }
|
|
871 }
|
|
872 }
|
|
873 unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
|
|
874 my %genefeatureshash;
|
|
875 $genefeatureshash{gene_name}=$gene_name;
|
|
876 $genefeatureshash{genefeatures}=\@genefeatures;
|
|
877 $genefeatureshash{labels}=\@labels;
|
|
878 $genefeatureshash{ttables}=\@ttables;
|
|
879 $genefeatureshash{swisshashes}=\@swisshashes;
|
|
880 $genefeatureshash{transcripts}=\@transcripts;
|
|
881 $genefeatureshash{exons}=\@exons;
|
|
882 $genefeatureshash{exondescs}=\@exondescs;
|
|
883 $genefeatureshash{introns}=\@introns;
|
|
884 $genefeatureshash{introndescs}=\@introndescs;
|
|
885 $genefeatureshash{prim_transcripts}=\@prim_transcripts;
|
|
886 $genefeatureshash{repeat_units}=\@repeat_units;
|
|
887 $genefeatureshash{repeat_regions}=\@repeat_regions;
|
|
888 $genefeatureshash{repeat_units_family}=\@repeat_units_family;
|
|
889 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
|
|
890 return (\%genefeatureshash);
|
|
891 }
|
|
892
|
|
893
|
|
894 # used by _findgenefeatures, when a CDS at a certain position is requested,
|
|
895 # to retrieve only features quite close to the wanted CDS.
|
|
896 # Args: range hashref, begin and end positions of the CDS, $proximity
|
|
897 # $proximity holds the maximum distance between the extremes of the CDS
|
|
898 # and of the feature under exam.
|
|
899 # Returns: boolean
|
|
900 sub _checkfeatureproximity {
|
|
901 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
|
|
902 my @range=@{$range};
|
|
903 my ($begin,$end,$strand);
|
|
904 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
|
|
905 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
|
|
906 } else {
|
|
907 ($begin,$end,$strand)=@range;
|
|
908 }
|
|
909 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
|
|
910 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
|
|
911 }
|
|
912 if ($strand == -1) { # reverse strand
|
|
913 ($begin,$end)=($end,$begin); # swap boundaries
|
|
914 }
|
|
915 if (($cds_begin-$end)>$proximity) {
|
|
916 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
|
|
917 return (0);
|
|
918 }
|
|
919 if (($begin-$cds_end)>$proximity) {
|
|
920 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
|
|
921 return (0);
|
|
922 }
|
|
923 carp "--DEBUG-- feature accepted: begin $begin end $end -------";
|
|
924 return (1); # otherwise ok, feature considered next to CDS
|
|
925 }
|
|
926
|
|
927
|
|
928 # function that calls the external program "align" (on the fasta2 package)
|
|
929 # to create an alignment between two sequences, returning the aligned
|
|
930 # strings and the codes for the identity (:: ::::)
|
|
931
|
|
932 sub _get_alignment {
|
|
933 my ($self,$seq1,$seq2)=@_;
|
|
934 my $fastafile1="/tmp/tmpfastafile1";
|
|
935 my $fastafile2="/tmp/tmpfastafile2";
|
|
936 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
|
|
937 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN
|
|
938 open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment";
|
|
939 open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment";
|
|
940 print TMPFASTAFILE1 ">firstseq\n$seq1\n";
|
|
941 print TMPFASTAFILE2 ">secondseq\n$seq2\n";
|
|
942 close TMPFASTAFILE1;
|
|
943 close TMPFASTAFILE2;
|
|
944 my $alignment=`$alignprogram`;
|
|
945 my @alignlines=split(/\n/,$alignment);
|
|
946 my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
|
|
947 for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
|
|
948 $seq1_aligned .= $alignlines[$linecount];
|
|
949 $codes .= $alignlines[$linecount+1];
|
|
950 $seq2_aligned .= $alignlines[$linecount+2];
|
|
951 }
|
|
952 return ($seq1_aligned,$seq2_aligned,$codes);
|
|
953 }
|
|
954
|
|
955 # common part of the function to create a novel liveseq gene structure
|
|
956 # from an amino acid sequence, using codon usage frequencies
|
|
957 # args: codon_usage_array transltableid aasequence gene_name
|
|
958 sub _common_novelaasequence2gene {
|
|
959 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
|
|
960 my @species_codon_usage=@{$species_codon_usage};
|
|
961 my @codon_usage_label=
|
|
962 qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
|
|
963 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
|
|
964 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
|
|
965 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
|
|
966 tga);
|
|
967 my ($i,$j);
|
|
968 my %codon_usage_value;
|
|
969 my $aa_codon_total;
|
|
970 for ($i=0;$i<64;$i++) {
|
|
971 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
|
|
972 }
|
|
973
|
|
974 my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid );
|
|
975 my @aminoacids = split(//,uc($aasequence));
|
|
976 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
|
|
977 for $i (@aminoacids) {
|
|
978 @alt_codons = $CodonTable->revtranslate($i);
|
|
979 unless (@alt_codons) {
|
|
980 carp "No reverse translation possible for aminoacid \'$i\'";
|
|
981 $dnasequence .= "???";
|
|
982 } else {
|
|
983 $aa_codon_total=0;
|
|
984 for $j (@alt_codons) {
|
|
985 $aa_codon_total+=$codon_usage_value{$j};
|
|
986 }
|
|
987 # print "aminoacid $i, codonchoice: "; # verbose
|
|
988 #$partial=0;
|
|
989 #for $j (@alt_codons) {
|
|
990 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
|
|
991 #$partial+=($codon_usage_value{$j}/$aa_codon_total);
|
|
992 #}
|
|
993 #print "\n";
|
|
994 $dice=rand(1);
|
|
995 #print "roulette: $dice\n"; # verbose
|
|
996 $partial=0;
|
|
997 $chosen_codon="";
|
|
998 CODONCHOICE:
|
|
999 for $j (0..@alt_codons) { # last one not accounted
|
|
1000 $thiscodon=$alt_codons[$j];
|
|
1001 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
|
|
1002 if ($dice < $relativeusage+$partial) {
|
|
1003 $chosen_codon=$thiscodon;
|
|
1004 last CODONCHOICE;
|
|
1005 } else {
|
|
1006 $partial += $relativeusage;
|
|
1007 }
|
|
1008 }
|
|
1009 unless ($chosen_codon) {
|
|
1010 $chosen_codon = $alt_codons[-1]; # the last one
|
|
1011 }
|
|
1012 # print ".....adding $chosen_codon\n"; # verbose
|
|
1013 $dnasequence .= $chosen_codon;
|
|
1014 }
|
|
1015 }
|
|
1016
|
|
1017 my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
|
|
1018 my $min=1;
|
|
1019 my $max=length($dnasequence);
|
|
1020 my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
|
|
1021 my @exons=($exon);
|
|
1022 my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
|
|
1023 $transcript->translation_table($ttabid);
|
|
1024 my @transcripts=($transcript);
|
|
1025 my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
|
|
1026 my @translations=($translation);
|
|
1027 my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
|
|
1028 my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
|
|
1029
|
|
1030 # creation of gene
|
|
1031 unless ($gene) { # if $gene == 0 it means problems in hash2gene
|
|
1032 carp "Error in Gene creation phase";
|
|
1033 return (0);
|
|
1034 }
|
|
1035 return $gene;
|
|
1036 }
|
|
1037
|
|
1038 1;
|