comparison variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $
2 #
3 # bioperl module for Bio::LiveSeq::IO::Loader
4 #
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
6 #
7 # Copyright Joseph Insana
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
16
17 =head1 SYNOPSIS
18
19 #documentation needed
20
21 =head1 DESCRIPTION
22
23 This package holds common methods used by BioPerl, SRS and file loaders.
24 It contains methods to create LiveSeq objects out of entire entries or from a
25 localized sequence region surrounding a particular gene.
26
27 =head1 AUTHOR - Joseph A.L. Insana
28
29 Email: Insana@ebi.ac.uk, jinsana@gmx.net
30
31 Address:
32
33 EMBL Outstation, European Bioinformatics Institute
34 Wellcome Trust Genome Campus, Hinxton
35 Cambs. CB10 1SD, United Kingdom
36
37 =head1 APPENDIX
38
39 The rest of the documentation details each of the object
40 methods. Internal methods are usually preceded with a _
41
42 =cut
43
44 # Let the code begin...
45
46 package Bio::LiveSeq::IO::Loader;
47 $VERSION=4.44;
48
49 # Version history:
50 # Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS
51 # Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader
52 # Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl
53 # Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package
54 # Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping
55 # Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl
56 # Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is
57 # included as valid qualifier name -> investigate
58 # Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl()
59 # Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader
60 # Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more
61 # the only default feature possibly asked for
62 # Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene
63 # Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten
64 # cdslocation2transcript
65 # Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene
66 # Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :)
67 # Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene
68 # Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance
69 # Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range
70 # Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry!
71 # Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created
72 # Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is
73 # just 1 CDS in entry it will use all
74 # features of the entry as Gene features
75 # Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added
76 # Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created
77 # Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced
78 # Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option
79 # Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object
80 # Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene
81 # Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq
82 # Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity
83 # Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged
84 # Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP
85 # Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW
86 # Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones
87 # Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET
88 # Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on
89 # Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions
90 # Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation()
91 # Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene
92
93 # Note: test_transl has been left as deprecated and is not really supported now
94
95 use strict;
96 use Carp qw(cluck croak carp);
97 use vars qw($VERSION @ISA);
98 use Bio::LiveSeq::DNA 1.2;
99 use Bio::LiveSeq::Exon 1.0;
100 use Bio::LiveSeq::Transcript 2.4;
101 use Bio::LiveSeq::Translation 1.4;
102 use Bio::LiveSeq::Gene 1.1;
103 use Bio::LiveSeq::Intron 1.0;
104 use Bio::LiveSeq::Prim_Transcript 1.0;
105 use Bio::LiveSeq::Repeat_Region 1.0;
106 use Bio::LiveSeq::Repeat_Unit 1.0;
107 use Bio::LiveSeq::AARange 1.4;
108 use Bio::Tools::CodonTable;
109
110 #@ISA=qw(Bio::LiveSeq::); # not useful now
111
112 =head2 entry2liveseq
113
114 Title : entry2liveseq
115 Usage : @translationobjects=$loader->entry2liveseq();
116 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
117 Function: creates LiveSeq objects from an entry previously loaded
118 Returns : array of references to objects of class Translation
119 Errorcode 0
120 Args : optional boolean flag to avoid the retrieval of SwissProt
121 informations for all Transcripts containing SwissProt x-reference
122 default is 1 (to retrieve those informations and create AARange
123 LiveSeq objects)
124 Note : this method can get really slow for big entries. The lightweight
125 gene2liveseq method is recommended
126
127 =cut
128
129 sub entry2liveseq {
130 my ($self, %args) = @_;
131 my ($getswissprotinfo)=($args{-getswissprotinfo});
132 if (defined($getswissprotinfo)) {
133 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
134 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
135 $getswissprotinfo=0;
136 }
137 } else {
138 $getswissprotinfo=1;
139 }
140 my $hashref=$self->{'hash'};
141 unless ($hashref) { return (0); }
142 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
143 my $test_transl=0;
144 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
145 return @translationobjects;
146 }
147
148 =head2 novelaasequence2gene
149
150 Title : novelaasequence2gene
151 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
152 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
153 -taxon => 9606,
154 -gene_name => "tyr-kinase");
155
156 Function: creates LiveSeq objects from a novel amino acid sequence,
157 using codon usage database to choose codons according to
158 relative frequencies.
159 If a taxon ID is not specified, the default is to use the human
160 one (taxonomy ID 9606).
161 Returns : reference to a Gene object containing references to LiveSeq objects
162 Errorcode 0
163 Args : string containing an amino acid sequence
164 integer (optional) with a taxonomy ID
165 string specifying a gene name
166
167 =cut
168
169 =head2 gene2liveseq
170
171 Title : gene2liveseq
172 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
173 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
174 -flanking => 64);
175 : $gene=$loader->gene2liveseq(-gene_name => "gene name",
176 -getswissprotinfo => 0);
177 : $gene=$loader->gene2liveseq(-position => 4);
178
179 Function: creates LiveSeq objects from an entry previously loaded
180 It is a "light weight" creation because it creates
181 a LiveSequence just for the interesting region in an entry
182 (instead than for the total entry, like in entry2liveseq) and for
183 the flanking regions up to 500 nucleotides (default) or up to
184 the specified amount of nucleotides (given as argument) around the
185 Gene.
186 Returns : reference to a Gene object containing possibly alternative
187 Transcripts.
188 Errorcode 0
189 Args : string containing the gene name as in the EMBL feature qualifier
190 integer (optional) "flanking": amount of flanking bases to be kept
191 boolean (optional) "getswissprotinfo": if set to "0" it will avoid
192 trying to fetch information from a crossreference to a SwissProt
193 entry, avoding the process of creation of AARange objects
194 It is "1" (on) by default
195
196 Alternative to a gene_name, a position can be given: an
197 integer (1-) containing the position of the desired CDS in the
198 loaded entry
199
200 =cut
201
202 sub gene2liveseq {
203 my ($self, %args) = @_;
204 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
205 my $input;
206 unless (($gene_name)||($cds_position)) {
207 carp "Gene_Name or Position not specified for gene2liveseq loading function";
208 return (0);
209 }
210 if (($gene_name)&&($cds_position)) {
211 carp "Gene_Name and Position cannot be given together, use one";
212 return (0);
213 } elsif ($gene_name) {
214 $input=$gene_name;
215 } else {
216 $input="cds-position:".$cds_position;
217 }
218
219 if (defined($getswissprotinfo)) {
220 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
221 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
222 $getswissprotinfo=0;
223 }
224 } else {
225 $getswissprotinfo=1;
226 }
227
228 if (defined($flanking)) {
229 unless ($flanking >= 0) {
230 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
231 return (0);
232 }
233 } else {
234 $flanking=500; # the default flanking length
235 }
236 my $hashref=$self->{'hash'};
237 unless ($hashref) { return (0); }
238 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
239 unless ($gene) { # if $gene == 0 it means problems in hash2gene
240 carp "gene2liveseq produced error";
241 return (0);
242 }
243 return $gene;
244 }
245
246 # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
247 # this method is for now deprecated and not supported
248 sub test_transl {
249 my ($self,$entry)=@_;
250 my @features=@{$entry->{'Features'}};
251 my @translationobjects=@{$_[1]};
252 my ($i,$translation);
253 my ($obj_transl,$hash_transl);
254 my @cds=@{$entry->{'CDS'}};
255 foreach $translation (@translationobjects) {
256 $obj_transl=$translation->seq;
257 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
258 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
259 unless ($obj_transl eq $hash_transl) {
260 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
261 carp "\nEntry's transl: ",$hash_transl,"\n";
262 carp "\nObject's transl: ",$obj_transl,"\n";
263 exit;
264 }
265 $i++;
266 }
267 }
268
269 # argument: hashref containing the EMBL entry datas,
270 # getswissprotinfo boolean flag
271 # creates the liveseq objects
272 # returns: an array of Translation object references
273 sub hash2liveseq {
274 my ($self,$entry,$getswissprotinfo)=@_;
275 my $i;
276 my @transcripts;
277 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
278 $dna->alphabet(lc($entry->{'Molecule'}));
279 $dna->display_id($entry->{'ID'});
280 $dna->accession_number($entry->{'AccNumber'});
281 $dna->desc($entry->{'Description'});
282 my @cds=@{$entry->{'CDS'}};
283 my ($swissacc,$swisshash); my @swisshashes;
284 for $i (0..$#cds) {
285 #my @transcript=@{$cds[$i]->{'range'}};
286 #$transcript=\@transcript;
287 #push (@transcripts,$transcript);
288 push (@transcripts,$cds[$i]->{'range'});
289 if ($getswissprotinfo) {
290 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
291 $swisshash=$self->get_swisshash($swissacc);
292 #$self->printswissprot($swisshash); # DEBUG
293 push (@swisshashes,$swisshash);
294 }
295 }
296 my @translations=($self->transexonscreation($dna,\@transcripts));
297 my $translation; my $j=0;
298 foreach $translation (@translations) {
299 if ($swisshashes[$j]) { # if not 0
300 $self->swisshash2liveseq($swisshashes[$j],$translation);
301 }
302 $j++;
303 }
304 return (@translations);
305 }
306
307 # only features pertaining to a specified gene are created
308 # only the sequence of the gene and appropriate context flanking regions
309 # are created as chain
310 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
311 # returns: reference to Gene object
312 #
313 # Note: if entry contains just one CDS, all the features get added
314 # this is useful because often the features in these entries do not
315 # carry the /gene qualifier
316 #
317 # errorcode: 0
318 sub hash2gene {
319 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
320 my $entryfeature;
321 my $genefeatureshash;
322
323 my @cds=@{$entry->{'CDS'}};
324
325 # checking if a position has been given instead than a gene_name
326 if (index($input,"cds-position:") == 0 ) {
327 my $cds_position=substr($input,13); # extracting the cds position
328 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
329 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
330 }
331 } else {
332 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
333 }
334
335 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
336 my @genes=$self->genes($entry);
337 my $cds_number=scalar(@cds);
338 warn "Warning! Not even one genefeature found for /$input/....
339 The genes present in this entry are:\n\t@genes\n
340 The number of CDS in this entry is:\n\t$cds_number\n";
341 return(0);
342 }
343
344 # get max and min, check flankings
345 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
346 my $seqlength=$entry->{'SeqLength'};
347 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
348 if ($min-$flanking < 1) {
349 $mindna=1;
350 } else {
351 $mindna=$min-$flanking;
352 }
353 if ($max+$flanking > $seqlength) {
354 $maxdna=$seqlength;
355 } else {
356 $maxdna=$max+$flanking;
357 }
358 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
359
360 # create LiveSeq objects
361
362 # create DNA
363 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
364 $dna->alphabet(lc($entry->{'Molecule'}));
365 $dna->source($entry->{'Organism'});
366 $dna->display_id($entry->{'ID'});
367 $dna->accession_number($entry->{'AccNumber'});
368 $dna->desc($entry->{'Description'});
369
370 my @transcripts=@{$genefeatureshash->{'transcripts'}};
371 # create Translations, Transcripts, Exons out of the CDS
372 unless (@transcripts) {
373 cluck "no CDS feature found for /$input/....";
374 return(0);
375 }
376 my @translationobjs=$self->transexonscreation($dna,\@transcripts);
377 my @transcriptobjs;
378
379 # get the Transcript obj_refs
380 my $translation;
381 my $j=0;
382 my @ttables=@{$genefeatureshash->{'ttables'}};
383 my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
384 foreach $translation (@translationobjs) {
385 push(@transcriptobjs,$translation->get_Transcript);
386 if ($ttables[$j]) { # if not undef
387 $translation->get_Transcript->translation_table($ttables[$j]);
388 #} else { # DEBUG
389 # print "\n\t\tno translation table information....\n";
390 }
391 if ($swisshashes[$j]) { # if not 0
392 $self->swisshash2liveseq($swisshashes[$j],$translation);
393 }
394 $j++;
395 }
396
397 my %gene; # this is the hash to store created object references
398 $gene{DNA}=$dna;
399 $gene{Transcripts}=\@transcriptobjs;
400 $gene{Translations}=\@translationobjs;
401
402 my @exonobjs; my @intronobjs;
403 my @repeatunitobjs; my @repeatregionobjs;
404 my @primtranscriptobjs;
405
406 my ($object,$range,$start,$end,$strand);
407
408 my @exons=@{$genefeatureshash->{'exons'}};
409 my @exondescs=@{$genefeatureshash->{'exondescs'}};
410 if (@exons) {
411 my $exoncount = 0;
412 foreach $range (@exons) {
413 ($start,$end,$strand)=@{$range};
414 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
415 if ($object != -1) {
416 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
417 $exoncount++;
418 push (@exonobjs,$object);
419 } else {
420 $exoncount++;
421 }
422 }
423 $gene{Exons}=\@exonobjs;
424 }
425 my @introns=@{$genefeatureshash->{'introns'}};
426 my @introndescs=@{$genefeatureshash->{'introndescs'}};
427 if (@introns) {
428 my $introncount = 0;
429 foreach $range (@introns) {
430 ($start,$end,$strand)=@{$range};
431 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
432 if ($object != -1) {
433 $object->desc($introndescs[$introncount]);
434 $introncount++;
435 push (@intronobjs,$object);
436 } else {
437 $introncount++;
438 }
439 }
440 $gene{Introns}=\@intronobjs;
441 }
442 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
443 if (@prim_transcripts) {
444 foreach $range (@prim_transcripts) {
445 ($start,$end,$strand)=@{$range};
446 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
447 if ($object != -1) { push (@primtranscriptobjs,$object); }
448 }
449 $gene{Prim_Transcripts}=\@primtranscriptobjs;
450 }
451 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
452 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
453 if (@repeat_regions) {
454 my $k=0;
455 foreach $range (@repeat_regions) {
456 ($start,$end,$strand)=@{$range};
457 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
458 if ($object != -1) {
459 $object->desc($repeat_regions_family[$k]);
460 $k++;
461 push (@repeatregionobjs,$object);
462 } else {
463 $k++;
464 }
465 }
466 $gene{Repeat_Regions}=\@repeatregionobjs;
467 }
468 my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
469 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
470 if (@repeat_units) {
471 my $k=0;
472 foreach $range (@repeat_units) {
473 ($start,$end,$strand)=@{$range};
474 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
475 if ($object != -1) {
476 $object->desc($repeat_units_family[$k]);
477 $k++;
478 push (@repeatunitobjs,$object);
479 } else {
480 $k++;
481 }
482 }
483 $gene{Repeat_Units}=\@repeatunitobjs;
484 }
485
486 # create the Gene
487 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
488 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
489 -upbound=>$min,-downbound=>$max));
490 }
491
492 # maybe this function will be moved to general utility package
493 # argument: array of numbers
494 # returns: (min,max) numbers in the array
495 sub rangeofarray {
496 my $self=shift;
497 my @array=@_;
498 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
499 my ($max,$min,$element);
500 $min=$max=shift(@array);
501 foreach $element (@array) {
502 $element = 0 unless defined $element;
503 if ($element < $min) {
504 $min=$element;
505 }
506 if ($element > $max) {
507 $max=$element;
508 }
509 }
510 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
511 return ($min,$max);
512 }
513
514
515 # argument: reference to DNA object, reference to array of transcripts
516 # returns: an array of Translation object references
517 sub transexonscreation {
518 my $self=shift;
519 my $dna=$_[0];
520 my @transcripts=@{$_[1]};
521
522 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
523 my $translationobj;
524 my @translationobjects;
525 foreach $transcript (@transcripts) {
526 foreach $exonref (@{$transcript}) {
527 ($start,$end,$strand)=@{$exonref};
528 #print "Creating Exon: start $start end $end strand $strand\n";
529 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
530 #push (@exonobjects,$exonobj);
531 push (@transexons,$exonobj);
532 }
533 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
534 if ($transcriptobj != -1) {
535 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
536 @transexons=(); # cleans it
537 #push (@transcriptobjects,$transcriptobj);
538 push (@translationobjects,$translationobj);
539 }
540 }
541 return (@translationobjects);
542 }
543
544 #sub printgene {
545 # deleted. Some functionality placed in Gene->printfeaturesnum
546
547 =head2 printswissprot
548
549 Title : printswissprot
550 Usage : $loader->printswissprot($hashref);
551 Function: prints out all informations loaded from a database entry into the
552 loader. Mainly used for testing purposes.
553 Args : a hashref containing the SWISSPROT entry datas
554 Note : the hashref can be obtained with a call to the method
555 $loader->get_swisshash() (only with SRS loader or
556 BioPerl via Bio::DB::EMBL.pm)
557 that takes as argument a string like "SWISS-PROT:P10275" or
558 from $loader->swissprot2hash() that takes an SRS query string
559 as its argument (e.g. "swissprot-acc:P10275")
560
561 =cut
562
563 # argument: hashref containing the SWISSPROT entry datas
564 # prints out that hash, showing the informations loaded
565 sub printswissprot {
566 my ($self,$entry)=@_;
567 unless ($entry) {
568 return;
569 }
570 printf "ID: %s\n",
571 $entry->{'ID'};
572 printf "ACC: %s\n",
573 $entry->{'AccNumber'};
574 printf "GENE: %s\n",
575 $entry->{'Gene'};
576 printf "DES: %s\n",
577 $entry->{'Description'};
578 printf "ORG: %s\n",
579 $entry->{'Organism'};
580 printf "SEQLN: %s\n",
581 $entry->{'SeqLength'};
582 printf "SEQ: %s\n",
583 substr($entry->{'Sequence'},0,64);
584 if ($entry->{'Features'}) {
585 my @features=@{$entry->{'Features'}};
586 my $i;
587 for $i (0..$#features) {
588 print "|",$features[$i]->{'name'},"|";
589 print " at ",$features[$i]->{'location'},": ";
590 print "",$features[$i]->{'desc'},"\n";
591 }
592 }
593 }
594
595 =head2 printembl
596
597 Title : printembl
598 Usage : $loader->printembl();
599 Function: prints out all informations loaded from a database entry into the
600 loader. Mainly used for testing purposes.
601 Args : none
602
603 =cut
604
605 # argument: hashref containing the EMBL entry datas
606 # prints out that hash, showing the informations loaded
607 sub printembl {
608 my ($self,$entry)=@_;
609 unless ($entry) {
610 $entry=$self->{'hash'};
611 }
612 my ($i,$featurename);
613 printf "ID: %s\n",
614 $entry->{'ID'};
615 printf "ACC: %s\n",
616 $entry->{'AccNumber'};
617 printf "MOL: %s\n",
618 $entry->{'Molecule'};
619 printf "DIV: %s\n",
620 $entry->{'Division'};
621 printf "DES: %s\n",
622 $entry->{'Description'};
623 printf "ORG: %s\n",
624 $entry->{'Organism'};
625 printf "SEQLN: %s\n",
626 $entry->{'SeqLength'};
627 printf "SEQ: %s\n",
628 substr($entry->{'Sequence'},0,64);
629 my @features=@{$entry->{'Features'}};
630 my @cds=@{$entry->{'CDS'}};
631 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
632 my ($exonref,$transcript);
633 my ($qualifiernumber,$qualifiers,$key);
634 my ($start,$end,$strand);
635 my $j=0;
636 for $i (0..$#features) {
637 $featurename=$features[$i]->{'name'};
638 if ($featurename eq "CDS") {
639 print "|CDS| number $j at feature position: $i\n";
640 #print $features[$i]->{'location'},"\n";
641 $transcript=$features[$i]->{'range'};
642 foreach $exonref (@{$transcript}) {
643 ($start,$end,$strand)=@{$exonref};
644 print "\tExon: start $start end $end strand $strand\n";
645 }
646 $j++;
647 } else {
648 print "|$featurename| at feature position: $i\n";
649 print "\trange: ";
650 print join("\t",@{$features[$i]->{'range'}}),"\n";
651 #print $features[$i]->{'location'},"\n";
652 }
653 $qualifiernumber=$features[$i]->{'qual_number'};
654 $qualifiers=$features[$i]->{'qualifiers'}; # hash
655 foreach $key (keys (%{$qualifiers})) {
656 print "\t\t",$key,": ";
657 print $qualifiers->{$key},"\n";
658 }
659 }
660 }
661
662 =head2 genes
663
664 Title : genes
665 Usage : $loader->genes();
666 Function: Returns an array of gene_names (strings) contained in the loaded
667 entry.
668 Args : none
669
670 =cut
671
672 # argument: entryhashref
673 # returns: array of genenames found in the entry
674 sub genes {
675 my ($self,$entry)=@_;
676 unless ($entry) {
677 $entry=$self->{'hash'};
678 }
679 my @entryfeatures=@{$entry->{'Features'}};
680 my ($genename,$genenames,$entryfeature);
681 for $entryfeature (@entryfeatures) {
682 $genename=$entryfeature->{'qualifiers'}->{'gene'};
683 if ($genename) {
684 if (index($genenames,$genename) == -1) { # if name is new
685 $genenames .= $genename . " "; # add the name
686 }
687 }
688 }
689 return (split(/ /,$genenames)); # assumes no space inbetween each genename
690 }
691
692 # arguments: swisshash, translation objref
693 # adds information to the Translation, creates AARange objects, sets the
694 # aa_range attribute on the Translation, pointing to those objects
695 sub swisshash2liveseq {
696 my ($self,$entry,$translation)=@_;
697 my $translength=$translation->length;
698 $translation->desc($translation->desc . $entry->{'Description'});
699 $translation->display_id("SWISSPROT:" . $entry->{'ID'});
700 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
701 $translation->name($entry->{'Gene'});
702 $translation->source($entry->{'Organism'});
703 my @aarangeobjects;
704 my ($start,$end,$aarangeobj,$feature);
705 my @features; my @newfeatures;
706 if ($entry->{'Features'}) {
707 @features=@{$entry->{'Features'}};
708 }
709 my $cleavedmet=0;
710 # check for cleaved Met
711 foreach $feature (@features) {
712 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
713 $cleavedmet=1;
714 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
715 } else {
716 push(@newfeatures,$feature);
717 }
718 }
719
720 my $swissseq=$entry->{'Sequence'};
721 my $liveseqtransl=$translation->seq;
722 chop $liveseqtransl; # to take away the trailing STOP "*"
723 my $translated=substr($liveseqtransl,$cleavedmet);
724
725 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
726
727 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
728 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
729 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
730 return;
731 }
732
733 #my $i=0; # debug
734 @features=@newfeatures;
735 foreach $feature (@features) {
736 #print "Processing SwissProtFeature: $i\n"; # debug
737 ($start,$end)=split(/ /,$feature->{'location'});
738 # Note: cleavedmet is taken in account for updating numbering
739 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength);
740 if ($aarangeobj != -1) {
741 push(@aarangeobjects,$aarangeobj);
742 }
743 # $i++; # debug
744 }
745 $translation->{'aa_ranges'}=\@aarangeobjects;
746 }
747
748 # if there is no SRS support, the default will be to return 0
749 # i.e. this function is overridden in SRS package
750 sub get_swisshash {
751 return (0);
752 }
753
754 # Args: $entry hashref, gene_name OR cds_position (undef is used to
755 # choose between the two), getswissprotinfo boolean flag
756 # Returns: an hash holding various arrayref used in the hash2gene method
757 # Function: examines the nucleotide entry, identifying features belonging
758 # to the gene (defined either by its name or by the position of its CDS in
759 # the entry)
760
761 sub _findgenefeatures {
762 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
763
764 my @entryfeatures=@{$entry->{'Features'}};
765 my @exons; my @introns; my @prim_transcripts; my @transcripts;
766 my @repeat_units; my @repeat_regions;
767 my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
768 my $entryfeature; my @genefeatures;
769 my $desc; my @exondescs; my @introndescs;
770
771 # for swissprot xreference
772 my ($swissacc,$swisshash); my @swisshashes;
773
774 # for translation_tables
775 my @ttables;
776
777 # to create labels
778 my ($name,$exon);
779 my @range; my @cdsexons; my @labels;
780
781 # maybe here also could be added special case when there is no CDS feature
782 # in the entry (e.g. tRNA entry -> TOCHECK).
783 # let's deal with the special case in which there is just one gene per entry
784 # usually without /gene qualifier
785 my @cds=@{$entry->{'CDS'}};
786
787 my $skipgenematch=0;
788 if (scalar(@cds) == 1) {
789 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
790 $skipgenematch=1;
791 }
792
793 my ($cds_begin,$cds_end,$proximity);
794 if ($cds_position) { # if a position has been requested
795 my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
796 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
797 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
798 # DEBUG
799 unless ($skipgenematch) {
800 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
801 }
802 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
803 }
804
805 for $entryfeature (@entryfeatures) { # get only features for the desired gene
806 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
807 push(@genefeatures,$entryfeature);
808
809 my @range=@{$entryfeature->{'range'}};
810 $name=$entryfeature->{'name'};
811 my %qualifierhash=%{$entryfeature->{'qualifiers'}};
812 if ($name eq "CDS") { # that has range containing array of exons
813
814 # swissprot crossindexing (if without SRS support it will fill array
815 # with zeros and do nothing
816 if ($getswissprotinfo) {
817 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
818 $swisshash=$self->get_swisshash($swissacc);
819 #$self->printswissprot($swisshash); # DEBUG
820 push (@swisshashes,$swisshash);
821 }
822
823 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
824
825 # create labels array
826 for $exon (@range) {
827 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
828 }
829 push (@transcripts,$entryfeature->{'range'});
830 } else {
831 # "simplifying" the joinedlocation features. I.e. changing them from
832 # multijoined ones to simple plain start-end features, taking only
833 # the start of the first "exon" and the end of the last "exon" as
834 # start and end of the entire feature
835 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
836 @range=($range[0]->[0],$range[-1]->[1]);
837 }
838 push(@labels,$range[0],$range[1]); # start and end of every feature
839 if ($name eq "exon") {
840 $desc=$entryfeature->{'qualifiers'}->{'number'};
841 if ($entryfeature->{'qualifiers'}->{'note'}) {
842 if ($desc) {
843 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
844 } else {
845 $desc = $entryfeature->{'qualifiers'}->{'note'};
846 }
847 }
848 push (@exondescs,$desc || "unknown");
849 push(@exons,\@range);
850 }
851 if ($name eq "intron") {
852 $desc=$entryfeature->{'qualifiers'}->{'number'};
853 if ($desc) {
854 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
855 } else {
856 $desc = $entryfeature->{'qualifiers'}->{'note'};
857 }
858 push (@introndescs,$desc || "unknown");
859 push(@introns,\@range);
860 }
861 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); }
862 if ($name eq "repeat_unit") { push(@repeat_units,\@range);
863 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
864 push (@repeat_units_family,$rpt_family || "unknown");
865 }
866 if ($name eq "repeat_region") { push(@repeat_regions,\@range);
867 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
868 push (@repeat_regions_family,$rpt_family || "unknown");
869 }
870 }
871 }
872 }
873 unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
874 my %genefeatureshash;
875 $genefeatureshash{gene_name}=$gene_name;
876 $genefeatureshash{genefeatures}=\@genefeatures;
877 $genefeatureshash{labels}=\@labels;
878 $genefeatureshash{ttables}=\@ttables;
879 $genefeatureshash{swisshashes}=\@swisshashes;
880 $genefeatureshash{transcripts}=\@transcripts;
881 $genefeatureshash{exons}=\@exons;
882 $genefeatureshash{exondescs}=\@exondescs;
883 $genefeatureshash{introns}=\@introns;
884 $genefeatureshash{introndescs}=\@introndescs;
885 $genefeatureshash{prim_transcripts}=\@prim_transcripts;
886 $genefeatureshash{repeat_units}=\@repeat_units;
887 $genefeatureshash{repeat_regions}=\@repeat_regions;
888 $genefeatureshash{repeat_units_family}=\@repeat_units_family;
889 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
890 return (\%genefeatureshash);
891 }
892
893
894 # used by _findgenefeatures, when a CDS at a certain position is requested,
895 # to retrieve only features quite close to the wanted CDS.
896 # Args: range hashref, begin and end positions of the CDS, $proximity
897 # $proximity holds the maximum distance between the extremes of the CDS
898 # and of the feature under exam.
899 # Returns: boolean
900 sub _checkfeatureproximity {
901 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
902 my @range=@{$range};
903 my ($begin,$end,$strand);
904 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
905 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
906 } else {
907 ($begin,$end,$strand)=@range;
908 }
909 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
910 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
911 }
912 if ($strand == -1) { # reverse strand
913 ($begin,$end)=($end,$begin); # swap boundaries
914 }
915 if (($cds_begin-$end)>$proximity) {
916 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
917 return (0);
918 }
919 if (($begin-$cds_end)>$proximity) {
920 carp "--DEBUG-- feature rejected: begin $begin end $end -------";
921 return (0);
922 }
923 carp "--DEBUG-- feature accepted: begin $begin end $end -------";
924 return (1); # otherwise ok, feature considered next to CDS
925 }
926
927
928 # function that calls the external program "align" (on the fasta2 package)
929 # to create an alignment between two sequences, returning the aligned
930 # strings and the codes for the identity (:: ::::)
931
932 sub _get_alignment {
933 my ($self,$seq1,$seq2)=@_;
934 my $fastafile1="/tmp/tmpfastafile1";
935 my $fastafile2="/tmp/tmpfastafile2";
936 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
937 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN
938 open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment";
939 open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment";
940 print TMPFASTAFILE1 ">firstseq\n$seq1\n";
941 print TMPFASTAFILE2 ">secondseq\n$seq2\n";
942 close TMPFASTAFILE1;
943 close TMPFASTAFILE2;
944 my $alignment=`$alignprogram`;
945 my @alignlines=split(/\n/,$alignment);
946 my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
947 for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
948 $seq1_aligned .= $alignlines[$linecount];
949 $codes .= $alignlines[$linecount+1];
950 $seq2_aligned .= $alignlines[$linecount+2];
951 }
952 return ($seq1_aligned,$seq2_aligned,$codes);
953 }
954
955 # common part of the function to create a novel liveseq gene structure
956 # from an amino acid sequence, using codon usage frequencies
957 # args: codon_usage_array transltableid aasequence gene_name
958 sub _common_novelaasequence2gene {
959 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
960 my @species_codon_usage=@{$species_codon_usage};
961 my @codon_usage_label=
962 qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
963 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
964 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
965 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
966 tga);
967 my ($i,$j);
968 my %codon_usage_value;
969 my $aa_codon_total;
970 for ($i=0;$i<64;$i++) {
971 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
972 }
973
974 my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid );
975 my @aminoacids = split(//,uc($aasequence));
976 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
977 for $i (@aminoacids) {
978 @alt_codons = $CodonTable->revtranslate($i);
979 unless (@alt_codons) {
980 carp "No reverse translation possible for aminoacid \'$i\'";
981 $dnasequence .= "???";
982 } else {
983 $aa_codon_total=0;
984 for $j (@alt_codons) {
985 $aa_codon_total+=$codon_usage_value{$j};
986 }
987 # print "aminoacid $i, codonchoice: "; # verbose
988 #$partial=0;
989 #for $j (@alt_codons) {
990 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
991 #$partial+=($codon_usage_value{$j}/$aa_codon_total);
992 #}
993 #print "\n";
994 $dice=rand(1);
995 #print "roulette: $dice\n"; # verbose
996 $partial=0;
997 $chosen_codon="";
998 CODONCHOICE:
999 for $j (0..@alt_codons) { # last one not accounted
1000 $thiscodon=$alt_codons[$j];
1001 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
1002 if ($dice < $relativeusage+$partial) {
1003 $chosen_codon=$thiscodon;
1004 last CODONCHOICE;
1005 } else {
1006 $partial += $relativeusage;
1007 }
1008 }
1009 unless ($chosen_codon) {
1010 $chosen_codon = $alt_codons[-1]; # the last one
1011 }
1012 # print ".....adding $chosen_codon\n"; # verbose
1013 $dnasequence .= $chosen_codon;
1014 }
1015 }
1016
1017 my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
1018 my $min=1;
1019 my $max=length($dnasequence);
1020 my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
1021 my @exons=($exon);
1022 my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
1023 $transcript->translation_table($ttabid);
1024 my @transcripts=($transcript);
1025 my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
1026 my @translations=($translation);
1027 my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
1028 my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
1029
1030 # creation of gene
1031 unless ($gene) { # if $gene == 0 it means problems in hash2gene
1032 carp "Error in Gene creation phase";
1033 return (0);
1034 }
1035 return $gene;
1036 }
1037
1038 1;