Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $ | |
| 2 # | |
| 3 # bioperl module for Bio::LiveSeq::IO::Loader | |
| 4 # | |
| 5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | |
| 6 # | |
| 7 # Copyright Joseph Insana | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 #documentation needed | |
| 20 | |
| 21 =head1 DESCRIPTION | |
| 22 | |
| 23 This package holds common methods used by BioPerl, SRS and file loaders. | |
| 24 It contains methods to create LiveSeq objects out of entire entries or from a | |
| 25 localized sequence region surrounding a particular gene. | |
| 26 | |
| 27 =head1 AUTHOR - Joseph A.L. Insana | |
| 28 | |
| 29 Email: Insana@ebi.ac.uk, jinsana@gmx.net | |
| 30 | |
| 31 Address: | |
| 32 | |
| 33 EMBL Outstation, European Bioinformatics Institute | |
| 34 Wellcome Trust Genome Campus, Hinxton | |
| 35 Cambs. CB10 1SD, United Kingdom | |
| 36 | |
| 37 =head1 APPENDIX | |
| 38 | |
| 39 The rest of the documentation details each of the object | |
| 40 methods. Internal methods are usually preceded with a _ | |
| 41 | |
| 42 =cut | |
| 43 | |
| 44 # Let the code begin... | |
| 45 | |
| 46 package Bio::LiveSeq::IO::Loader; | |
| 47 $VERSION=4.44; | |
| 48 | |
| 49 # Version history: | |
| 50 # Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS | |
| 51 # Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader | |
| 52 # Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl | |
| 53 # Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package | |
| 54 # Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping | |
| 55 # Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl | |
| 56 # Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is | |
| 57 # included as valid qualifier name -> investigate | |
| 58 # Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl() | |
| 59 # Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader | |
| 60 # Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more | |
| 61 # the only default feature possibly asked for | |
| 62 # Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene | |
| 63 # Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten | |
| 64 # cdslocation2transcript | |
| 65 # Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene | |
| 66 # Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :) | |
| 67 # Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene | |
| 68 # Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance | |
| 69 # Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range | |
| 70 # Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry! | |
| 71 # Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created | |
| 72 # Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is | |
| 73 # just 1 CDS in entry it will use all | |
| 74 # features of the entry as Gene features | |
| 75 # Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added | |
| 76 # Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created | |
| 77 # Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced | |
| 78 # Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option | |
| 79 # Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object | |
| 80 # Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene | |
| 81 # Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq | |
| 82 # Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity | |
| 83 # Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged | |
| 84 # Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP | |
| 85 # Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW | |
| 86 # Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones | |
| 87 # Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET | |
| 88 # Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on | |
| 89 # Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions | |
| 90 # Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation() | |
| 91 # Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene | |
| 92 | |
| 93 # Note: test_transl has been left as deprecated and is not really supported now | |
| 94 | |
| 95 use strict; | |
| 96 use Carp qw(cluck croak carp); | |
| 97 use vars qw($VERSION @ISA); | |
| 98 use Bio::LiveSeq::DNA 1.2; | |
| 99 use Bio::LiveSeq::Exon 1.0; | |
| 100 use Bio::LiveSeq::Transcript 2.4; | |
| 101 use Bio::LiveSeq::Translation 1.4; | |
| 102 use Bio::LiveSeq::Gene 1.1; | |
| 103 use Bio::LiveSeq::Intron 1.0; | |
| 104 use Bio::LiveSeq::Prim_Transcript 1.0; | |
| 105 use Bio::LiveSeq::Repeat_Region 1.0; | |
| 106 use Bio::LiveSeq::Repeat_Unit 1.0; | |
| 107 use Bio::LiveSeq::AARange 1.4; | |
| 108 use Bio::Tools::CodonTable; | |
| 109 | |
| 110 #@ISA=qw(Bio::LiveSeq::); # not useful now | |
| 111 | |
| 112 =head2 entry2liveseq | |
| 113 | |
| 114 Title : entry2liveseq | |
| 115 Usage : @translationobjects=$loader->entry2liveseq(); | |
| 116 : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); | |
| 117 Function: creates LiveSeq objects from an entry previously loaded | |
| 118 Returns : array of references to objects of class Translation | |
| 119 Errorcode 0 | |
| 120 Args : optional boolean flag to avoid the retrieval of SwissProt | |
| 121 informations for all Transcripts containing SwissProt x-reference | |
| 122 default is 1 (to retrieve those informations and create AARange | |
| 123 LiveSeq objects) | |
| 124 Note : this method can get really slow for big entries. The lightweight | |
| 125 gene2liveseq method is recommended | |
| 126 | |
| 127 =cut | |
| 128 | |
| 129 sub entry2liveseq { | |
| 130 my ($self, %args) = @_; | |
| 131 my ($getswissprotinfo)=($args{-getswissprotinfo}); | |
| 132 if (defined($getswissprotinfo)) { | |
| 133 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { | |
| 134 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; | |
| 135 $getswissprotinfo=0; | |
| 136 } | |
| 137 } else { | |
| 138 $getswissprotinfo=1; | |
| 139 } | |
| 140 my $hashref=$self->{'hash'}; | |
| 141 unless ($hashref) { return (0); } | |
| 142 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); | |
| 143 my $test_transl=0; | |
| 144 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} | |
| 145 return @translationobjects; | |
| 146 } | |
| 147 | |
| 148 =head2 novelaasequence2gene | |
| 149 | |
| 150 Title : novelaasequence2gene | |
| 151 Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
| 152 : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
| 153 -taxon => 9606, | |
| 154 -gene_name => "tyr-kinase"); | |
| 155 | |
| 156 Function: creates LiveSeq objects from a novel amino acid sequence, | |
| 157 using codon usage database to choose codons according to | |
| 158 relative frequencies. | |
| 159 If a taxon ID is not specified, the default is to use the human | |
| 160 one (taxonomy ID 9606). | |
| 161 Returns : reference to a Gene object containing references to LiveSeq objects | |
| 162 Errorcode 0 | |
| 163 Args : string containing an amino acid sequence | |
| 164 integer (optional) with a taxonomy ID | |
| 165 string specifying a gene name | |
| 166 | |
| 167 =cut | |
| 168 | |
| 169 =head2 gene2liveseq | |
| 170 | |
| 171 Title : gene2liveseq | |
| 172 Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); | |
| 173 : $gene=$loader->gene2liveseq(-gene_name => "gene name", | |
| 174 -flanking => 64); | |
| 175 : $gene=$loader->gene2liveseq(-gene_name => "gene name", | |
| 176 -getswissprotinfo => 0); | |
| 177 : $gene=$loader->gene2liveseq(-position => 4); | |
| 178 | |
| 179 Function: creates LiveSeq objects from an entry previously loaded | |
| 180 It is a "light weight" creation because it creates | |
| 181 a LiveSequence just for the interesting region in an entry | |
| 182 (instead than for the total entry, like in entry2liveseq) and for | |
| 183 the flanking regions up to 500 nucleotides (default) or up to | |
| 184 the specified amount of nucleotides (given as argument) around the | |
| 185 Gene. | |
| 186 Returns : reference to a Gene object containing possibly alternative | |
| 187 Transcripts. | |
| 188 Errorcode 0 | |
| 189 Args : string containing the gene name as in the EMBL feature qualifier | |
| 190 integer (optional) "flanking": amount of flanking bases to be kept | |
| 191 boolean (optional) "getswissprotinfo": if set to "0" it will avoid | |
| 192 trying to fetch information from a crossreference to a SwissProt | |
| 193 entry, avoding the process of creation of AARange objects | |
| 194 It is "1" (on) by default | |
| 195 | |
| 196 Alternative to a gene_name, a position can be given: an | |
| 197 integer (1-) containing the position of the desired CDS in the | |
| 198 loaded entry | |
| 199 | |
| 200 =cut | |
| 201 | |
| 202 sub gene2liveseq { | |
| 203 my ($self, %args) = @_; | |
| 204 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); | |
| 205 my $input; | |
| 206 unless (($gene_name)||($cds_position)) { | |
| 207 carp "Gene_Name or Position not specified for gene2liveseq loading function"; | |
| 208 return (0); | |
| 209 } | |
| 210 if (($gene_name)&&($cds_position)) { | |
| 211 carp "Gene_Name and Position cannot be given together, use one"; | |
| 212 return (0); | |
| 213 } elsif ($gene_name) { | |
| 214 $input=$gene_name; | |
| 215 } else { | |
| 216 $input="cds-position:".$cds_position; | |
| 217 } | |
| 218 | |
| 219 if (defined($getswissprotinfo)) { | |
| 220 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { | |
| 221 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; | |
| 222 $getswissprotinfo=0; | |
| 223 } | |
| 224 } else { | |
| 225 $getswissprotinfo=1; | |
| 226 } | |
| 227 | |
| 228 if (defined($flanking)) { | |
| 229 unless ($flanking >= 0) { | |
| 230 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; | |
| 231 return (0); | |
| 232 } | |
| 233 } else { | |
| 234 $flanking=500; # the default flanking length | |
| 235 } | |
| 236 my $hashref=$self->{'hash'}; | |
| 237 unless ($hashref) { return (0); } | |
| 238 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); | |
| 239 unless ($gene) { # if $gene == 0 it means problems in hash2gene | |
| 240 carp "gene2liveseq produced error"; | |
| 241 return (0); | |
| 242 } | |
| 243 return $gene; | |
| 244 } | |
| 245 | |
| 246 # TODO: update so that it will work even if CDS is not only accepted FEATURE!! | |
| 247 # this method is for now deprecated and not supported | |
| 248 sub test_transl { | |
| 249 my ($self,$entry)=@_; | |
| 250 my @features=@{$entry->{'Features'}}; | |
| 251 my @translationobjects=@{$_[1]}; | |
| 252 my ($i,$translation); | |
| 253 my ($obj_transl,$hash_transl); | |
| 254 my @cds=@{$entry->{'CDS'}}; | |
| 255 foreach $translation (@translationobjects) { | |
| 256 $obj_transl=$translation->seq; | |
| 257 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; | |
| 258 #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" | |
| 259 unless ($obj_transl eq $hash_transl) { | |
| 260 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; | |
| 261 carp "\nEntry's transl: ",$hash_transl,"\n"; | |
| 262 carp "\nObject's transl: ",$obj_transl,"\n"; | |
| 263 exit; | |
| 264 } | |
| 265 $i++; | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 # argument: hashref containing the EMBL entry datas, | |
| 270 # getswissprotinfo boolean flag | |
| 271 # creates the liveseq objects | |
| 272 # returns: an array of Translation object references | |
| 273 sub hash2liveseq { | |
| 274 my ($self,$entry,$getswissprotinfo)=@_; | |
| 275 my $i; | |
| 276 my @transcripts; | |
| 277 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); | |
| 278 $dna->alphabet(lc($entry->{'Molecule'})); | |
| 279 $dna->display_id($entry->{'ID'}); | |
| 280 $dna->accession_number($entry->{'AccNumber'}); | |
| 281 $dna->desc($entry->{'Description'}); | |
| 282 my @cds=@{$entry->{'CDS'}}; | |
| 283 my ($swissacc,$swisshash); my @swisshashes; | |
| 284 for $i (0..$#cds) { | |
| 285 #my @transcript=@{$cds[$i]->{'range'}}; | |
| 286 #$transcript=\@transcript; | |
| 287 #push (@transcripts,$transcript); | |
| 288 push (@transcripts,$cds[$i]->{'range'}); | |
| 289 if ($getswissprotinfo) { | |
| 290 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; | |
| 291 $swisshash=$self->get_swisshash($swissacc); | |
| 292 #$self->printswissprot($swisshash); # DEBUG | |
| 293 push (@swisshashes,$swisshash); | |
| 294 } | |
| 295 } | |
| 296 my @translations=($self->transexonscreation($dna,\@transcripts)); | |
| 297 my $translation; my $j=0; | |
| 298 foreach $translation (@translations) { | |
| 299 if ($swisshashes[$j]) { # if not 0 | |
| 300 $self->swisshash2liveseq($swisshashes[$j],$translation); | |
| 301 } | |
| 302 $j++; | |
| 303 } | |
| 304 return (@translations); | |
| 305 } | |
| 306 | |
| 307 # only features pertaining to a specified gene are created | |
| 308 # only the sequence of the gene and appropriate context flanking regions | |
| 309 # are created as chain | |
| 310 # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag | |
| 311 # returns: reference to Gene object | |
| 312 # | |
| 313 # Note: if entry contains just one CDS, all the features get added | |
| 314 # this is useful because often the features in these entries do not | |
| 315 # carry the /gene qualifier | |
| 316 # | |
| 317 # errorcode: 0 | |
| 318 sub hash2gene { | |
| 319 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; | |
| 320 my $entryfeature; | |
| 321 my $genefeatureshash; | |
| 322 | |
| 323 my @cds=@{$entry->{'CDS'}}; | |
| 324 | |
| 325 # checking if a position has been given instead than a gene_name | |
| 326 if (index($input,"cds-position:") == 0 ) { | |
| 327 my $cds_position=substr($input,13); # extracting the cds position | |
| 328 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { | |
| 329 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); | |
| 330 } | |
| 331 } else { | |
| 332 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); | |
| 333 } | |
| 334 | |
| 335 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found | |
| 336 my @genes=$self->genes($entry); | |
| 337 my $cds_number=scalar(@cds); | |
| 338 warn "Warning! Not even one genefeature found for /$input/.... | |
| 339 The genes present in this entry are:\n\t@genes\n | |
| 340 The number of CDS in this entry is:\n\t$cds_number\n"; | |
| 341 return(0); | |
| 342 } | |
| 343 | |
| 344 # get max and min, check flankings | |
| 345 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" | |
| 346 my $seqlength=$entry->{'SeqLength'}; | |
| 347 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" | |
| 348 if ($min-$flanking < 1) { | |
| 349 $mindna=1; | |
| 350 } else { | |
| 351 $mindna=$min-$flanking; | |
| 352 } | |
| 353 if ($max+$flanking > $seqlength) { | |
| 354 $maxdna=$seqlength; | |
| 355 } else { | |
| 356 $maxdna=$max+$flanking; | |
| 357 } | |
| 358 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); | |
| 359 | |
| 360 # create LiveSeq objects | |
| 361 | |
| 362 # create DNA | |
| 363 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); | |
| 364 $dna->alphabet(lc($entry->{'Molecule'})); | |
| 365 $dna->source($entry->{'Organism'}); | |
| 366 $dna->display_id($entry->{'ID'}); | |
| 367 $dna->accession_number($entry->{'AccNumber'}); | |
| 368 $dna->desc($entry->{'Description'}); | |
| 369 | |
| 370 my @transcripts=@{$genefeatureshash->{'transcripts'}}; | |
| 371 # create Translations, Transcripts, Exons out of the CDS | |
| 372 unless (@transcripts) { | |
| 373 cluck "no CDS feature found for /$input/...."; | |
| 374 return(0); | |
| 375 } | |
| 376 my @translationobjs=$self->transexonscreation($dna,\@transcripts); | |
| 377 my @transcriptobjs; | |
| 378 | |
| 379 # get the Transcript obj_refs | |
| 380 my $translation; | |
| 381 my $j=0; | |
| 382 my @ttables=@{$genefeatureshash->{'ttables'}}; | |
| 383 my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; | |
| 384 foreach $translation (@translationobjs) { | |
| 385 push(@transcriptobjs,$translation->get_Transcript); | |
| 386 if ($ttables[$j]) { # if not undef | |
| 387 $translation->get_Transcript->translation_table($ttables[$j]); | |
| 388 #} else { # DEBUG | |
| 389 # print "\n\t\tno translation table information....\n"; | |
| 390 } | |
| 391 if ($swisshashes[$j]) { # if not 0 | |
| 392 $self->swisshash2liveseq($swisshashes[$j],$translation); | |
| 393 } | |
| 394 $j++; | |
| 395 } | |
| 396 | |
| 397 my %gene; # this is the hash to store created object references | |
| 398 $gene{DNA}=$dna; | |
| 399 $gene{Transcripts}=\@transcriptobjs; | |
| 400 $gene{Translations}=\@translationobjs; | |
| 401 | |
| 402 my @exonobjs; my @intronobjs; | |
| 403 my @repeatunitobjs; my @repeatregionobjs; | |
| 404 my @primtranscriptobjs; | |
| 405 | |
| 406 my ($object,$range,$start,$end,$strand); | |
| 407 | |
| 408 my @exons=@{$genefeatureshash->{'exons'}}; | |
| 409 my @exondescs=@{$genefeatureshash->{'exondescs'}}; | |
| 410 if (@exons) { | |
| 411 my $exoncount = 0; | |
| 412 foreach $range (@exons) { | |
| 413 ($start,$end,$strand)=@{$range}; | |
| 414 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 415 if ($object != -1) { | |
| 416 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; | |
| 417 $exoncount++; | |
| 418 push (@exonobjs,$object); | |
| 419 } else { | |
| 420 $exoncount++; | |
| 421 } | |
| 422 } | |
| 423 $gene{Exons}=\@exonobjs; | |
| 424 } | |
| 425 my @introns=@{$genefeatureshash->{'introns'}}; | |
| 426 my @introndescs=@{$genefeatureshash->{'introndescs'}}; | |
| 427 if (@introns) { | |
| 428 my $introncount = 0; | |
| 429 foreach $range (@introns) { | |
| 430 ($start,$end,$strand)=@{$range}; | |
| 431 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 432 if ($object != -1) { | |
| 433 $object->desc($introndescs[$introncount]); | |
| 434 $introncount++; | |
| 435 push (@intronobjs,$object); | |
| 436 } else { | |
| 437 $introncount++; | |
| 438 } | |
| 439 } | |
| 440 $gene{Introns}=\@intronobjs; | |
| 441 } | |
| 442 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; | |
| 443 if (@prim_transcripts) { | |
| 444 foreach $range (@prim_transcripts) { | |
| 445 ($start,$end,$strand)=@{$range}; | |
| 446 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 447 if ($object != -1) { push (@primtranscriptobjs,$object); } | |
| 448 } | |
| 449 $gene{Prim_Transcripts}=\@primtranscriptobjs; | |
| 450 } | |
| 451 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; | |
| 452 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; | |
| 453 if (@repeat_regions) { | |
| 454 my $k=0; | |
| 455 foreach $range (@repeat_regions) { | |
| 456 ($start,$end,$strand)=@{$range}; | |
| 457 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 458 if ($object != -1) { | |
| 459 $object->desc($repeat_regions_family[$k]); | |
| 460 $k++; | |
| 461 push (@repeatregionobjs,$object); | |
| 462 } else { | |
| 463 $k++; | |
| 464 } | |
| 465 } | |
| 466 $gene{Repeat_Regions}=\@repeatregionobjs; | |
| 467 } | |
| 468 my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; | |
| 469 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; | |
| 470 if (@repeat_units) { | |
| 471 my $k=0; | |
| 472 foreach $range (@repeat_units) { | |
| 473 ($start,$end,$strand)=@{$range}; | |
| 474 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 475 if ($object != -1) { | |
| 476 $object->desc($repeat_units_family[$k]); | |
| 477 $k++; | |
| 478 push (@repeatunitobjs,$object); | |
| 479 } else { | |
| 480 $k++; | |
| 481 } | |
| 482 } | |
| 483 $gene{Repeat_Units}=\@repeatunitobjs; | |
| 484 } | |
| 485 | |
| 486 # create the Gene | |
| 487 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos | |
| 488 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, | |
| 489 -upbound=>$min,-downbound=>$max)); | |
| 490 } | |
| 491 | |
| 492 # maybe this function will be moved to general utility package | |
| 493 # argument: array of numbers | |
| 494 # returns: (min,max) numbers in the array | |
| 495 sub rangeofarray { | |
| 496 my $self=shift; | |
| 497 my @array=@_; | |
| 498 #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; | |
| 499 my ($max,$min,$element); | |
| 500 $min=$max=shift(@array); | |
| 501 foreach $element (@array) { | |
| 502 $element = 0 unless defined $element; | |
| 503 if ($element < $min) { | |
| 504 $min=$element; | |
| 505 } | |
| 506 if ($element > $max) { | |
| 507 $max=$element; | |
| 508 } | |
| 509 } | |
| 510 #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; | |
| 511 return ($min,$max); | |
| 512 } | |
| 513 | |
| 514 | |
| 515 # argument: reference to DNA object, reference to array of transcripts | |
| 516 # returns: an array of Translation object references | |
| 517 sub transexonscreation { | |
| 518 my $self=shift; | |
| 519 my $dna=$_[0]; | |
| 520 my @transcripts=@{$_[1]}; | |
| 521 | |
| 522 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); | |
| 523 my $translationobj; | |
| 524 my @translationobjects; | |
| 525 foreach $transcript (@transcripts) { | |
| 526 foreach $exonref (@{$transcript}) { | |
| 527 ($start,$end,$strand)=@{$exonref}; | |
| 528 #print "Creating Exon: start $start end $end strand $strand\n"; | |
| 529 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); | |
| 530 #push (@exonobjects,$exonobj); | |
| 531 push (@transexons,$exonobj); | |
| 532 } | |
| 533 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); | |
| 534 if ($transcriptobj != -1) { | |
| 535 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); | |
| 536 @transexons=(); # cleans it | |
| 537 #push (@transcriptobjects,$transcriptobj); | |
| 538 push (@translationobjects,$translationobj); | |
| 539 } | |
| 540 } | |
| 541 return (@translationobjects); | |
| 542 } | |
| 543 | |
| 544 #sub printgene { | |
| 545 # deleted. Some functionality placed in Gene->printfeaturesnum | |
| 546 | |
| 547 =head2 printswissprot | |
| 548 | |
| 549 Title : printswissprot | |
| 550 Usage : $loader->printswissprot($hashref); | |
| 551 Function: prints out all informations loaded from a database entry into the | |
| 552 loader. Mainly used for testing purposes. | |
| 553 Args : a hashref containing the SWISSPROT entry datas | |
| 554 Note : the hashref can be obtained with a call to the method | |
| 555 $loader->get_swisshash() (only with SRS loader or | |
| 556 BioPerl via Bio::DB::EMBL.pm) | |
| 557 that takes as argument a string like "SWISS-PROT:P10275" or | |
| 558 from $loader->swissprot2hash() that takes an SRS query string | |
| 559 as its argument (e.g. "swissprot-acc:P10275") | |
| 560 | |
| 561 =cut | |
| 562 | |
| 563 # argument: hashref containing the SWISSPROT entry datas | |
| 564 # prints out that hash, showing the informations loaded | |
| 565 sub printswissprot { | |
| 566 my ($self,$entry)=@_; | |
| 567 unless ($entry) { | |
| 568 return; | |
| 569 } | |
| 570 printf "ID: %s\n", | |
| 571 $entry->{'ID'}; | |
| 572 printf "ACC: %s\n", | |
| 573 $entry->{'AccNumber'}; | |
| 574 printf "GENE: %s\n", | |
| 575 $entry->{'Gene'}; | |
| 576 printf "DES: %s\n", | |
| 577 $entry->{'Description'}; | |
| 578 printf "ORG: %s\n", | |
| 579 $entry->{'Organism'}; | |
| 580 printf "SEQLN: %s\n", | |
| 581 $entry->{'SeqLength'}; | |
| 582 printf "SEQ: %s\n", | |
| 583 substr($entry->{'Sequence'},0,64); | |
| 584 if ($entry->{'Features'}) { | |
| 585 my @features=@{$entry->{'Features'}}; | |
| 586 my $i; | |
| 587 for $i (0..$#features) { | |
| 588 print "|",$features[$i]->{'name'},"|"; | |
| 589 print " at ",$features[$i]->{'location'},": "; | |
| 590 print "",$features[$i]->{'desc'},"\n"; | |
| 591 } | |
| 592 } | |
| 593 } | |
| 594 | |
| 595 =head2 printembl | |
| 596 | |
| 597 Title : printembl | |
| 598 Usage : $loader->printembl(); | |
| 599 Function: prints out all informations loaded from a database entry into the | |
| 600 loader. Mainly used for testing purposes. | |
| 601 Args : none | |
| 602 | |
| 603 =cut | |
| 604 | |
| 605 # argument: hashref containing the EMBL entry datas | |
| 606 # prints out that hash, showing the informations loaded | |
| 607 sub printembl { | |
| 608 my ($self,$entry)=@_; | |
| 609 unless ($entry) { | |
| 610 $entry=$self->{'hash'}; | |
| 611 } | |
| 612 my ($i,$featurename); | |
| 613 printf "ID: %s\n", | |
| 614 $entry->{'ID'}; | |
| 615 printf "ACC: %s\n", | |
| 616 $entry->{'AccNumber'}; | |
| 617 printf "MOL: %s\n", | |
| 618 $entry->{'Molecule'}; | |
| 619 printf "DIV: %s\n", | |
| 620 $entry->{'Division'}; | |
| 621 printf "DES: %s\n", | |
| 622 $entry->{'Description'}; | |
| 623 printf "ORG: %s\n", | |
| 624 $entry->{'Organism'}; | |
| 625 printf "SEQLN: %s\n", | |
| 626 $entry->{'SeqLength'}; | |
| 627 printf "SEQ: %s\n", | |
| 628 substr($entry->{'Sequence'},0,64); | |
| 629 my @features=@{$entry->{'Features'}}; | |
| 630 my @cds=@{$entry->{'CDS'}}; | |
| 631 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; | |
| 632 my ($exonref,$transcript); | |
| 633 my ($qualifiernumber,$qualifiers,$key); | |
| 634 my ($start,$end,$strand); | |
| 635 my $j=0; | |
| 636 for $i (0..$#features) { | |
| 637 $featurename=$features[$i]->{'name'}; | |
| 638 if ($featurename eq "CDS") { | |
| 639 print "|CDS| number $j at feature position: $i\n"; | |
| 640 #print $features[$i]->{'location'},"\n"; | |
| 641 $transcript=$features[$i]->{'range'}; | |
| 642 foreach $exonref (@{$transcript}) { | |
| 643 ($start,$end,$strand)=@{$exonref}; | |
| 644 print "\tExon: start $start end $end strand $strand\n"; | |
| 645 } | |
| 646 $j++; | |
| 647 } else { | |
| 648 print "|$featurename| at feature position: $i\n"; | |
| 649 print "\trange: "; | |
| 650 print join("\t",@{$features[$i]->{'range'}}),"\n"; | |
| 651 #print $features[$i]->{'location'},"\n"; | |
| 652 } | |
| 653 $qualifiernumber=$features[$i]->{'qual_number'}; | |
| 654 $qualifiers=$features[$i]->{'qualifiers'}; # hash | |
| 655 foreach $key (keys (%{$qualifiers})) { | |
| 656 print "\t\t",$key,": "; | |
| 657 print $qualifiers->{$key},"\n"; | |
| 658 } | |
| 659 } | |
| 660 } | |
| 661 | |
| 662 =head2 genes | |
| 663 | |
| 664 Title : genes | |
| 665 Usage : $loader->genes(); | |
| 666 Function: Returns an array of gene_names (strings) contained in the loaded | |
| 667 entry. | |
| 668 Args : none | |
| 669 | |
| 670 =cut | |
| 671 | |
| 672 # argument: entryhashref | |
| 673 # returns: array of genenames found in the entry | |
| 674 sub genes { | |
| 675 my ($self,$entry)=@_; | |
| 676 unless ($entry) { | |
| 677 $entry=$self->{'hash'}; | |
| 678 } | |
| 679 my @entryfeatures=@{$entry->{'Features'}}; | |
| 680 my ($genename,$genenames,$entryfeature); | |
| 681 for $entryfeature (@entryfeatures) { | |
| 682 $genename=$entryfeature->{'qualifiers'}->{'gene'}; | |
| 683 if ($genename) { | |
| 684 if (index($genenames,$genename) == -1) { # if name is new | |
| 685 $genenames .= $genename . " "; # add the name | |
| 686 } | |
| 687 } | |
| 688 } | |
| 689 return (split(/ /,$genenames)); # assumes no space inbetween each genename | |
| 690 } | |
| 691 | |
| 692 # arguments: swisshash, translation objref | |
| 693 # adds information to the Translation, creates AARange objects, sets the | |
| 694 # aa_range attribute on the Translation, pointing to those objects | |
| 695 sub swisshash2liveseq { | |
| 696 my ($self,$entry,$translation)=@_; | |
| 697 my $translength=$translation->length; | |
| 698 $translation->desc($translation->desc . $entry->{'Description'}); | |
| 699 $translation->display_id("SWISSPROT:" . $entry->{'ID'}); | |
| 700 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); | |
| 701 $translation->name($entry->{'Gene'}); | |
| 702 $translation->source($entry->{'Organism'}); | |
| 703 my @aarangeobjects; | |
| 704 my ($start,$end,$aarangeobj,$feature); | |
| 705 my @features; my @newfeatures; | |
| 706 if ($entry->{'Features'}) { | |
| 707 @features=@{$entry->{'Features'}}; | |
| 708 } | |
| 709 my $cleavedmet=0; | |
| 710 # check for cleaved Met | |
| 711 foreach $feature (@features) { | |
| 712 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { | |
| 713 $cleavedmet=1; | |
| 714 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence | |
| 715 } else { | |
| 716 push(@newfeatures,$feature); | |
| 717 } | |
| 718 } | |
| 719 | |
| 720 my $swissseq=$entry->{'Sequence'}; | |
| 721 my $liveseqtransl=$translation->seq; | |
| 722 chop $liveseqtransl; # to take away the trailing STOP "*" | |
| 723 my $translated=substr($liveseqtransl,$cleavedmet); | |
| 724 | |
| 725 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met | |
| 726 | |
| 727 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? | |
| 728 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; | |
| 729 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; | |
| 730 return; | |
| 731 } | |
| 732 | |
| 733 #my $i=0; # debug | |
| 734 @features=@newfeatures; | |
| 735 foreach $feature (@features) { | |
| 736 #print "Processing SwissProtFeature: $i\n"; # debug | |
| 737 ($start,$end)=split(/ /,$feature->{'location'}); | |
| 738 # Note: cleavedmet is taken in account for updating numbering | |
| 739 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength); | |
| 740 if ($aarangeobj != -1) { | |
| 741 push(@aarangeobjects,$aarangeobj); | |
| 742 } | |
| 743 # $i++; # debug | |
| 744 } | |
| 745 $translation->{'aa_ranges'}=\@aarangeobjects; | |
| 746 } | |
| 747 | |
| 748 # if there is no SRS support, the default will be to return 0 | |
| 749 # i.e. this function is overridden in SRS package | |
| 750 sub get_swisshash { | |
| 751 return (0); | |
| 752 } | |
| 753 | |
| 754 # Args: $entry hashref, gene_name OR cds_position (undef is used to | |
| 755 # choose between the two), getswissprotinfo boolean flag | |
| 756 # Returns: an hash holding various arrayref used in the hash2gene method | |
| 757 # Function: examines the nucleotide entry, identifying features belonging | |
| 758 # to the gene (defined either by its name or by the position of its CDS in | |
| 759 # the entry) | |
| 760 | |
| 761 sub _findgenefeatures { | |
| 762 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; | |
| 763 | |
| 764 my @entryfeatures=@{$entry->{'Features'}}; | |
| 765 my @exons; my @introns; my @prim_transcripts; my @transcripts; | |
| 766 my @repeat_units; my @repeat_regions; | |
| 767 my @repeat_units_family; my @repeat_regions_family; my $rpt_family; | |
| 768 my $entryfeature; my @genefeatures; | |
| 769 my $desc; my @exondescs; my @introndescs; | |
| 770 | |
| 771 # for swissprot xreference | |
| 772 my ($swissacc,$swisshash); my @swisshashes; | |
| 773 | |
| 774 # for translation_tables | |
| 775 my @ttables; | |
| 776 | |
| 777 # to create labels | |
| 778 my ($name,$exon); | |
| 779 my @range; my @cdsexons; my @labels; | |
| 780 | |
| 781 # maybe here also could be added special case when there is no CDS feature | |
| 782 # in the entry (e.g. tRNA entry -> TOCHECK). | |
| 783 # let's deal with the special case in which there is just one gene per entry | |
| 784 # usually without /gene qualifier | |
| 785 my @cds=@{$entry->{'CDS'}}; | |
| 786 | |
| 787 my $skipgenematch=0; | |
| 788 if (scalar(@cds) == 1) { | |
| 789 #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; | |
| 790 $skipgenematch=1; | |
| 791 } | |
| 792 | |
| 793 my ($cds_begin,$cds_end,$proximity); | |
| 794 if ($cds_position) { # if a position has been requested | |
| 795 my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; | |
| 796 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS | |
| 797 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; | |
| 798 # DEBUG | |
| 799 unless ($skipgenematch) { | |
| 800 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; | |
| 801 } | |
| 802 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS | |
| 803 } | |
| 804 | |
| 805 for $entryfeature (@entryfeatures) { # get only features for the desired gene | |
| 806 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { | |
| 807 push(@genefeatures,$entryfeature); | |
| 808 | |
| 809 my @range=@{$entryfeature->{'range'}}; | |
| 810 $name=$entryfeature->{'name'}; | |
| 811 my %qualifierhash=%{$entryfeature->{'qualifiers'}}; | |
| 812 if ($name eq "CDS") { # that has range containing array of exons | |
| 813 | |
| 814 # swissprot crossindexing (if without SRS support it will fill array | |
| 815 # with zeros and do nothing | |
| 816 if ($getswissprotinfo) { | |
| 817 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; | |
| 818 $swisshash=$self->get_swisshash($swissacc); | |
| 819 #$self->printswissprot($swisshash); # DEBUG | |
| 820 push (@swisshashes,$swisshash); | |
| 821 } | |
| 822 | |
| 823 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified | |
| 824 | |
| 825 # create labels array | |
| 826 for $exon (@range) { | |
| 827 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS | |
| 828 } | |
| 829 push (@transcripts,$entryfeature->{'range'}); | |
| 830 } else { | |
| 831 # "simplifying" the joinedlocation features. I.e. changing them from | |
| 832 # multijoined ones to simple plain start-end features, taking only | |
| 833 # the start of the first "exon" and the end of the last "exon" as | |
| 834 # start and end of the entire feature | |
| 835 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location | |
| 836 @range=($range[0]->[0],$range[-1]->[1]); | |
| 837 } | |
| 838 push(@labels,$range[0],$range[1]); # start and end of every feature | |
| 839 if ($name eq "exon") { | |
| 840 $desc=$entryfeature->{'qualifiers'}->{'number'}; | |
| 841 if ($entryfeature->{'qualifiers'}->{'note'}) { | |
| 842 if ($desc) { | |
| 843 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; | |
| 844 } else { | |
| 845 $desc = $entryfeature->{'qualifiers'}->{'note'}; | |
| 846 } | |
| 847 } | |
| 848 push (@exondescs,$desc || "unknown"); | |
| 849 push(@exons,\@range); | |
| 850 } | |
| 851 if ($name eq "intron") { | |
| 852 $desc=$entryfeature->{'qualifiers'}->{'number'}; | |
| 853 if ($desc) { | |
| 854 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; | |
| 855 } else { | |
| 856 $desc = $entryfeature->{'qualifiers'}->{'note'}; | |
| 857 } | |
| 858 push (@introndescs,$desc || "unknown"); | |
| 859 push(@introns,\@range); | |
| 860 } | |
| 861 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } | |
| 862 if ($name eq "repeat_unit") { push(@repeat_units,\@range); | |
| 863 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; | |
| 864 push (@repeat_units_family,$rpt_family || "unknown"); | |
| 865 } | |
| 866 if ($name eq "repeat_region") { push(@repeat_regions,\@range); | |
| 867 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; | |
| 868 push (@repeat_regions_family,$rpt_family || "unknown"); | |
| 869 } | |
| 870 } | |
| 871 } | |
| 872 } | |
| 873 unless ($gene_name) { $gene_name="cds-position:".$cds_position; } | |
| 874 my %genefeatureshash; | |
| 875 $genefeatureshash{gene_name}=$gene_name; | |
| 876 $genefeatureshash{genefeatures}=\@genefeatures; | |
| 877 $genefeatureshash{labels}=\@labels; | |
| 878 $genefeatureshash{ttables}=\@ttables; | |
| 879 $genefeatureshash{swisshashes}=\@swisshashes; | |
| 880 $genefeatureshash{transcripts}=\@transcripts; | |
| 881 $genefeatureshash{exons}=\@exons; | |
| 882 $genefeatureshash{exondescs}=\@exondescs; | |
| 883 $genefeatureshash{introns}=\@introns; | |
| 884 $genefeatureshash{introndescs}=\@introndescs; | |
| 885 $genefeatureshash{prim_transcripts}=\@prim_transcripts; | |
| 886 $genefeatureshash{repeat_units}=\@repeat_units; | |
| 887 $genefeatureshash{repeat_regions}=\@repeat_regions; | |
| 888 $genefeatureshash{repeat_units_family}=\@repeat_units_family; | |
| 889 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family; | |
| 890 return (\%genefeatureshash); | |
| 891 } | |
| 892 | |
| 893 | |
| 894 # used by _findgenefeatures, when a CDS at a certain position is requested, | |
| 895 # to retrieve only features quite close to the wanted CDS. | |
| 896 # Args: range hashref, begin and end positions of the CDS, $proximity | |
| 897 # $proximity holds the maximum distance between the extremes of the CDS | |
| 898 # and of the feature under exam. | |
| 899 # Returns: boolean | |
| 900 sub _checkfeatureproximity { | |
| 901 my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; | |
| 902 my @range=@{$range}; | |
| 903 my ($begin,$end,$strand); | |
| 904 if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons | |
| 905 ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); | |
| 906 } else { | |
| 907 ($begin,$end,$strand)=@range; | |
| 908 } | |
| 909 if ($cds_begin > $cds_end) { # i.e. reverse strand CDS | |
| 910 ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries | |
| 911 } | |
| 912 if ($strand == -1) { # reverse strand | |
| 913 ($begin,$end)=($end,$begin); # swap boundaries | |
| 914 } | |
| 915 if (($cds_begin-$end)>$proximity) { | |
| 916 carp "--DEBUG-- feature rejected: begin $begin end $end -------"; | |
| 917 return (0); | |
| 918 } | |
| 919 if (($begin-$cds_end)>$proximity) { | |
| 920 carp "--DEBUG-- feature rejected: begin $begin end $end -------"; | |
| 921 return (0); | |
| 922 } | |
| 923 carp "--DEBUG-- feature accepted: begin $begin end $end -------"; | |
| 924 return (1); # otherwise ok, feature considered next to CDS | |
| 925 } | |
| 926 | |
| 927 | |
| 928 # function that calls the external program "align" (on the fasta2 package) | |
| 929 # to create an alignment between two sequences, returning the aligned | |
| 930 # strings and the codes for the identity (:: ::::) | |
| 931 | |
| 932 sub _get_alignment { | |
| 933 my ($self,$seq1,$seq2)=@_; | |
| 934 my $fastafile1="/tmp/tmpfastafile1"; | |
| 935 my $fastafile2="/tmp/tmpfastafile2"; | |
| 936 my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut | |
| 937 my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN | |
| 938 open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment"; | |
| 939 open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment"; | |
| 940 print TMPFASTAFILE1 ">firstseq\n$seq1\n"; | |
| 941 print TMPFASTAFILE2 ">secondseq\n$seq2\n"; | |
| 942 close TMPFASTAFILE1; | |
| 943 close TMPFASTAFILE2; | |
| 944 my $alignment=`$alignprogram`; | |
| 945 my @alignlines=split(/\n/,$alignment); | |
| 946 my ($linecount,$seq1_aligned,$seq2_aligned,$codes); | |
| 947 for ($linecount=0; $linecount < @alignlines; $linecount+=3) { | |
| 948 $seq1_aligned .= $alignlines[$linecount]; | |
| 949 $codes .= $alignlines[$linecount+1]; | |
| 950 $seq2_aligned .= $alignlines[$linecount+2]; | |
| 951 } | |
| 952 return ($seq1_aligned,$seq2_aligned,$codes); | |
| 953 } | |
| 954 | |
| 955 # common part of the function to create a novel liveseq gene structure | |
| 956 # from an amino acid sequence, using codon usage frequencies | |
| 957 # args: codon_usage_array transltableid aasequence gene_name | |
| 958 sub _common_novelaasequence2gene { | |
| 959 my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; | |
| 960 my @species_codon_usage=@{$species_codon_usage}; | |
| 961 my @codon_usage_label= | |
| 962 qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg | |
| 963 tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga | |
| 964 ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa | |
| 965 gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag | |
| 966 tga); | |
| 967 my ($i,$j); | |
| 968 my %codon_usage_value; | |
| 969 my $aa_codon_total; | |
| 970 for ($i=0;$i<64;$i++) { | |
| 971 $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; | |
| 972 } | |
| 973 | |
| 974 my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); | |
| 975 my @aminoacids = split(//,uc($aasequence)); | |
| 976 my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); | |
| 977 for $i (@aminoacids) { | |
| 978 @alt_codons = $CodonTable->revtranslate($i); | |
| 979 unless (@alt_codons) { | |
| 980 carp "No reverse translation possible for aminoacid \'$i\'"; | |
| 981 $dnasequence .= "???"; | |
| 982 } else { | |
| 983 $aa_codon_total=0; | |
| 984 for $j (@alt_codons) { | |
| 985 $aa_codon_total+=$codon_usage_value{$j}; | |
| 986 } | |
| 987 # print "aminoacid $i, codonchoice: "; # verbose | |
| 988 #$partial=0; | |
| 989 #for $j (@alt_codons) { | |
| 990 #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; | |
| 991 #$partial+=($codon_usage_value{$j}/$aa_codon_total); | |
| 992 #} | |
| 993 #print "\n"; | |
| 994 $dice=rand(1); | |
| 995 #print "roulette: $dice\n"; # verbose | |
| 996 $partial=0; | |
| 997 $chosen_codon=""; | |
| 998 CODONCHOICE: | |
| 999 for $j (0..@alt_codons) { # last one not accounted | |
| 1000 $thiscodon=$alt_codons[$j]; | |
| 1001 $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); | |
| 1002 if ($dice < $relativeusage+$partial) { | |
| 1003 $chosen_codon=$thiscodon; | |
| 1004 last CODONCHOICE; | |
| 1005 } else { | |
| 1006 $partial += $relativeusage; | |
| 1007 } | |
| 1008 } | |
| 1009 unless ($chosen_codon) { | |
| 1010 $chosen_codon = $alt_codons[-1]; # the last one | |
| 1011 } | |
| 1012 # print ".....adding $chosen_codon\n"; # verbose | |
| 1013 $dnasequence .= $chosen_codon; | |
| 1014 } | |
| 1015 } | |
| 1016 | |
| 1017 my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); | |
| 1018 my $min=1; | |
| 1019 my $max=length($dnasequence); | |
| 1020 my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); | |
| 1021 my @exons=($exon); | |
| 1022 my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); | |
| 1023 $transcript->translation_table($ttabid); | |
| 1024 my @transcripts=($transcript); | |
| 1025 my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); | |
| 1026 my @translations=($translation); | |
| 1027 my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); | |
| 1028 my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); | |
| 1029 | |
| 1030 # creation of gene | |
| 1031 unless ($gene) { # if $gene == 0 it means problems in hash2gene | |
| 1032 carp "Error in Gene creation phase"; | |
| 1033 return (0); | |
| 1034 } | |
| 1035 return $gene; | |
| 1036 } | |
| 1037 | |
| 1038 1; |
