Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/LiveSeq/IO/Loader.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1038 @@ +# $Id: Loader.pm,v 1.15 2001/10/22 08:22:51 heikki Exp $ +# +# bioperl module for Bio::LiveSeq::IO::Loader +# +# Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> +# +# Copyright Joseph Insana +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq + +=head1 SYNOPSIS + + #documentation needed + +=head1 DESCRIPTION + +This package holds common methods used by BioPerl, SRS and file loaders. +It contains methods to create LiveSeq objects out of entire entries or from a +localized sequence region surrounding a particular gene. + +=head1 AUTHOR - Joseph A.L. Insana + +Email: Insana@ebi.ac.uk, jinsana@gmx.net + +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::LiveSeq::IO::Loader; +$VERSION=4.44; + +# Version history: +# Wed Feb 16 17:55:01 GMT 2000 0.1a was a general EMBL entry printer with SRS +# Wed Mar 29 16:53:30 BST 2000 0.2 rewrote as SRS LiveSeq Loader +# Wed Mar 29 19:11:21 BST 2000 0.2.1 used successfully by liveseq3.pl +# Fri Mar 31 02:33:43 BST 2000 v 1.0 begun wrapping into this package +# Fri Mar 31 03:58:43 BST 2000 v 1.1 finished wrapping +# Fri Mar 31 04:24:50 BST 2000 v 1.2 added test_transl +# Fri Mar 31 04:34:48 BST 2000: noticed problem with K02083, if translation is +# included as valid qualifier name -> investigate +# Fri Mar 31 16:55:07 BST 2000 v 1.21 removed chop in test_transl() +# Mon Apr 3 18:25:27 BST 2000 v 1.3 begun working at lightweight loader +# Mon Apr 3 18:42:30 BST 2000 v 1.31 started changing so that CDS is no more +# the only default feature possibly asked for +# Tue Apr 4 16:19:09 BST 2000 v 1.4 started creating hash2gene +# Tue Apr 4 16:41:56 BST 2000 v 1.42 created location2range and rewritten +# cdslocation2transcript +# Tue Apr 4 18:18:42 BST 2000 v 1.44 finished (maybe) hash2gene +# Tue Apr 4 19:14:33 BST 2000 v 1.49 temporary printgene done. All working :) +# Wed Apr 5 02:04:01 BST 2000 v 1.5 added upbound,downbound to hash2gene +# Wed Apr 5 13:06:43 BST 2000 v 2.0 started obj_oriented and inheritance +# Thu Apr 6 03:11:29 BST 2000 v 2.2 transition from $location to @range +# Thu Apr 6 04:26:04 BST 2000 v 2.3 both SRS and BP work with gene and entry! +# Fri Apr 7 01:47:51 BST 2000 v 2.4 genes() created +# Fri Apr 7 03:01:46 BST 2000 v 2.5 changed hash2gene so that if there is +# just 1 CDS in entry it will use all +# features of the entry as Gene features +# Tue Apr 18 18:14:19 BST 2000 v 3.0 printswissprot added +# Wed Apr 19 22:15:12 BST 2000 v 3.2 swisshash2liveseq created +# Thu Apr 20 00:14:09 BST 2000 v 3.4 swisshash2liveseq updated: now it correctly handles cleaved_met and conflicts/mod_res/variants recorded differences between EMBL and SWISSPROT translations sequences. Still some not-recorded conflicts are possible and in these cases the program won't create the AARange -> this could change in the future, if a better stringcomparison is introduced +# Thu Apr 20 01:14:16 BST 2000 v 3.6 changed entry2liveseq and gene2liveseq to namedargument input format; added getswissprotinfo flag/option +# Thu Apr 20 02:18:58 BST 2000 v 3.7 mRNA added as valid_feature -> it gets recorded as prim_transcript object +# Thu Apr 27 16:19:43 BST 2000 v 3.8 translation_table set added to hash2gene +# Mon May 1 22:16:18 BST 2000 v 3.9 -position option added to gene2liveseq +# Tue May 2 02:43:05 BST 2000 v 4.0 moved some code in _findgenefeatures, added the possibility of using cds_position information, created _checkfeatureproximity +# Tue May 2 03:20:20 BST 2000 v 4.01 findgenefeatures debugged +# Wed May 31 13:59:09 BST 2000 v 4.02 chopped $translated to take away STOP +# Fri Jun 2 14:49:12 BST 2000 v 4.1 prints alignment with CLUSTALW +# Wed Jun 7 02:07:54 BST 2000 v 4.2 added code for "simplifying" joinedlocation features (e.g. join() in mRNA features), changing them to plain start-end ones +# Wed Jun 7 04:20:15 BST 2000 v 4.22 added translation->{'offset'} for INIT_MET +# Tue Jun 27 14:05:19 BST 2000 v. 4.3 added if() conditions so that if new() of object creation failed, the object is not passed on +# Tue Jul 4 14:15:58 BST 2000 v 4.4 note and number qualifier added to exon and intron descriptions +# Wed Jul 12 14:06:38 BST 2000 v 4.41 added if() condition out of transcript creation in transexoncreation() +# Fri Sep 15 15:41:02 BST 2000 v 4.44 created _common_novelaasequence2gene + +# Note: test_transl has been left as deprecated and is not really supported now + +use strict; +use Carp qw(cluck croak carp); +use vars qw($VERSION @ISA); +use Bio::LiveSeq::DNA 1.2; +use Bio::LiveSeq::Exon 1.0; +use Bio::LiveSeq::Transcript 2.4; +use Bio::LiveSeq::Translation 1.4; +use Bio::LiveSeq::Gene 1.1; +use Bio::LiveSeq::Intron 1.0; +use Bio::LiveSeq::Prim_Transcript 1.0; +use Bio::LiveSeq::Repeat_Region 1.0; +use Bio::LiveSeq::Repeat_Unit 1.0; +use Bio::LiveSeq::AARange 1.4; +use Bio::Tools::CodonTable; + +#@ISA=qw(Bio::LiveSeq::); # not useful now + +=head2 entry2liveseq + + Title : entry2liveseq + Usage : @translationobjects=$loader->entry2liveseq(); + : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); + Function: creates LiveSeq objects from an entry previously loaded + Returns : array of references to objects of class Translation + Errorcode 0 + Args : optional boolean flag to avoid the retrieval of SwissProt + informations for all Transcripts containing SwissProt x-reference + default is 1 (to retrieve those informations and create AARange + LiveSeq objects) + Note : this method can get really slow for big entries. The lightweight + gene2liveseq method is recommended + +=cut + +sub entry2liveseq { + my ($self, %args) = @_; + my ($getswissprotinfo)=($args{-getswissprotinfo}); + if (defined($getswissprotinfo)) { + if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { + carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; + $getswissprotinfo=0; + } + } else { + $getswissprotinfo=1; + } + my $hashref=$self->{'hash'}; + unless ($hashref) { return (0); } + my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); + my $test_transl=0; + if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} + return @translationobjects; +} + +=head2 novelaasequence2gene + + Title : novelaasequence2gene + Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); + -taxon => 9606, + -gene_name => "tyr-kinase"); + + Function: creates LiveSeq objects from a novel amino acid sequence, + using codon usage database to choose codons according to + relative frequencies. + If a taxon ID is not specified, the default is to use the human + one (taxonomy ID 9606). + Returns : reference to a Gene object containing references to LiveSeq objects + Errorcode 0 + Args : string containing an amino acid sequence + integer (optional) with a taxonomy ID + string specifying a gene name + +=cut + +=head2 gene2liveseq + + Title : gene2liveseq + Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); + : $gene=$loader->gene2liveseq(-gene_name => "gene name", + -flanking => 64); + : $gene=$loader->gene2liveseq(-gene_name => "gene name", + -getswissprotinfo => 0); + : $gene=$loader->gene2liveseq(-position => 4); + + Function: creates LiveSeq objects from an entry previously loaded + It is a "light weight" creation because it creates + a LiveSequence just for the interesting region in an entry + (instead than for the total entry, like in entry2liveseq) and for + the flanking regions up to 500 nucleotides (default) or up to + the specified amount of nucleotides (given as argument) around the + Gene. + Returns : reference to a Gene object containing possibly alternative + Transcripts. + Errorcode 0 + Args : string containing the gene name as in the EMBL feature qualifier + integer (optional) "flanking": amount of flanking bases to be kept + boolean (optional) "getswissprotinfo": if set to "0" it will avoid + trying to fetch information from a crossreference to a SwissProt + entry, avoding the process of creation of AARange objects + It is "1" (on) by default + + Alternative to a gene_name, a position can be given: an + integer (1-) containing the position of the desired CDS in the + loaded entry + +=cut + +sub gene2liveseq { + my ($self, %args) = @_; + my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); + my $input; + unless (($gene_name)||($cds_position)) { + carp "Gene_Name or Position not specified for gene2liveseq loading function"; + return (0); + } + if (($gene_name)&&($cds_position)) { + carp "Gene_Name and Position cannot be given together, use one"; + return (0); + } elsif ($gene_name) { + $input=$gene_name; + } else { + $input="cds-position:".$cds_position; + } + + if (defined($getswissprotinfo)) { + if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { + carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; + $getswissprotinfo=0; + } + } else { + $getswissprotinfo=1; + } + + if (defined($flanking)) { + unless ($flanking >= 0) { + carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; + return (0); + } + } else { + $flanking=500; # the default flanking length + } + my $hashref=$self->{'hash'}; + unless ($hashref) { return (0); } + my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); + unless ($gene) { # if $gene == 0 it means problems in hash2gene + carp "gene2liveseq produced error"; + return (0); + } + return $gene; +} + +# TODO: update so that it will work even if CDS is not only accepted FEATURE!! +# this method is for now deprecated and not supported +sub test_transl { + my ($self,$entry)=@_; + my @features=@{$entry->{'Features'}}; + my @translationobjects=@{$_[1]}; + my ($i,$translation); + my ($obj_transl,$hash_transl); + my @cds=@{$entry->{'CDS'}}; + foreach $translation (@translationobjects) { + $obj_transl=$translation->seq; + $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; + #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" + unless ($obj_transl eq $hash_transl) { + cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; + carp "\nEntry's transl: ",$hash_transl,"\n"; + carp "\nObject's transl: ",$obj_transl,"\n"; + exit; + } + $i++; + } +} + +# argument: hashref containing the EMBL entry datas, +# getswissprotinfo boolean flag +# creates the liveseq objects +# returns: an array of Translation object references +sub hash2liveseq { + my ($self,$entry,$getswissprotinfo)=@_; + my $i; + my @transcripts; + my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); + $dna->alphabet(lc($entry->{'Molecule'})); + $dna->display_id($entry->{'ID'}); + $dna->accession_number($entry->{'AccNumber'}); + $dna->desc($entry->{'Description'}); + my @cds=@{$entry->{'CDS'}}; + my ($swissacc,$swisshash); my @swisshashes; + for $i (0..$#cds) { + #my @transcript=@{$cds[$i]->{'range'}}; + #$transcript=\@transcript; + #push (@transcripts,$transcript); + push (@transcripts,$cds[$i]->{'range'}); + if ($getswissprotinfo) { + $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; + $swisshash=$self->get_swisshash($swissacc); + #$self->printswissprot($swisshash); # DEBUG + push (@swisshashes,$swisshash); + } + } + my @translations=($self->transexonscreation($dna,\@transcripts)); + my $translation; my $j=0; + foreach $translation (@translations) { + if ($swisshashes[$j]) { # if not 0 + $self->swisshash2liveseq($swisshashes[$j],$translation); + } + $j++; + } + return (@translations); +} + +# only features pertaining to a specified gene are created +# only the sequence of the gene and appropriate context flanking regions +# are created as chain +# arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag +# returns: reference to Gene object +# +# Note: if entry contains just one CDS, all the features get added +# this is useful because often the features in these entries do not +# carry the /gene qualifier +# +# errorcode: 0 +sub hash2gene { + my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; + my $entryfeature; + my $genefeatureshash; + + my @cds=@{$entry->{'CDS'}}; + + # checking if a position has been given instead than a gene_name + if (index($input,"cds-position:") == 0 ) { + my $cds_position=substr($input,13); # extracting the cds position + if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { + $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); + } + } else { + $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); + } + + unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found + my @genes=$self->genes($entry); + my $cds_number=scalar(@cds); + warn "Warning! Not even one genefeature found for /$input/.... + The genes present in this entry are:\n\t@genes\n + The number of CDS in this entry is:\n\t$cds_number\n"; + return(0); + } + + # get max and min, check flankings + my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" + my $seqlength=$entry->{'SeqLength'}; + my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" + if ($min-$flanking < 1) { + $mindna=1; + } else { + $mindna=$min-$flanking; + } + if ($max+$flanking > $seqlength) { + $maxdna=$seqlength; + } else { + $maxdna=$max+$flanking; + } + my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); + + # create LiveSeq objects + + # create DNA + my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); + $dna->alphabet(lc($entry->{'Molecule'})); + $dna->source($entry->{'Organism'}); + $dna->display_id($entry->{'ID'}); + $dna->accession_number($entry->{'AccNumber'}); + $dna->desc($entry->{'Description'}); + + my @transcripts=@{$genefeatureshash->{'transcripts'}}; + # create Translations, Transcripts, Exons out of the CDS + unless (@transcripts) { + cluck "no CDS feature found for /$input/...."; + return(0); + } + my @translationobjs=$self->transexonscreation($dna,\@transcripts); + my @transcriptobjs; + + # get the Transcript obj_refs + my $translation; + my $j=0; + my @ttables=@{$genefeatureshash->{'ttables'}}; + my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; + foreach $translation (@translationobjs) { + push(@transcriptobjs,$translation->get_Transcript); + if ($ttables[$j]) { # if not undef + $translation->get_Transcript->translation_table($ttables[$j]); + #} else { # DEBUG + # print "\n\t\tno translation table information....\n"; + } + if ($swisshashes[$j]) { # if not 0 + $self->swisshash2liveseq($swisshashes[$j],$translation); + } + $j++; + } + + my %gene; # this is the hash to store created object references + $gene{DNA}=$dna; + $gene{Transcripts}=\@transcriptobjs; + $gene{Translations}=\@translationobjs; + + my @exonobjs; my @intronobjs; + my @repeatunitobjs; my @repeatregionobjs; + my @primtranscriptobjs; + + my ($object,$range,$start,$end,$strand); + + my @exons=@{$genefeatureshash->{'exons'}}; + my @exondescs=@{$genefeatureshash->{'exondescs'}}; + if (@exons) { + my $exoncount = 0; + foreach $range (@exons) { + ($start,$end,$strand)=@{$range}; + $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; + $exoncount++; + push (@exonobjs,$object); + } else { + $exoncount++; + } + } + $gene{Exons}=\@exonobjs; + } + my @introns=@{$genefeatureshash->{'introns'}}; + my @introndescs=@{$genefeatureshash->{'introndescs'}}; + if (@introns) { + my $introncount = 0; + foreach $range (@introns) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($introndescs[$introncount]); + $introncount++; + push (@intronobjs,$object); + } else { + $introncount++; + } + } + $gene{Introns}=\@intronobjs; + } + my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; + if (@prim_transcripts) { + foreach $range (@prim_transcripts) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { push (@primtranscriptobjs,$object); } + } + $gene{Prim_Transcripts}=\@primtranscriptobjs; + } + my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; + my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; + if (@repeat_regions) { + my $k=0; + foreach $range (@repeat_regions) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($repeat_regions_family[$k]); + $k++; + push (@repeatregionobjs,$object); + } else { + $k++; + } + } + $gene{Repeat_Regions}=\@repeatregionobjs; + } + my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; + my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; + if (@repeat_units) { + my $k=0; + foreach $range (@repeat_units) { + ($start,$end,$strand)=@{$range}; + $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + if ($object != -1) { + $object->desc($repeat_units_family[$k]); + $k++; + push (@repeatunitobjs,$object); + } else { + $k++; + } + } + $gene{Repeat_Units}=\@repeatunitobjs; + } + + # create the Gene + my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos + return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, + -upbound=>$min,-downbound=>$max)); +} + +# maybe this function will be moved to general utility package +# argument: array of numbers +# returns: (min,max) numbers in the array +sub rangeofarray { + my $self=shift; + my @array=@_; + #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; + my ($max,$min,$element); + $min=$max=shift(@array); + foreach $element (@array) { + $element = 0 unless defined $element; + if ($element < $min) { + $min=$element; + } + if ($element > $max) { + $max=$element; + } + } + #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; + return ($min,$max); +} + + +# argument: reference to DNA object, reference to array of transcripts +# returns: an array of Translation object references +sub transexonscreation { + my $self=shift; + my $dna=$_[0]; + my @transcripts=@{$_[1]}; + + my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); + my $translationobj; + my @translationobjects; + foreach $transcript (@transcripts) { + foreach $exonref (@{$transcript}) { + ($start,$end,$strand)=@{$exonref}; + #print "Creating Exon: start $start end $end strand $strand\n"; + $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); + #push (@exonobjects,$exonobj); + push (@transexons,$exonobj); + } + $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); + if ($transcriptobj != -1) { + $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); + @transexons=(); # cleans it + #push (@transcriptobjects,$transcriptobj); + push (@translationobjects,$translationobj); + } + } + return (@translationobjects); +} + +#sub printgene { +# deleted. Some functionality placed in Gene->printfeaturesnum + +=head2 printswissprot + + Title : printswissprot + Usage : $loader->printswissprot($hashref); + Function: prints out all informations loaded from a database entry into the + loader. Mainly used for testing purposes. + Args : a hashref containing the SWISSPROT entry datas + Note : the hashref can be obtained with a call to the method + $loader->get_swisshash() (only with SRS loader or + BioPerl via Bio::DB::EMBL.pm) + that takes as argument a string like "SWISS-PROT:P10275" or + from $loader->swissprot2hash() that takes an SRS query string + as its argument (e.g. "swissprot-acc:P10275") + +=cut + +# argument: hashref containing the SWISSPROT entry datas +# prints out that hash, showing the informations loaded +sub printswissprot { + my ($self,$entry)=@_; + unless ($entry) { + return; + } + printf "ID: %s\n", + $entry->{'ID'}; + printf "ACC: %s\n", + $entry->{'AccNumber'}; + printf "GENE: %s\n", + $entry->{'Gene'}; + printf "DES: %s\n", + $entry->{'Description'}; + printf "ORG: %s\n", + $entry->{'Organism'}; + printf "SEQLN: %s\n", + $entry->{'SeqLength'}; + printf "SEQ: %s\n", + substr($entry->{'Sequence'},0,64); + if ($entry->{'Features'}) { + my @features=@{$entry->{'Features'}}; + my $i; + for $i (0..$#features) { + print "|",$features[$i]->{'name'},"|"; + print " at ",$features[$i]->{'location'},": "; + print "",$features[$i]->{'desc'},"\n"; + } + } +} + +=head2 printembl + + Title : printembl + Usage : $loader->printembl(); + Function: prints out all informations loaded from a database entry into the + loader. Mainly used for testing purposes. + Args : none + +=cut + +# argument: hashref containing the EMBL entry datas +# prints out that hash, showing the informations loaded +sub printembl { + my ($self,$entry)=@_; + unless ($entry) { + $entry=$self->{'hash'}; + } + my ($i,$featurename); + printf "ID: %s\n", + $entry->{'ID'}; + printf "ACC: %s\n", + $entry->{'AccNumber'}; + printf "MOL: %s\n", + $entry->{'Molecule'}; + printf "DIV: %s\n", + $entry->{'Division'}; + printf "DES: %s\n", + $entry->{'Description'}; + printf "ORG: %s\n", + $entry->{'Organism'}; + printf "SEQLN: %s\n", + $entry->{'SeqLength'}; + printf "SEQ: %s\n", + substr($entry->{'Sequence'},0,64); + my @features=@{$entry->{'Features'}}; + my @cds=@{$entry->{'CDS'}}; + print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; + my ($exonref,$transcript); + my ($qualifiernumber,$qualifiers,$key); + my ($start,$end,$strand); + my $j=0; + for $i (0..$#features) { + $featurename=$features[$i]->{'name'}; + if ($featurename eq "CDS") { + print "|CDS| number $j at feature position: $i\n"; + #print $features[$i]->{'location'},"\n"; + $transcript=$features[$i]->{'range'}; + foreach $exonref (@{$transcript}) { + ($start,$end,$strand)=@{$exonref}; + print "\tExon: start $start end $end strand $strand\n"; + } + $j++; + } else { + print "|$featurename| at feature position: $i\n"; + print "\trange: "; + print join("\t",@{$features[$i]->{'range'}}),"\n"; + #print $features[$i]->{'location'},"\n"; + } + $qualifiernumber=$features[$i]->{'qual_number'}; + $qualifiers=$features[$i]->{'qualifiers'}; # hash + foreach $key (keys (%{$qualifiers})) { + print "\t\t",$key,": "; + print $qualifiers->{$key},"\n"; + } + } +} + +=head2 genes + + Title : genes + Usage : $loader->genes(); + Function: Returns an array of gene_names (strings) contained in the loaded + entry. + Args : none + +=cut + +# argument: entryhashref +# returns: array of genenames found in the entry +sub genes { + my ($self,$entry)=@_; + unless ($entry) { + $entry=$self->{'hash'}; + } + my @entryfeatures=@{$entry->{'Features'}}; + my ($genename,$genenames,$entryfeature); + for $entryfeature (@entryfeatures) { + $genename=$entryfeature->{'qualifiers'}->{'gene'}; + if ($genename) { + if (index($genenames,$genename) == -1) { # if name is new + $genenames .= $genename . " "; # add the name + } + } + } + return (split(/ /,$genenames)); # assumes no space inbetween each genename +} + +# arguments: swisshash, translation objref +# adds information to the Translation, creates AARange objects, sets the +# aa_range attribute on the Translation, pointing to those objects +sub swisshash2liveseq { + my ($self,$entry,$translation)=@_; + my $translength=$translation->length; + $translation->desc($translation->desc . $entry->{'Description'}); + $translation->display_id("SWISSPROT:" . $entry->{'ID'}); + $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); + $translation->name($entry->{'Gene'}); + $translation->source($entry->{'Organism'}); + my @aarangeobjects; + my ($start,$end,$aarangeobj,$feature); + my @features; my @newfeatures; + if ($entry->{'Features'}) { + @features=@{$entry->{'Features'}}; + } + my $cleavedmet=0; + # check for cleaved Met + foreach $feature (@features) { + if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { + $cleavedmet=1; + $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence + } else { + push(@newfeatures,$feature); + } + } + + my $swissseq=$entry->{'Sequence'}; + my $liveseqtransl=$translation->seq; + chop $liveseqtransl; # to take away the trailing STOP "*" + my $translated=substr($liveseqtransl,$cleavedmet); + + my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met + + if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? + print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; + carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; + return; + } + + #my $i=0; # debug + @features=@newfeatures; + foreach $feature (@features) { + #print "Processing SwissProtFeature: $i\n"; # debug + ($start,$end)=split(/ /,$feature->{'location'}); + # Note: cleavedmet is taken in account for updating numbering + $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -desc => $feature->{'desc'}, -translation => $translation, -translength => $translength); + if ($aarangeobj != -1) { + push(@aarangeobjects,$aarangeobj); + } + # $i++; # debug + } + $translation->{'aa_ranges'}=\@aarangeobjects; +} + +# if there is no SRS support, the default will be to return 0 +# i.e. this function is overridden in SRS package +sub get_swisshash { + return (0); +} + +# Args: $entry hashref, gene_name OR cds_position (undef is used to +# choose between the two), getswissprotinfo boolean flag +# Returns: an hash holding various arrayref used in the hash2gene method +# Function: examines the nucleotide entry, identifying features belonging +# to the gene (defined either by its name or by the position of its CDS in +# the entry) + +sub _findgenefeatures { + my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; + + my @entryfeatures=@{$entry->{'Features'}}; + my @exons; my @introns; my @prim_transcripts; my @transcripts; + my @repeat_units; my @repeat_regions; + my @repeat_units_family; my @repeat_regions_family; my $rpt_family; + my $entryfeature; my @genefeatures; + my $desc; my @exondescs; my @introndescs; + + # for swissprot xreference + my ($swissacc,$swisshash); my @swisshashes; + + # for translation_tables + my @ttables; + + # to create labels + my ($name,$exon); + my @range; my @cdsexons; my @labels; + + # maybe here also could be added special case when there is no CDS feature + # in the entry (e.g. tRNA entry -> TOCHECK). + # let's deal with the special case in which there is just one gene per entry + # usually without /gene qualifier + my @cds=@{$entry->{'CDS'}}; + + my $skipgenematch=0; + if (scalar(@cds) == 1) { + #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; + $skipgenematch=1; + } + + my ($cds_begin,$cds_end,$proximity); + if ($cds_position) { # if a position has been requested + my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; + ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS + $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; + # DEBUG + unless ($skipgenematch) { + carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; + } + $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS + } + + for $entryfeature (@entryfeatures) { # get only features for the desired gene + if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { + push(@genefeatures,$entryfeature); + + my @range=@{$entryfeature->{'range'}}; + $name=$entryfeature->{'name'}; + my %qualifierhash=%{$entryfeature->{'qualifiers'}}; + if ($name eq "CDS") { # that has range containing array of exons + + # swissprot crossindexing (if without SRS support it will fill array + # with zeros and do nothing + if ($getswissprotinfo) { + $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; + $swisshash=$self->get_swisshash($swissacc); + #$self->printswissprot($swisshash); # DEBUG + push (@swisshashes,$swisshash); + } + + push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified + + # create labels array + for $exon (@range) { + push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS + } + push (@transcripts,$entryfeature->{'range'}); + } else { + # "simplifying" the joinedlocation features. I.e. changing them from + # multijoined ones to simple plain start-end features, taking only + # the start of the first "exon" and the end of the last "exon" as + # start and end of the entire feature + if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location + @range=($range[0]->[0],$range[-1]->[1]); + } + push(@labels,$range[0],$range[1]); # start and end of every feature + if ($name eq "exon") { + $desc=$entryfeature->{'qualifiers'}->{'number'}; + if ($entryfeature->{'qualifiers'}->{'note'}) { + if ($desc) { + $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; + } else { + $desc = $entryfeature->{'qualifiers'}->{'note'}; + } + } + push (@exondescs,$desc || "unknown"); + push(@exons,\@range); + } + if ($name eq "intron") { + $desc=$entryfeature->{'qualifiers'}->{'number'}; + if ($desc) { + $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; + } else { + $desc = $entryfeature->{'qualifiers'}->{'note'}; + } + push (@introndescs,$desc || "unknown"); + push(@introns,\@range); + } + if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } + if ($name eq "repeat_unit") { push(@repeat_units,\@range); + $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; + push (@repeat_units_family,$rpt_family || "unknown"); + } + if ($name eq "repeat_region") { push(@repeat_regions,\@range); + $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; + push (@repeat_regions_family,$rpt_family || "unknown"); + } + } + } + } + unless ($gene_name) { $gene_name="cds-position:".$cds_position; } + my %genefeatureshash; + $genefeatureshash{gene_name}=$gene_name; + $genefeatureshash{genefeatures}=\@genefeatures; + $genefeatureshash{labels}=\@labels; + $genefeatureshash{ttables}=\@ttables; + $genefeatureshash{swisshashes}=\@swisshashes; + $genefeatureshash{transcripts}=\@transcripts; + $genefeatureshash{exons}=\@exons; + $genefeatureshash{exondescs}=\@exondescs; + $genefeatureshash{introns}=\@introns; + $genefeatureshash{introndescs}=\@introndescs; + $genefeatureshash{prim_transcripts}=\@prim_transcripts; + $genefeatureshash{repeat_units}=\@repeat_units; + $genefeatureshash{repeat_regions}=\@repeat_regions; + $genefeatureshash{repeat_units_family}=\@repeat_units_family; + $genefeatureshash{repeat_regions_family}=\@repeat_regions_family; + return (\%genefeatureshash); +} + + +# used by _findgenefeatures, when a CDS at a certain position is requested, +# to retrieve only features quite close to the wanted CDS. +# Args: range hashref, begin and end positions of the CDS, $proximity +# $proximity holds the maximum distance between the extremes of the CDS +# and of the feature under exam. +# Returns: boolean +sub _checkfeatureproximity { + my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; + my @range=@{$range}; + my ($begin,$end,$strand); + if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons + ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); + } else { + ($begin,$end,$strand)=@range; + } + if ($cds_begin > $cds_end) { # i.e. reverse strand CDS + ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries + } + if ($strand == -1) { # reverse strand + ($begin,$end)=($end,$begin); # swap boundaries + } + if (($cds_begin-$end)>$proximity) { + carp "--DEBUG-- feature rejected: begin $begin end $end -------"; + return (0); + } + if (($begin-$cds_end)>$proximity) { + carp "--DEBUG-- feature rejected: begin $begin end $end -------"; + return (0); + } + carp "--DEBUG-- feature accepted: begin $begin end $end -------"; + return (1); # otherwise ok, feature considered next to CDS +} + + +# function that calls the external program "align" (on the fasta2 package) +# to create an alignment between two sequences, returning the aligned +# strings and the codes for the identity (:: ::::) + +sub _get_alignment { + my ($self,$seq1,$seq2)=@_; + my $fastafile1="/tmp/tmpfastafile1"; + my $fastafile2="/tmp/tmpfastafile2"; + my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut + my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>/dev/null | $grepcut"; # ALIGN + open TMPFASTAFILE1,">$fastafile1" || croak "Cannot write into $fastafile1 for aa alignment"; + open TMPFASTAFILE2,">$fastafile2" || croak "Cannot write into $fastafile1 for aa alignment"; + print TMPFASTAFILE1 ">firstseq\n$seq1\n"; + print TMPFASTAFILE2 ">secondseq\n$seq2\n"; + close TMPFASTAFILE1; + close TMPFASTAFILE2; + my $alignment=`$alignprogram`; + my @alignlines=split(/\n/,$alignment); + my ($linecount,$seq1_aligned,$seq2_aligned,$codes); + for ($linecount=0; $linecount < @alignlines; $linecount+=3) { + $seq1_aligned .= $alignlines[$linecount]; + $codes .= $alignlines[$linecount+1]; + $seq2_aligned .= $alignlines[$linecount+2]; + } + return ($seq1_aligned,$seq2_aligned,$codes); +} + +# common part of the function to create a novel liveseq gene structure +# from an amino acid sequence, using codon usage frequencies +# args: codon_usage_array transltableid aasequence gene_name +sub _common_novelaasequence2gene { + my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; + my @species_codon_usage=@{$species_codon_usage}; + my @codon_usage_label= + qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg + tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga + ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa + gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag + tga); + my ($i,$j); + my %codon_usage_value; + my $aa_codon_total; + for ($i=0;$i<64;$i++) { + $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; + } + + my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); + my @aminoacids = split(//,uc($aasequence)); + my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); + for $i (@aminoacids) { + @alt_codons = $CodonTable->revtranslate($i); + unless (@alt_codons) { + carp "No reverse translation possible for aminoacid \'$i\'"; + $dnasequence .= "???"; + } else { + $aa_codon_total=0; + for $j (@alt_codons) { + $aa_codon_total+=$codon_usage_value{$j}; + } + # print "aminoacid $i, codonchoice: "; # verbose + #$partial=0; + #for $j (@alt_codons) { + #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; + #$partial+=($codon_usage_value{$j}/$aa_codon_total); + #} + #print "\n"; + $dice=rand(1); + #print "roulette: $dice\n"; # verbose + $partial=0; + $chosen_codon=""; + CODONCHOICE: + for $j (0..@alt_codons) { # last one not accounted + $thiscodon=$alt_codons[$j]; + $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); + if ($dice < $relativeusage+$partial) { + $chosen_codon=$thiscodon; + last CODONCHOICE; + } else { + $partial += $relativeusage; + } + } + unless ($chosen_codon) { + $chosen_codon = $alt_codons[-1]; # the last one + } + # print ".....adding $chosen_codon\n"; # verbose + $dnasequence .= $chosen_codon; + } + } + + my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); + my $min=1; + my $max=length($dnasequence); + my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); + my @exons=($exon); + my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); + $transcript->translation_table($ttabid); + my @transcripts=($transcript); + my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); + my @translations=($translation); + my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); + my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); + + # creation of gene + unless ($gene) { # if $gene == 0 it means problems in hash2gene + carp "Error in Gene creation phase"; + return (0); + } + return $gene; +} + +1;