annotate variant_effect_predictor/Bio/LiveSeq/IO/BioPerl.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 # $Id: BioPerl.pm,v 1.15 2001/12/14 16:40:15 heikki Exp $
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 # bioperl module for Bio::LiveSeq::IO::BioPerl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 # Copyright Joseph Insana
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 # POD documentation - main docs before the code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15 Bio::LiveSeq::IO::BioPerl - Loader for LiveSeq from EMBL entries with BioPerl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 my $db="EMBL";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20 my $file="../data/M20132";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 my $id="HSANDREC";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -file=>"$file");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24 or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 my $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"$db", -id=>"$id");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 my @translationobjects=$loader->entry2liveseq();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 my $genename="AR";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 my $gene=$loader->gene2liveseq(-gene_name => "$genename",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 -getswissprotinfo => 0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 NOTE1: The only -db now supported is EMBL. Hence it defaults to EMBL.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 NOTE2: -file requires a filename (and path if necessary) containing an
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 EMBL entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 -id will use Bio::DB::EMBL.pm to fetch the sequence from the web,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 (bioperl wraparound to [w]getz from SRS)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 NOTE3: To retrieve the swissprot (if possible) attached to the embl entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 (to get protein domains at dna level), only Bio::DB::EMBL.pm
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 is supported under BioPerl. Refer to Bio::LiveSeq::IO::SRS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 otherwise.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 NOTE4: NOTE3 is not implemented yet for bioperl, working on it
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 This package uses BioPerl (SeqIO) to fetch a sequence database entry,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 analyse it and create LiveSeq objects out of it.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 A filename (or an ID that will fetch entry through the web) has to be passed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51 to this package which will return references to all translation objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 created from the EMBL entry. References to Transcription, DNA and Exon
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 objects can all be retrieved departing from these.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 Alternatively, a specific "gene" name can be specified, together with
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 the embl-acc ID. This will create a LiveSeq::Gene object with all
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57 relevant gene features attached/created.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 ATTENTION: if web fetching is requested, the package HTTP::Request needs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60 to be installed.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 =head1 AUTHOR - Joseph A.L. Insana
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 Email: Insana@ebi.ac.uk, jinsana@gmx.net
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 Address:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 EMBL Outstation, European Bioinformatics Institute
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 Wellcome Trust Genome Campus, Hinxton
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 Cambs. CB10 1SD, United Kingdom
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 =head1 APPENDIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 The rest of the documentation details each of the object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 methods. Internal methods are usually preceded with a _
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 # Let the code begin...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 package Bio::LiveSeq::IO::BioPerl;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 $VERSION=2.42;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 # Version history:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 # Thu Apr 6 00:25:46 BST 2000 v 1.0 begun
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88 # Thu Apr 6 03:40:36 BST 2000 v 2.0 working
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 # Tue Jul 4 14:07:52 BST 2000 v 2.11 note&number added in val_qual_names
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 # Fri Sep 15 15:41:02 BST 2000 v 2.22 novelaasequence2gene now works without SRS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 # Mon Jan 29 17:40:06 EST 2001 v 2.3 made it work with the new split_location of BioPerl 0.7
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 # Tue Apr 10 17:00:18 BST 2001 v 2.41 started work on support of DB::EMBL.pm
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 # Tue Apr 10 17:22:26 BST 2001 v 2.42 -id should work now
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 # TODO->TOCHECK
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 # each_secondary_access not working
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 # why array from each_tag_value($qual) ? When will there be more than one
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 # element in such array?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 # what is the annotation object? ($seqobj->annotation)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 # unsatisfied by both BioPerl binomial and SRS "org" to retrieve Organism info
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104 use Carp qw(cluck croak carp);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 use vars qw($VERSION @ISA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 use Bio::SeqIO; # for -file entry loading
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 # Note, the following requires HTTP::Request. If the modules are not installed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 # uncomment the following and use only -filename and don't request swissprotinfo
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 use Bio::DB::EMBL; # for -id entry loading
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 use Bio::LiveSeq::IO::Loader 2.0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 @ISA=qw(Bio::LiveSeq::IO::Loader);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 # This package can in the future host other databases loading subroutines.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 # e.g. ensembl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 =head2 load
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 Title : load
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 Usage : my $filename="../data/M20132";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123 $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -file=>"$filename");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125 $loader=Bio::LiveSeq::IO::BioPerl->load(-db=>"EMBL", -id=>"HSANDREC");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 Function: loads an entry with BioPerl from a database into a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 Returns : reference to a new object of class IO::BioPerl holding an entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 Errorcode 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 Args : an filename containing an EMBL entry OR an ID or ACCESSION code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 sub load {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 my ($thing, %args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 my $class = ref($thing) || $thing;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 my ($obj,%loader);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 my ($db,$filename,$id)=($args{-db},$args{-file},$args{-id});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 if (defined($db)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 unless ($db eq "EMBL") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 carp "Note: only EMBL now supported!";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 return(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 $db="EMBL";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 if (defined($id) && defined($filename)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 carp "You can either specify a -id or a -filename!";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152 return(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 unless (defined($id) || defined($filename)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 carp "You must specify either a -id or a -filename!";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 return(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 my $hashref;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 if ($db eq "EMBL") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 my $test_transl=0; # change to 0 to avoid comparison of translation
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 # these can be changed for future needs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 my @embl_valid_feature_names=qw(CDS CDS_span exon prim_transcript intron repeat_unit repeat_region mRNA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 # dunno yet how to implement test_transl again....
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 # probably on a one-on-one basis with each translation?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 if ($test_transl) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 push (@embl_valid_qual_names,"translation"); # needed for test_transl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 my $seqobj; # bioperl sequence object, to be passed to embl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 if (defined($filename)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 my $stream = Bio::SeqIO->new('-file' => $filename, '-format' => 'EMBL');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 $seqobj = $stream->next_seq();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 } else { # i.e. if -id
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 my $embl = new Bio::DB::EMBL;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 $seqobj = $embl->get_Seq_by_id($id); # EMBL ID or ACC
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 $hashref=&embl2hash($seqobj,\@embl_valid_feature_names,\@embl_valid_qual_names);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 unless ($hashref) { return (0); }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 %loader = (db => $db, filename => $filename, id => $id, hash => $hashref);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 $obj = \%loader;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 $obj = bless $obj, $class;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 return $obj;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 =head2 embl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 Title : embl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 Function: retrieves with BioPerl an EMBL entry, parses it and creates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 a hash that contains all the information.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 Returns : a reference to a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 Errorcode: 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201 Args : a BioPerl Sequence Object (from file or web fetching)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 two array references to skip features and qualifiers (for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 performance)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 $hashref=&embl2hash($seqobj,\@valid_features,\@valid_qualifiers);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 # arguments: Bioperl $seqobj
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 # to skip features and qualifiers (for performance), two array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 # references must be passed (this can change into string arguments to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 # be passed....)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 # returns: a reference to a hash containing the important features requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 sub embl2hash {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 my $seqobj=$_[0];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 my %valid_features; my %valid_names;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 if ($_[1]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 if ($_[2]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 my $annobj = $seqobj->annotation(); # what's this?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 my $entry_Sequence = lc($seqobj->seq()); # SRS returns lowercase
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 my $entry_ID = $seqobj->display_id;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 my $entry_AccNumber = $seqobj->accession; # or maybe accession_number ?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 my $secondary_acc; # to fetch the other acc numbers
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 foreach $secondary_acc ($seqobj->get_secondary_accessions) { # not working!
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 $entry_AccNumber .= " $secondary_acc";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 my $entry_Molecule = $seqobj->molecule; # this alone returns molec+division
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 my $entry_Division = $seqobj->division;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 # fixed: now Molecule works in BioPerl, no need for next lines
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 #my @Molecule=split(" ",$entry_Molecule);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 #my $entry_Division = pop(@Molecule); # only division
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 #$entry_Molecule = join(" ",@Molecule); # only molecule
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 my $entry_Description = $seqobj->desc;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 my $speciesobj = $seqobj->species;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 my $entry_Organism = $speciesobj->binomial;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 my $entry_SeqLength = $seqobj->length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 # put into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 my %entryhash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 $entryhash{ID}=$entry_ID;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 $entryhash{AccNumber}=$entry_AccNumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 $entryhash{Molecule}=$entry_Molecule;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 $entryhash{Division}=$entry_Division;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 $entryhash{Description}=$entry_Description;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 $entryhash{Organism}=$entry_Organism;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 $entryhash{Sequence}=$entry_Sequence;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 $entryhash{SeqLength}=$entry_SeqLength;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 my @topfeatures=$seqobj->top_SeqFeatures();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 # create features array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 my $featuresnumber= scalar(@topfeatures);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 $entryhash{FeaturesNumber}=$featuresnumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 my $feature_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 my @feature_qual_names; my @feature_qual_value;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 my ($feature_qual_name,$feature_qual_number);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 my @features;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 my ($feat,$qual,$subfeat);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 my @subfeat;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270 my $i=0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 foreach $feat (@topfeatures) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 my %feature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 $feature_name = $feat->primary_tag;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 unless ($valid_features{$feature_name}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 #print "skipping $feature_name\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 # works ok with 0.6.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 # if ($feature_name eq "CDS_span") { # case of CDS with various exons 0.6.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 # $feature_name="CDS"; # 0.6.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 my $featlocation=$feat->location; # 0.7
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 if (($feature_name eq "CDS")&&($featlocation->isa('Bio::Location::SplitLocationI'))) { # case of CDS with various exons BioPerl 0.7
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 # @subfeat=$feat->sub_SeqFeature; # 0.6.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 @subfeat=$featlocation->sub_Location(); # 0.7
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 my @transcript;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 foreach $subfeat (@subfeat) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 my @range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 if ($subfeat->strand == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 @range=($subfeat->end,$subfeat->start,$subfeat->strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 @range=($subfeat->start,$subfeat->end,$subfeat->strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 push (@transcript,\@range);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 $feature{range}=\@transcript;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 my @range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 ($feat->strand == -1) ? (@range = ($feat->end, $feat->start, $feat->strand) ) :
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 (@range = ( $feat->start,$feat->end,$feat->strand) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 # works ok with 0.6.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 if ($feature_name eq "CDS") { # case of single exon CDS (CDS name but not split location)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 my @transcript=(\@range);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 $feature{range}=\@transcript;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 } else { # all other range features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 $feature{range}=\@range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 $feature{location}="deprecated";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 $feature{position}=$i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 $feature{name}=$feature_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 @feature_qual_names= $feat->all_tags();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 $feature_qual_number= scalar(@feature_qual_names);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 $feature{qual_number}=$feature_qual_number;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 my %feature_qualifiers;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 for $qual (@feature_qual_names) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 $feature_qual_name=$qual;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321 unless ($valid_names{$feature_qual_name}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 @feature_qual_value=$feat->each_tag_value($qual);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 #print "$qual => @feature_qual_value \n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 $feature_qualifiers{$feature_qual_name}=$feature_qual_value[0]; # ?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 # maybe the whole array should be entered, not just the 1st element?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 # what could be the other elements? TOCHECK!
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 $feature{qualifiers}=\%feature_qualifiers;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 push (@features,\%feature); # array of features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 $i++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 $entryhash{Features}=\@features; # put this also into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 my @cds; # array just of CDSs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 for $i (0..$#features) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338 if ($features[$i]->{'name'} eq "CDS") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 push(@cds,$features[$i]);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 $entryhash{CDS}=\@cds; # put this also into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 return (\%entryhash);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 =head2 novelaasequence2gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 Title : novelaasequence2gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 Usage : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 : $gene=Bio::LiveSeq::IO::BioPerl->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 -cusg_data => "58 44 7 29 3 3 480 267 105 143 122 39 144 162 14 59 53 25 233 292 19 113 88 246 28 68 161 231 27 102 128 151 67 60 138 131 48 61 153 19 233 73 150 31 129 38 147 71 138 43 181 81 44 15 255 118 312 392 236 82 20 10 14 141",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 -translation_table => "2",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 -gene_name => "tyr-kinase");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 Function: creates LiveSeq objects from a novel amino acid sequence,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 using codon usage information (loaded from a file) to choose
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359 codons according to relative frequencies.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 If a codon_usage information is not specified,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 the default is to use Homo sapiens data (taxonomy ID 9606).
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 If a translation_table ID is not specified, it will default to 1
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 (standard code).
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 Returns : reference to a Gene object containing references to LiveSeq objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365 Errorcode 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 Args : string containing an amino acid sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 string (optional) with codon usage data (64 integer numbers)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 string (optional) specifying a gene_name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 integer (optional) specifying a translation_table ID
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373 sub novelaasequence2gene {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 my ($self, %args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375 my ($gene_name,$cusg_data,$aasequence,$ttabid)=($args{-gene_name},$args{-cusg_data},$args{-aasequence},$args{-translation_table});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 my @species_codon_usage;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 unless ($aasequence) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 carp "aasequence not given";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 unless ($gene_name) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383 $gene_name="Novel Unknown";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 unless ($ttabid) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 $ttabid=1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 unless ($cusg_data) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389 @species_codon_usage=
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 qw(68664 118404 126679 51100 125600 123646 75667 210903 435317
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 139009 79303 135218 128429 192616 49456 161556 211962 131222
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 162837 213626 69346 140780 182506 219428 76684 189374 173010
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 310626 82647 202329 180955 250410 180001 118798 76398 160764
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 317359 119013 262630 359627 218376 186915 130857 377006 162826
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 113684 317703 441298 287040 245435 174805 133427 134523 108740
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 225633 185619 78463 240138 174021 244236 142435 8187 5913
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 14381); # updated 21Jul2000
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 @species_codon_usage=split(/ /,$cusg_data);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 return ($gene);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 1;