annotate variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 # bioperl module for Bio::LiveSeq::IO::SRS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 # Copyright Joseph Insana
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 # POD documentation - main docs before the code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 my $db="EMBL";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20 my $acc_id="M20132";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 my $query="embl-acc:$acc_id";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 my @translationobjects=$loader->entry2liveseq();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 my $gene="AR";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 my $gene=$loader->gene2liveseq("gene");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 database entry, analyse it and create LiveSeq objects out of it.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 An embl-acc ID has to be passed to this package which will return references
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 to all translation objects created from the EMBL entry.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 References to Transcription, DNA and Exon objects can all be retrieved departing
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 from these.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 Alternatively, a specific "gene" name can be specified, together with the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 features attached/created.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 =head1 AUTHOR - Joseph A.L. Insana
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 Email: Insana@ebi.ac.uk, jinsana@gmx.net
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 Address:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 EMBL Outstation, European Bioinformatics Institute
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 Wellcome Trust Genome Campus, Hinxton
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 Cambs. CB10 1SD, United Kingdom
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 =head1 APPENDIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 The rest of the documentation details each of the object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 methods. Internal methods are usually preceded with a _
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 # Let the code begin...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 package Bio::LiveSeq::IO::SRS;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 $VERSION=2.4;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 # Version history:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 use Carp qw(cluck croak carp);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 use vars qw($VERSION @ISA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 use lib $ENV{SRSEXE};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 use srsperl;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 use Bio::Tools::CodonTable; # for novelaasequence2gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 use Bio::LiveSeq::IO::Loader 2.2;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 @ISA=qw(Bio::LiveSeq::IO::Loader);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 # This package can in the future host other databases loading subroutines.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 # e.g. ensembl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 =head2 load
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 Title : load
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 Usage : my $acc_id="M20132";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 my $query="embl-acc:$acc_id";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 Function: loads an entry with SRS from a database into a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104 Returns : reference to a new object of class IO::SRS holding an entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 Errorcode 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 Args : an SRS query resulting in one fetched EMBL (by default) entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 sub load {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111 my ($thing, %args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 my $class = ref($thing) || $thing;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 my ($obj,%loader);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 my ($db,$query)=($args{-db},$args{-query});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 if (defined($db)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 unless ($db eq "EMBL") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 carp "Note: only EMBL now supported!";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 return(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123 $db="EMBL";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126 my $hashref;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 if ($db eq "EMBL") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 my $test_transl=0; # change to 0 to avoid comparison of translation
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 # these can be changed for future needs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 # dunno yet how to implement test_transl again....
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 # probably on a one-on-one basis with each translation?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 if ($test_transl) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 push (@embl_valid_qual_names,"translation"); # needed for test_transl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 # not to have the whole program die because of SRS death
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 eval {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 my $errormsg;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 if ($@) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146 foreach $errormsg (split(/\n/,$@)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 if (index($errormsg,"in cleanup") == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 carp "SRS EMBL Loader: $@";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 unless ($hashref) { return (0); }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 %loader = (db => $db, query => $query, hash => $hashref);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 $obj = \%loader;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 $obj = bless $obj, $class;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 return $obj;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 =head2 embl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 Title : embl2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 Function: retrieves with SRS an EMBL entry, parses it and creates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165 a hash that contains all the information.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 Returns : a reference to a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 Errorcode: 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 Args : an SRS query resulting in one fetched EMBL entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169 i.e. a string in a form like "embl-acc:accession_number"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 two array references to skip features and qualifiers (for
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 performance)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 # this has to be defined here as it is the only thing really proper to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 # the main functions will be in the Loader.pm
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 # arguments: embl SRS query resulting in one fetched EMBL entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 # to skip features and qualifiers (for performance), two array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183 # references must be passed (this can change into string arguments to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 # be passed....)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 # returns: a reference to a hash containing the important features requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 sub embl2hash {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 my ($i,$j);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 my $query=$_[0];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 my %valid_features; my %valid_names;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 if ($_[1]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 if ($_[2]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 # SRS API used to fetch all relevant fields
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 my $sess = new Session;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 my $set = $sess->query("[$query]", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 my $numEntries=$set->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 if ($numEntries < 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201 carp "No sequence entry found for the query $query";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 } elsif ($numEntries > 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 carp "Too many entries found for the input given";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 my $entry = $set->getEntry(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 # Fetch what we can fetch without the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 $entry_ID = $entry->fieldToString("id","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 $entry_AccNumber = $entry->fieldToString("acc","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 $entry_Molecule = $entry->fieldToString("mol","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 $entry_Division = $entry->fieldToString("div","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 $entry_Description = $entry->fieldToString("des","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 $entry_Description =~ s/\n/ /g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 $entry_Organism = $entry->fieldToString("org","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 $entry_SeqLength = $entry->fieldToString("sl","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 # Now use the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 my $loadedentry = $entry->load("EMBL");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 # Fetch the rest via the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 $entry_Sequence = $loadedentry->attrStr("sequence");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 $entry_Sequence =~ s/\n//g; # from plain format to raw string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 # put into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 my %entryhash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226 $entryhash{ID}=$entry_ID;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 $entryhash{AccNumber}=$entry_AccNumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 $entryhash{Molecule}=$entry_Molecule;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 $entryhash{Division}=$entry_Division;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 $entryhash{Description}=$entry_Description;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 $entryhash{Organism}=$entry_Organism;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 $entryhash{Sequence}=$entry_Sequence;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 $entryhash{SeqLength}=$entry_SeqLength;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 # create features array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 my $features = $loadedentry->attrObjList("features");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 my $featuresnumber= $features->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 $entryhash{FeaturesNumber}=$featuresnumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 my ($feature,$feature_name,$feature_location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 my ($feature_qual_names,$feature_qual_values);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 my ($feature_qual_name,$feature_qual_value);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 my ($feature_qual_number,$feature_qual_number2);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 my @features;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 for $i (0..$featuresnumber-1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 my %feature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 $feature = $features->get($i);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 $feature_name = $feature->attrStr("FtKey");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 #unless ($feature_name eq "CDS") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 unless ($valid_features{$feature_name}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 #print "not valid feature: $feature_name\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 #print "now processing feature: $feature_name\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 $feature_location = $feature->attrStr("FtLocation");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 $feature_location =~ s/\n//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 $feature_qual_names= $feature->attrStrList("FtQualNames");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 $feature_qual_values= $feature->attrStrList("FtQualValues");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 $feature_qual_number= $feature_qual_names->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 if ($feature_qual_number > $feature_qual_number2) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 # this can happen, e.g. "/partial", let's move on, just issue a warning
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 #return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 } elsif ($feature_qual_number < $feature_qual_number2) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 # Put things inside hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 $feature{position}=$i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 $feature{name}=$feature_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 # new range field in hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276 my @range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 if ($feature_name eq "CDS") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 @range=cdslocation2transcript($feature_location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 $feature{locationtype}="joined";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 if (index($feature_location,"join") == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 @range=location2range($feature_location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 } else { # complex location in feature different than CDS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 @range=joinedlocation2range($feature_location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 $feature{locationtype}="joined";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 $feature{range}=\@range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 $feature{location}="deprecated";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 my %feature_qualifiers;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 for $j (0..$feature_qual_number-1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 $feature_qual_name=$feature_qual_names->get($j);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 $feature_qual_name =~ s/^\///; # eliminates heading "/"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 # skip all not interesting (for now) qualifiers
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 unless ($valid_names{$feature_qual_name}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 #print "not valid name: $feature_qual_name\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 #print "now processing: $feature_qual_name\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 $feature_qual_value=$feature_qual_values->get($j);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 $feature_qual_value =~ s/^"//; # eliminates heading "
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 $feature_qual_value =~ s/"$//; # eliminates trailing "
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 $feature_qual_value =~ s/\n//g; # eliminates all new lines
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 $feature{qualifiers}=\%feature_qualifiers;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 push (@features,\%feature); # array of features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 $entryhash{Features}=\@features; # put this also into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 my @cds; # array just of CDSs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 for $i (0..$#features) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 if ($features[$i]->{'name'} eq "CDS") {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 push(@cds,$features[$i]);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 $entryhash{CDS}=\@cds; # put this also into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 return (\%entryhash);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 # argument: location field of an EMBL feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 # returns: array with correct $start $end and $strand to create LiveSeq obj
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 sub location2range {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 my ($location)=@_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 my ($start,$end,$strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 if (index($location,"complement") == -1) { # forward strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 $strand=1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 } else { # reverse strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 $location =~ s/complement\(//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 $location =~ s/\)//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 $strand=-1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 $location =~ s/\<//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 $location =~ s/\>//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 my @range=split(/\.\./,$location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338 $start=$end=$range[0];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 if ($strand == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 ($start,$end)=@range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 } else { # reverse strand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 ($end,$start)=@range;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 return ($start,$end,$strand);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 # argument: location field of a CDS feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 sub cdslocation2transcript {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 my ($location)=@_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 my @exonlocs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 my $exonloc;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 my @exon;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 my @transcript=();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 $location =~ s/^join\(//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358 $location =~ s/\)$//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359 @exonlocs = split (/\,/,$location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 for $exonloc (@exonlocs) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 my @exon=location2range($exonloc);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 push (@transcript,\@exon);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 return (@transcript);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 # argument: location field of a CDS feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 sub joinedlocation2range {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 &cdslocation2transcript;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374 =head2 get_swisshash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376 Title : get_swisshash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 Usage : $loader->get_swisshash();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 Function: retrieves with SRS a SwissProt entry, parses it and creates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380 a hash that contains all the information.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 Returns : a reference to a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 Errorcode: 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383 Args : the db_xref qualifier's value from an EMBL CDS Feature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 i.e. a string in the form "SWISS-PROT:accession_number"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 Note : this can be modified (adding a second argument) to retrieve
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 and parse SWTREMBL, SWALL... entries
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 # argument: db_xref qualifier's value from EMBL CDS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 # errorcode: 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 # returns hashref
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 sub get_swisshash {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 my ($self,$query)=@_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 if (index($query,"SWISS-PROT") == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 $query =~ s/SWISS-PROT/swissprot-acc/;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 my $hashref;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 eval {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 $hashref=&swissprot2hash("$query");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 my $errormsg;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 if ($@) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 foreach $errormsg (split(/\n/,$@)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 if (index($errormsg,"in cleanup") == -1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 carp "SRS Swissprot Loader: $@";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 unless ($hashref) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 return ($hashref);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 =head2 swissprot2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420 Title : swissprot2hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 Usage : $loader->swissprot2hash();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423 Function: retrieves with SRS a SwissProt entry, parses it and creates
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424 a hash that contains all the information.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 Returns : a reference to a hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 Errorcode: 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427 Args : an SRS query resulting in one entry from SwissProt database
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 i.e. a string in the form "swissprot-acc:accession_number"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429 Note : this can be modified (adding a second argument) to retrieve
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 and parse SWTREMBL, SWALL... entries
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434 # arguments: swissprot SRS query resulting in one fetched swissprot entry
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 # returns: a reference to a hash containing the important features requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436 sub swissprot2hash {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437 my ($i,$j);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438 my $query=$_[0];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439 # SRS API used to fetch all relevant fields
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440 my $sess = new Session;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441 my $set = $sess->query("[$query]", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442 my $numEntries = $set->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443 if ($numEntries < 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444 carp "No sequence entry found for the query $query";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
445 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
446 } elsif ($numEntries > 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
447 carp "Too many entries found for the input given";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
448 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
449 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
450 my $entry = $set->getEntry(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
452 # Fetch what we can fetch without the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
453 $entry_ID = $entry->fieldToString("id","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
454 $entry_AccNumber = $entry->fieldToString("acc","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
455 $entry_Gene = $entry->fieldToString("gen","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
456 $entry_Gene =~ s/\n/ /g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
457 $entry_Description = $entry->fieldToString("des","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
458 $entry_Description =~ s/\n/ /g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
459 $entry_Organism = $entry->fieldToString("org","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
460 chop $entry_Organism;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
461 $entry_SeqLength = $entry->fieldToString("sl","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
462 # Now use the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
463 my $loadedentry = $entry->load("Swissprot");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
464 # Fetch the rest via the loader
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
465 $entry_Sequence = $loadedentry->attrStr("Sequence");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
466 $entry_Sequence =~ s/\n//g; # from plain format to raw string
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
467
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
468 # put into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
469 my %entryhash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
470 $entryhash{ID}=$entry_ID;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
471 $entryhash{AccNumber}=$entry_AccNumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
472 $entryhash{Molecule}=$entry_Molecule;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
473 $entryhash{Gene}=$entry_Gene;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
474 $entryhash{Description}=$entry_Description;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
475 $entryhash{Organism}=$entry_Organism;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
476 $entryhash{Sequence}=$entry_Sequence;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
477 $entryhash{SeqLength}=$entry_SeqLength;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
478
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
479 # create features array
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
480 my $features = $loadedentry->attrObjList("Features");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
481 my $featuresnumber= $features->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
482 $entryhash{FeaturesNumber}=$featuresnumber;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
483 my ($feature,$feature_name,$feature_description,$feature_location);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
484 my @features;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
485 for $i (0..$featuresnumber-1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
486 my %feature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
487 $feature = $features->get($i);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
488 $feature_name = $feature->attrStr("FtKey");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
489 $feature_location = $feature->attrStr("FtLocation");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
490 $feature_location =~ s/ +/ /g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
491 $feature_description = $feature->attrStr("FtDescription");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
492 chop $feature_description;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
493 $feature_description =~ s/\nFT / /g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
494
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
495 # Put things inside hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
496 $feature{position}=$i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
497 $feature{name}=$feature_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
498 $feature{location}=$feature_location;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
499 $feature{description}=$feature_description;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
500
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
501 push (@features,\%feature); # array of features
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
502 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
503 $entryhash{Features}=\@features; # put this also into the hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
504 return (\%entryhash);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
505 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
506 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
507
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
508 =head2 novelaasequence2gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
509
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
510 Title : novelaasequence2gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
513 -genome => "Homo sapiens");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
515 -genome => "Mitochondrion Homo sapiens",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
516 -gene_name => "tyr-kinase");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
517
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
518 Function: creates LiveSeq objects from a novel amino acid sequence,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
519 using codon usage database to choose codons according to
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
520 relative frequencies.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
521 If a genome latin name is not specified, the default is to use
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
522 'Homo sapiens' (taxonomy ID 9606).
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
523 Returns : reference to a Gene object containing references to LiveSeq objects
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
524 Errorcode 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
525 Args : string containing an amino acid sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
526 string (optional) with a species/genome latin name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
527 string specifying a gene name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
528 Note : SRS access to TAXON and CODONUSAGE databases is required
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
529
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
530 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
531
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
532 sub novelaasequence2gene {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
533 my ($self, %args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
535 unless ($aasequence) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
536 carp "aasequence not given";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
537 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
538 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
539 unless ($gene_name) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
540 $gene_name="Novel Unknown";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
541 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
542 unless ($species_name) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
543 $species_name="Homo sapiens";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
544 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
545
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
546 my $sess = new Session;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
547 my ($e,$numEntries,$set);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
548
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
549 # codonusage lookup
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
550 my $codonusage_usage;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
551 my @species_codon_usage;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
552 $set = $sess->query("[codonusage:'$species_name']", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
553 $numEntries = $set->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
554 if ($numEntries > 0) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
555 $e = $set->getEntry(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
556 $species_name = $e->fieldToString("id","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
557 $codonusage_usage = $e->fieldToString("usage","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
559 if ($numEntries > 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
561 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
562 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
563 carp "Genome not found in codon usage DB.";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
564 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
565 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
566
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
567 # taxonomy lookup
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
568 my $mito_flag = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
569 my $species_origin;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
570 if ($species_name =~ /^Mitochondrion /) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
571 $mito_flag = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
572 $species_origin = $species_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
573 $species_origin =~ s/^Mitochondrion //;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
574 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
576 $species_origin = $species_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
577 $species_origin =~ s/^Chromoplast //;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
578 $species_origin =~ s/^Kinetoplast //;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
579 $species_origin =~ s/^Chloroplast //;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
580 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
581 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
582 $set = $sess->query("[taxonomy-species:'$species_name']", "");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
583 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
584 $numEntries = $set->size();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
586 if ($numEntries > 0) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
587 $e = $set->getEntry(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
588 $taxonomy_id = $e->fieldToString("id","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
589 $taxonomy_name = $e->fieldToString("species","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
590 $taxonomy_gc = $e->fieldToString("gc","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
591 $taxonomy_mgc = $e->fieldToString("mgc","","","");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
592 if ($numEntries > 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
594 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
595 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
596 carp "Genome not found in taxonomy DB.";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
597 return (0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
598 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
599 # Lookup appropriate translation table
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
600 my $ttabid;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
601 if ($mito_flag) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
602 $ttabid = $taxonomy_mgc;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
603 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
604 $ttabid = $taxonomy_gc;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
605 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
606
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
608 return ($gene);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
609 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
610
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
611 1;