annotate variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 # bioperl module for Bio::LiveSeq::IO::SRS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 # Copyright Joseph Insana
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 # POD documentation - main docs before the code
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 my $db="EMBL";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 my $acc_id="M20132";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 my $query="embl-acc:$acc_id";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 my @translationobjects=$loader->entry2liveseq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 my $gene="AR";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 my $gene=$loader->gene2liveseq("gene");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35 database entry, analyse it and create LiveSeq objects out of it.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 An embl-acc ID has to be passed to this package which will return references
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 to all translation objects created from the EMBL entry.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 References to Transcription, DNA and Exon objects can all be retrieved departing
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 from these.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 Alternatively, a specific "gene" name can be specified, together with the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 features attached/created.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 =head1 AUTHOR - Joseph A.L. Insana
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 Email: Insana@ebi.ac.uk, jinsana@gmx.net
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 Address:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 EMBL Outstation, European Bioinformatics Institute
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 Wellcome Trust Genome Campus, Hinxton
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 Cambs. CB10 1SD, United Kingdom
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 =head1 APPENDIX
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 The rest of the documentation details each of the object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 methods. Internal methods are usually preceded with a _
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 # Let the code begin...
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 package Bio::LiveSeq::IO::SRS;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 $VERSION=2.4;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 # Version history:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 use Carp qw(cluck croak carp);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 use vars qw($VERSION @ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 use lib $ENV{SRSEXE};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 use srsperl;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 use Bio::Tools::CodonTable; # for novelaasequence2gene
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 use Bio::LiveSeq::IO::Loader 2.2;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 @ISA=qw(Bio::LiveSeq::IO::Loader);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 # This package can in the future host other databases loading subroutines.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 # e.g. ensembl2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 =head2 load
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98 Title : load
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 Usage : my $acc_id="M20132";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 my $query="embl-acc:$acc_id";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 Function: loads an entry with SRS from a database into a hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 Returns : reference to a new object of class IO::SRS holding an entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 Errorcode 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 Args : an SRS query resulting in one fetched EMBL (by default) entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 sub load {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 my ($thing, %args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112 my $class = ref($thing) || $thing;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 my ($obj,%loader);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 my ($db,$query)=($args{-db},$args{-query});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 if (defined($db)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118 unless ($db eq "EMBL") {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 carp "Note: only EMBL now supported!";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 return(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 $db="EMBL";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126 my $hashref;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 if ($db eq "EMBL") {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128 my $test_transl=0; # change to 0 to avoid comparison of translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130 # these can be changed for future needs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134 # dunno yet how to implement test_transl again....
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135 # probably on a one-on-one basis with each translation?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 if ($test_transl) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 push (@embl_valid_qual_names,"translation"); # needed for test_transl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 # not to have the whole program die because of SRS death
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141 eval {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143 };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144 my $errormsg;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 if ($@) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146 foreach $errormsg (split(/\n/,$@)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147 if (index($errormsg,"in cleanup") == -1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148 carp "SRS EMBL Loader: $@";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153 unless ($hashref) { return (0); }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155 %loader = (db => $db, query => $query, hash => $hashref);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 $obj = \%loader;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 $obj = bless $obj, $class;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158 return $obj;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 =head2 embl2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 Title : embl2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164 Function: retrieves with SRS an EMBL entry, parses it and creates
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 a hash that contains all the information.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166 Returns : a reference to a hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167 Errorcode: 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 Args : an SRS query resulting in one fetched EMBL entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169 i.e. a string in a form like "embl-acc:accession_number"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170 two array references to skip features and qualifiers (for
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171 performance)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178 # this has to be defined here as it is the only thing really proper to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 # the main functions will be in the Loader.pm
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181 # arguments: embl SRS query resulting in one fetched EMBL entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 # to skip features and qualifiers (for performance), two array
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 # references must be passed (this can change into string arguments to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 # be passed....)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185 # returns: a reference to a hash containing the important features requested
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 sub embl2hash {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187 my ($i,$j);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188 my $query=$_[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189 my %valid_features; my %valid_names;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 if ($_[1]) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193 if ($_[2]) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196 # SRS API used to fetch all relevant fields
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 my $sess = new Session;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198 my $set = $sess->query("[$query]", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 my $numEntries=$set->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 if ($numEntries < 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201 carp "No sequence entry found for the query $query";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 } elsif ($numEntries > 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204 carp "Too many entries found for the input given";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207 my $entry = $set->getEntry(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 # Fetch what we can fetch without the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 $entry_ID = $entry->fieldToString("id","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 $entry_AccNumber = $entry->fieldToString("acc","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 $entry_Molecule = $entry->fieldToString("mol","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213 $entry_Division = $entry->fieldToString("div","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 $entry_Description = $entry->fieldToString("des","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215 $entry_Description =~ s/\n/ /g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 $entry_Organism = $entry->fieldToString("org","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 $entry_SeqLength = $entry->fieldToString("sl","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 # Now use the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 my $loadedentry = $entry->load("EMBL");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 # Fetch the rest via the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 $entry_Sequence = $loadedentry->attrStr("sequence");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 $entry_Sequence =~ s/\n//g; # from plain format to raw string
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224 # put into the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 my %entryhash;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226 $entryhash{ID}=$entry_ID;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 $entryhash{AccNumber}=$entry_AccNumber;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 $entryhash{Molecule}=$entry_Molecule;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229 $entryhash{Division}=$entry_Division;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 $entryhash{Description}=$entry_Description;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 $entryhash{Organism}=$entry_Organism;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 $entryhash{Sequence}=$entry_Sequence;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 $entryhash{SeqLength}=$entry_SeqLength;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 # create features array
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236 my $features = $loadedentry->attrObjList("features");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 my $featuresnumber= $features->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 $entryhash{FeaturesNumber}=$featuresnumber;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 my ($feature,$feature_name,$feature_location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240 my ($feature_qual_names,$feature_qual_values);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 my ($feature_qual_name,$feature_qual_value);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242 my ($feature_qual_number,$feature_qual_number2);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 my @features;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244 for $i (0..$featuresnumber-1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 my %feature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246 $feature = $features->get($i);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 $feature_name = $feature->attrStr("FtKey");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 #unless ($feature_name eq "CDS") {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250 unless ($valid_features{$feature_name}) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 #print "not valid feature: $feature_name\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252 next;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254 #print "now processing feature: $feature_name\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 $feature_location = $feature->attrStr("FtLocation");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256 $feature_location =~ s/\n//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257 $feature_qual_names= $feature->attrStrList("FtQualNames");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258 $feature_qual_values= $feature->attrStrList("FtQualValues");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259 $feature_qual_number= $feature_qual_names->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261 if ($feature_qual_number > $feature_qual_number2) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263 # this can happen, e.g. "/partial", let's move on, just issue a warning
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264 #return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265 } elsif ($feature_qual_number < $feature_qual_number2) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271 # Put things inside hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272 $feature{position}=$i;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273 $feature{name}=$feature_name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275 # new range field in hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 my @range;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277 if ($feature_name eq "CDS") {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278 @range=cdslocation2transcript($feature_location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279 $feature{locationtype}="joined";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281 if (index($feature_location,"join") == -1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282 @range=location2range($feature_location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283 } else { # complex location in feature different than CDS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284 @range=joinedlocation2range($feature_location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285 $feature{locationtype}="joined";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 $feature{range}=\@range;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289 $feature{location}="deprecated";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290 my %feature_qualifiers;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291 for $j (0..$feature_qual_number-1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292 $feature_qual_name=$feature_qual_names->get($j);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 $feature_qual_name =~ s/^\///; # eliminates heading "/"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295 # skip all not interesting (for now) qualifiers
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296 unless ($valid_names{$feature_qual_name}) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297 #print "not valid name: $feature_qual_name\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298 next;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300 #print "now processing: $feature_qual_name\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301 $feature_qual_value=$feature_qual_values->get($j);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302 $feature_qual_value =~ s/^"//; # eliminates heading "
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303 $feature_qual_value =~ s/"$//; # eliminates trailing "
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 $feature_qual_value =~ s/\n//g; # eliminates all new lines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307 $feature{qualifiers}=\%feature_qualifiers;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308 push (@features,\%feature); # array of features
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310 $entryhash{Features}=\@features; # put this also into the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311 my @cds; # array just of CDSs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312 for $i (0..$#features) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313 if ($features[$i]->{'name'} eq "CDS") {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314 push(@cds,$features[$i]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317 $entryhash{CDS}=\@cds; # put this also into the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318 return (\%entryhash);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322 # argument: location field of an EMBL feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323 # returns: array with correct $start $end and $strand to create LiveSeq obj
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324 sub location2range {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325 my ($location)=@_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326 my ($start,$end,$strand);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 if (index($location,"complement") == -1) { # forward strand
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328 $strand=1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329 } else { # reverse strand
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330 $location =~ s/complement\(//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331 $location =~ s/\)//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332 $strand=-1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 $location =~ s/\<//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335 $location =~ s/\>//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 my @range=split(/\.\./,$location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
338 $start=$end=$range[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
339 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
340 if ($strand == 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
341 ($start,$end)=@range;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
342 } else { # reverse strand
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343 ($end,$start)=@range;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346 return ($start,$end,$strand);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 # argument: location field of a CDS feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351 sub cdslocation2transcript {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
352 my ($location)=@_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
353 my @exonlocs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
354 my $exonloc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
355 my @exon;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
356 my @transcript=();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
357 $location =~ s/^join\(//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
358 $location =~ s/\)$//;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
359 @exonlocs = split (/\,/,$location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
360 for $exonloc (@exonlocs) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
361 my @exon=location2range($exonloc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
362 push (@transcript,\@exon);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
363 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
364 return (@transcript);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
365 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
366
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
367 # argument: location field of a CDS feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
369 sub joinedlocation2range {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
370 &cdslocation2transcript;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
371 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
372
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
373
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
374 =head2 get_swisshash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
375
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
376 Title : get_swisshash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
377 Usage : $loader->get_swisshash();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
379 Function: retrieves with SRS a SwissProt entry, parses it and creates
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
380 a hash that contains all the information.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
381 Returns : a reference to a hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
382 Errorcode: 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
383 Args : the db_xref qualifier's value from an EMBL CDS Feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
384 i.e. a string in the form "SWISS-PROT:accession_number"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
385 Note : this can be modified (adding a second argument) to retrieve
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
386 and parse SWTREMBL, SWALL... entries
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
387
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
388 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
389
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
390 # argument: db_xref qualifier's value from EMBL CDS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
391 # errorcode: 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
392 # returns hashref
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
393 sub get_swisshash {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
394 my ($self,$query)=@_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
395 if (index($query,"SWISS-PROT") == -1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
396 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
397 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
398 $query =~ s/SWISS-PROT/swissprot-acc/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
399 my $hashref;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
400 eval {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
401 $hashref=&swissprot2hash("$query");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
402 };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
403 my $errormsg;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
404 if ($@) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
405 foreach $errormsg (split(/\n/,$@)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
406 if (index($errormsg,"in cleanup") == -1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
407 carp "SRS Swissprot Loader: $@";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
408 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
409 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
410 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
411 unless ($hashref) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
412 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
413 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
414 return ($hashref);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
415 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
416 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
417
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
418 =head2 swissprot2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
419
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
420 Title : swissprot2hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
421 Usage : $loader->swissprot2hash();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
423 Function: retrieves with SRS a SwissProt entry, parses it and creates
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
424 a hash that contains all the information.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
425 Returns : a reference to a hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
426 Errorcode: 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
427 Args : an SRS query resulting in one entry from SwissProt database
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
428 i.e. a string in the form "swissprot-acc:accession_number"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
429 Note : this can be modified (adding a second argument) to retrieve
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
430 and parse SWTREMBL, SWALL... entries
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
431
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
432 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
433
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
434 # arguments: swissprot SRS query resulting in one fetched swissprot entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
435 # returns: a reference to a hash containing the important features requested
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
436 sub swissprot2hash {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
437 my ($i,$j);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
438 my $query=$_[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
439 # SRS API used to fetch all relevant fields
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
440 my $sess = new Session;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
441 my $set = $sess->query("[$query]", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
442 my $numEntries = $set->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
443 if ($numEntries < 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
444 carp "No sequence entry found for the query $query";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
445 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
446 } elsif ($numEntries > 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
447 carp "Too many entries found for the input given";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
448 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
449 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
450 my $entry = $set->getEntry(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
452 # Fetch what we can fetch without the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
453 $entry_ID = $entry->fieldToString("id","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
454 $entry_AccNumber = $entry->fieldToString("acc","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
455 $entry_Gene = $entry->fieldToString("gen","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
456 $entry_Gene =~ s/\n/ /g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
457 $entry_Description = $entry->fieldToString("des","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
458 $entry_Description =~ s/\n/ /g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
459 $entry_Organism = $entry->fieldToString("org","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
460 chop $entry_Organism;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
461 $entry_SeqLength = $entry->fieldToString("sl","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
462 # Now use the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
463 my $loadedentry = $entry->load("Swissprot");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
464 # Fetch the rest via the loader
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
465 $entry_Sequence = $loadedentry->attrStr("Sequence");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
466 $entry_Sequence =~ s/\n//g; # from plain format to raw string
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
467
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
468 # put into the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
469 my %entryhash;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
470 $entryhash{ID}=$entry_ID;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
471 $entryhash{AccNumber}=$entry_AccNumber;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
472 $entryhash{Molecule}=$entry_Molecule;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
473 $entryhash{Gene}=$entry_Gene;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
474 $entryhash{Description}=$entry_Description;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
475 $entryhash{Organism}=$entry_Organism;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
476 $entryhash{Sequence}=$entry_Sequence;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
477 $entryhash{SeqLength}=$entry_SeqLength;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
478
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
479 # create features array
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
480 my $features = $loadedentry->attrObjList("Features");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
481 my $featuresnumber= $features->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
482 $entryhash{FeaturesNumber}=$featuresnumber;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
483 my ($feature,$feature_name,$feature_description,$feature_location);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
484 my @features;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
485 for $i (0..$featuresnumber-1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
486 my %feature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
487 $feature = $features->get($i);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
488 $feature_name = $feature->attrStr("FtKey");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
489 $feature_location = $feature->attrStr("FtLocation");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
490 $feature_location =~ s/ +/ /g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
491 $feature_description = $feature->attrStr("FtDescription");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
492 chop $feature_description;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
493 $feature_description =~ s/\nFT / /g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
494
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
495 # Put things inside hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
496 $feature{position}=$i;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
497 $feature{name}=$feature_name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
498 $feature{location}=$feature_location;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
499 $feature{description}=$feature_description;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
500
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
501 push (@features,\%feature); # array of features
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
502 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
503 $entryhash{Features}=\@features; # put this also into the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
504 return (\%entryhash);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
505 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
506 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
507
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
508 =head2 novelaasequence2gene
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
509
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
510 Title : novelaasequence2gene
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
513 -genome => "Homo sapiens");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
515 -genome => "Mitochondrion Homo sapiens",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
516 -gene_name => "tyr-kinase");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
517
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
518 Function: creates LiveSeq objects from a novel amino acid sequence,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
519 using codon usage database to choose codons according to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
520 relative frequencies.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
521 If a genome latin name is not specified, the default is to use
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
522 'Homo sapiens' (taxonomy ID 9606).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
523 Returns : reference to a Gene object containing references to LiveSeq objects
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
524 Errorcode 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
525 Args : string containing an amino acid sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
526 string (optional) with a species/genome latin name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
527 string specifying a gene name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
528 Note : SRS access to TAXON and CODONUSAGE databases is required
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
529
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
530 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
531
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
532 sub novelaasequence2gene {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
533 my ($self, %args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
535 unless ($aasequence) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
536 carp "aasequence not given";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
537 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
538 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
539 unless ($gene_name) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
540 $gene_name="Novel Unknown";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
541 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
542 unless ($species_name) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
543 $species_name="Homo sapiens";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
544 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
545
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
546 my $sess = new Session;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
547 my ($e,$numEntries,$set);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
548
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
549 # codonusage lookup
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
550 my $codonusage_usage;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
551 my @species_codon_usage;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
552 $set = $sess->query("[codonusage:'$species_name']", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
553 $numEntries = $set->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
554 if ($numEntries > 0) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
555 $e = $set->getEntry(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
556 $species_name = $e->fieldToString("id","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
557 $codonusage_usage = $e->fieldToString("usage","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
559 if ($numEntries > 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
561 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
562 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
563 carp "Genome not found in codon usage DB.";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
564 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
565 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
566
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
567 # taxonomy lookup
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
568 my $mito_flag = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
569 my $species_origin;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
570 if ($species_name =~ /^Mitochondrion /) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
571 $mito_flag = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
572 $species_origin = $species_name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
573 $species_origin =~ s/^Mitochondrion //;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
574 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
576 $species_origin = $species_name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
577 $species_origin =~ s/^Chromoplast //;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
578 $species_origin =~ s/^Kinetoplast //;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
579 $species_origin =~ s/^Chloroplast //;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
580 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
581 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
582 $set = $sess->query("[taxonomy-species:'$species_name']", "");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
583 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
584 $numEntries = $set->size();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
586 if ($numEntries > 0) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
587 $e = $set->getEntry(0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
588 $taxonomy_id = $e->fieldToString("id","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
589 $taxonomy_name = $e->fieldToString("species","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
590 $taxonomy_gc = $e->fieldToString("gc","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
591 $taxonomy_mgc = $e->fieldToString("mgc","","","");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
592 if ($numEntries > 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
594 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
595 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
596 carp "Genome not found in taxonomy DB.";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
597 return (0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
598 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
599 # Lookup appropriate translation table
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
600 my $ttabid;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
601 if ($mito_flag) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
602 $ttabid = $taxonomy_mgc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
603 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
604 $ttabid = $taxonomy_gc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
605 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
606
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
608 return ($gene);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
609 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
610
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
611 1;