annotate variant_effect_predictor/Bio/Seq/EncodedSeq.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 # $Id: EncodedSeq.pm,v 1.5.2.1 2003/04/28 12:11:57 jason Exp $
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 # BioPerl module for Bio::Seq::EncodedSeq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 # Cared for by Aaron Mackey
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 # Copyright Aaron Mackey
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 # POD documentation - main docs before the code
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 Bio::Seq::EncodedSeq - subtype of L<Bio::LocatableSeq|Bio::LocatableSeq> to store DNA that encodes a protein
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 $obj = new Bio::Seq::EncodedSeq(-seq => $dna,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 -encoding => "CCCCCCCIIIIICCCCC",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 -start => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22 -strand => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 -length => 17);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 # splice out (and possibly revcomp) the coding sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26 $cds = obj->cds;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 # obtain the protein translation of the sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 $prot = $obj->translate;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 # other access/inspection routines as with Bio::LocatableSeq and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 # Bio::SeqI; note that coordinates are relative only to the DNA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 # sequence, not it's implicit encoded protein sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 Bio::Seq::EncodedSeq is a L<Bio::LocatableSeq|Bio::LocatableSeq>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 object that holds a DNA sequence as well as information about the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 coding potential of that DNA sequence. It is meant to be useful in an
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 alignment context, where the DNA may contain frameshifts, gaps and/or
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 introns, or in describing the transcript of a gene. An EncodedSeq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 provides the ability to access the "spliced" coding sequence, meaning
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 that all introns and gaps are removed, and any frameshifts are
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 adjusted to provide a "clean" CDS.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 In order to make simultaneous use of either the DNA or the implicit
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 encoded protein sequence coordinates, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 L<Bio::Coordinate::EncodingPair>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 =head1 ENCODING
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 We use the term "encoding" here to refer to the series of symbols that
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 we use to identify which residues of a DNA sequence are protein-coding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 (i.e. part of a codon), intronic, part of a 5' or 3', frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 "mutations", etc. From this information, a Bio::Seq::EncodedSeq is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 able to "figure out" its translational CDS. There are two sets of
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 coding characters, one termed "implicit" and one termed "explicit".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 The "implict" encoding is a bit simpler than the "explicit" encoding:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 'C' is used for any nucleotide that's part of a codon, 'U' for any
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 UTR, etc. The full list is shown below:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 Code Meaning
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64 ---- -------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 C coding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 I intronic
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 U untranslated
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 G gapped (for use in alignments)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 F a "forward", +1 frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 B a "backward", -1 frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 The "explicit" encoding is just an expansion of the "implicit"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 encoding, to denote phase:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 Code Meaning
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 ---- -------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 C coding, 1st codon position
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 D coding, 2nd codon position
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 E coding, 3rd codon position
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 I intronic, phase 0 (relative to intron begin)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 J intronic, phase 1
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 K intronic, phase 2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 U untranslated 3'UTR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 V untranslated 5'UTR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88 G gapped (for use in alignments)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 F a "forward", +1 frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 B a "backward", -1 frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92 Note that the explicit coding is meant to provide easy access to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 position/phase specific nucleotides:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 $obj = new Bio::Seq::EncodedSeq (-seq => "ACAATCAGACTACG...",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 -encoding => "CCCCCCIII..."
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 # fetch arrays of nucleotides at each codon position:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 my @pos2 = $obj->dnaseq(encoding => 'D');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102 my @pos3 = $obj->dnaseq(encoding => 'E');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 # fetch arrays of "3-1" codon dinucleotides, useful for genomic
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 # signature analyses without compounding influences of codon bias:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 my @pairs = $obj->dnaseq(encoding => 'EC');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 =head1 FEEDBACK
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 =head2 Mailing Lists
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112 User feedback is an integral part of the evolution of this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 and other Bioperl modules. Send your comments and suggestions preferably
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114 to one of the Bioperl mailing lists.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 Your participation is much appreciated.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 bioperl-l@bioperl.org - General discussion
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118 http://www.bioperl.org/MailList.html - About the mailing lists
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 =head2 Reporting Bugs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 Report bugs to the Bioperl bug tracking system to help us keep track
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 the bugs and their resolution.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 Bug reports can be submitted via email or the web:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126 bioperl-bugs@bio.perl.org
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 http://bugzilla.bioperl.org/
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129 =head1 AUTHOR - Aaron Mackey
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131 Email amackey@virginia.edu
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133 =head1 APPENDIX
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 # Let the code begin...
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 package Bio::Seq::EncodedSeq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147 use Bio::LocatableSeq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 @ISA = qw(Bio::LocatableSeq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 =head2 new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153 Title : new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155 -encoding => "CCCCCCFCCCCCC",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 -id => "myseq",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 -start => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158 -end => 13,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 -strand => 1
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 Returns : a new Bio::Seq::EncodedSeq object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 Args : seq - primary nucleotide sequence used to encode the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166 protein; note that any positions involved in a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167 gap ('G') or backward frameshift ('B') should
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 have one or more gap characters; if the encoding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169 specifies G or B, but no (or not enough) gap
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170 characters exist, *they'll be added*; similary,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171 if there are gap characters without a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 corresponding G or B encoding, G's will be
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173 inserted into the encoding. This allows some
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174 flexibility in specifying your sequence and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175 coding without having to calculate a lot of the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 encoding for yourself.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178 encoding - a string of characters (see Encoding Table)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 describing backwards frameshifts implied by the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 encoding but not present in the sequence will be
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181 added (as '-'s) to the sequence. If not
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 supplied, it will be assumed that all positions
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 are coding (C). Encoding may include either
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 implicit phase encoding characters (i.e. "CCC")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185 and/or explicit encoding characters (i.e. "CDE").
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 Additionally, prefixed numbers may be used to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187 denote repetition (i.e. "27C3I28C").
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189 Alternatively, encoding may be a hashref
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 datastructure, with encoding characters as keys
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191 and Bio::LocationI objects (or arrayrefs of
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 Bio::LocationI objects) as values, e.g.:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 { C => [ Bio::Location::Simple->new(1,9),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 Bio::Location::Simple->new(11,13) ],
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196 F => Bio::Location::Simple->new(10,10),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 } # same as "CCCCCCCCCFCCC"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 Note that if the location ranges overlap, the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 behavior of the encoding will be undefined
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201 (well, it will be defined, but only according to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 the order in which the hash keys are read, which
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 is basically undefined ... just don't do that).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 id, start, end, strand - as with Bio::LocatableSeq; note
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 that the coordinates are relative to the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207 encoding DNA sequence, not the implicit protein
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 sequence. If strand is reversed, then the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 encoding is assumed to be relative to the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 reverse strand as well.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 #'
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 sub new {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 my ($self, @args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 $self = $self->SUPER::new(@args, -alphabet => 'dna');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 my ($enc) = $self->_rearrange([qw(ENCODING)], @args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 # set the real encoding:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 if ($enc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223 $self->encoding($enc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 $self->_recheck_encoding;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 return $self;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 =head2 encoding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 Title : encoding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 Usage : $obj->encoding("CCCCCC");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 $obj->encoding( -encoding => { I => $location } );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 $enc = $obj->encoding(-explicit => 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236 $enc = $obj->encoding("CCCCCC", -explicit => 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 $enc = $obj->encoding(-location => $location,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 -explicit => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 -absolute => 1 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240 Function: get/set the objects encoding, either globally or by location(s).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 Returns : the (possibly new) encoding string.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242 Args : encoding - see the encoding argument to the new() function.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244 explicit - whether or not to return explicit phase
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 information in the coding (i.e. "CCC" becomes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246 "CDE", "III" becomes "IJK", etc); defaults to 0.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248 location - optional; location to get/set the encoding.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 Defaults to the entire sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 absolute - whether or not the locational elements (either
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252 in the encoding hashref or the location
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253 argument) are relative to the absolute start/end
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254 of the Bio::LocatableSeq, or to the internal,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 relative coordinate system (beginning at 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256 defaults to 0 (i.e. relative)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260 sub encoding {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262 my ($self, @args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263 my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265 if (!$enc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266 # do nothing; _recheck_encoding will fix for us, if necessary
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267 } elsif (ref $enc eq 'HASH') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 $self->throw( -class => 'Bio::Root::NotImplemented',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269 -text => "Hashref functionality not yet implemented;\nplease email me if you really need this.");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270 #TODO: finish all this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271 while (my ($char, $locs) = each %$enc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272 if (ref $locs eq 'ARRAY') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273 } elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278 } elsif (! ref $enc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279 $enc = uc $enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280 $exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282 if ($enc =~ m/\d/o) { # numerically "enhanced" encoding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283 my $numenc = $enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284 $enc = '';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285 while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 my ($num, $char) = ($1, $2);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287 $num = 1 unless $num;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 $enc .= $char x $num;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292 if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 $self->throw("Unrecognized character '$1' in implicit encoding");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294 } elsif ($enc =~ m/[^CDEIJKUVGFB]/) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295 $self->throw("Unrecognized character '$1' in explicit encoding");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298 if ($loc) { # a global location, over which to apply the specified encoding.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300 # balk if too many non-gap characters present in encoding:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301 my ($ct) = $enc =~ tr/GB/GB/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302 $ct = length($enc) - $ct;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303 $self->throw("Location length doesn't match number of coding chars in encoding!")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 if ($loc->location_type eq 'EXACT' && $loc->length != $ct);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 my $start = $loc->start;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307 my $end = $loc->end;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 # strip any encoding that hangs off the ends of the sequence:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310 if ($start < $self->start) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311 my $diff = $self->start - $start;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312 $start = $self->start;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313 $enc = substr($enc, $diff);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315 if ($end > $self->end) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316 my $diff = $end - $self->end;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317 $end = $self->end;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318 $enc = substr($enc, -$diff);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321 my $currenc = $self->{_encoding};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322 my $currseq = $self->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324 my ($spanstart, $spanend) = ($self->column_from_residue_number($start),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325 $self->column_from_residue_number($end) );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 if ($currseq) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328 # strip any gaps in sequence spanned by this location:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329 my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}})
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330 (.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}})
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331 (.*)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332 /x;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333 $in =~ s/[\.\-]+//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 $currseq = $before . $in . $after;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335 # change seq without changing the alphabet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 $self->seq($currseq,$self->alphabet());
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
338
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
339 $currenc = reverse $currenc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
340 substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
341 $self->strand >= 0 ? $enc : reverse $enc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
342 $currenc = reverse $currenc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344 $self->{_encoding} = $currenc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345 $self->_recheck_encoding;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 $currenc = $self->{_encoding};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348 $currenc = reverse $currenc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 $enc = substr($currenc, $spanstart, length $enc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350 $enc = reverse $enc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
352 return $exp ? $enc: $self->_convert2implicit($enc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
353
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
354 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
355 # presume a global redefinition; strip any current gap
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
356 # characters in the sequence so they don't corrupt the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
357 # encoding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
358 my $dna = $self->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
359 $dna =~ s/[\.\-]//g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
360 $self->seq($dna, 'dna');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
361 $self->{_encoding} = $enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
362 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
363 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
364 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
365 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
366
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
367 $self->_recheck_encoding();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
368
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
369 return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
370 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
371
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
372 sub _convert2implicit {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
373
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
374 my ($self, $enc) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
375
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
376 $enc =~ s/[DE]/C/g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
377 $enc =~ s/[JK]/I/g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
378 $enc =~ s/V/U/g;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
379
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
380 return $enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
381 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
382
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
383 sub _recheck_encoding {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
384
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
385 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
386
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
387 my @enc = split //, ($self->{_encoding} || '');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
388
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
389 my @nt = split(//, $self->SUPER::seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
390 @nt = reverse @nt if $self->strand && $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
391
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
392 # make sure an encoding exists!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
393 @enc = ('C') x scalar grep { !/[\.\-]/o } @nt
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
394 unless @enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
395
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
396 # check for gaps to be truly present in the sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
397 # and vice versa
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
398 my $i;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
399 for ($i = 0 ; $i < @nt && $i < @enc ; $i++) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
400 if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
401 splice(@enc, $i, 0, 'G');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
402 } elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
403 splice(@nt, $i, 0, '-');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
404 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
405 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
406 if ($i < @enc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
407 # extra encoding; presumably all gaps?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
408 for ( ; $i < @enc ; $i++) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
409 if ($enc[$i] =~ m/[GB]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
410 push @nt, '-';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
411 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
412 $self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc]));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
413 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
414 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
415 } elsif ($i < @nt) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
416 for ( ; $i < @nt ; $i++) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
417 if ($nt[$i] =~ m/[\.\-]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
418 push @enc, 'G';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
419 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
420 push @enc, 'C';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
421 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
422 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
423 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
424
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
425 my @cde_array = qw(C D E);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
426 my @ijk_array = qw(I J K);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
427 # convert any leftover implicit coding into explicit coding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
428 my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
429 for ($i = 0 ; $i < @enc ; $i++) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
430 if ($enc[$i] =~ m/[CDE]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
431 my $temp_index = $Cct %3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
432 $enc[$i] = $cde_array[$temp_index];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
433 $Cct++; $Ict = 0; $Uct = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
434 $self->warn("3' untranslated encoding (V) seen prior to other coding symbols")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
435 if ($Vct && !$Vwarned++);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
436 } elsif ($enc[$i] =~ m/[IJK]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
437 $enc[$i] = $ijk_array[$Ict % 3];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
438 $Ict++; $Uct = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
439 $self->warn("3' untranslated encoding (V) seen before other coding symbols")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
440 if ($Vct && !$Vwarned++);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
441 } elsif ($enc[$i] =~ m/[UV]/o) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
442 if ($Uct == 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
443 $enc[$i] = 'V';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
444 $Vct = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
445 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
446 } elsif ($enc[$i] eq 'B') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
447 $Cct++; $Ict++
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
448 } elsif ($enc[$i] eq 'G') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
449 # gap; leave alone
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
450 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
451 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
452
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
453 @nt = reverse @nt if $self->strand && $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
454
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
455 $self->{'seq'} = join('', @nt);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
456 # $self->seq(join('', @nt), 'dna');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
457 $self->{_encoding} = join '', @enc;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
458 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
459
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
460 =head2 cds
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
461
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
462 Title : cds
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
463 Usage : $cds = $obj->cds(-nogaps => 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
464 Function: obtain the "spliced" DNA sequence, by removing any
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
465 nucleotides that participate in an UTR, forward frameshift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
466 or intron, and replacing any unknown nucleotide implied by
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
467 a backward frameshift or gap with N's.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
468 Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
469 of "CCCC..".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
470 Args : nogaps - strip any gap characters (resulting from 'G' or 'B'
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
471 encodings), rather than replacing them with N's.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
472
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
473 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
474
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
475 sub cds {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
476
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
477 my ($self, @args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
478
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
479 my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
480 $nogaps = 0 unless defined $nogaps;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
481
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
482 my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
483 my @enc = split //, $self->_convert2implicit($self->{_encoding});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
484
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
485 my ($start, $end) = (0, scalar @nt);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
486
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
487 if ($loc) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
488 $start = $loc->start;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
489 $start++ if $loc->location_type eq 'IN-BETWEEN';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
490 $start = $self->column_from_residue_number($start);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
491 $start--;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
492
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
493 $end = $loc->end;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
494 $end = $self->column_from_residue_number($end);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
495
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
496 ($start, $end) = ($end, $start) if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
497 $start--;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
498 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
499
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
500 for (my $i = $start ; $i < $end ; $i++) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
501 if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
502 # remove introns, untranslated and forward frameshift nucleotides
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
503 $nt[$i] = undef;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
504 } elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
505 # replace gaps and backward frameshifts with N's, unless asked not to.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
506 $nt[$i] = $nogaps ? undef : 'N';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
507 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
508 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
509
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
510 return ($self->can_call_new ? ref($self) : __PACKAGE__)->new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
511 (-seq => join('', grep { defined } @nt[$start..--$end]),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
512 -start => $self->start,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
513 -end => $self->end,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
514 -strand => 1, -alphabet => 'dna');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
515 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
516
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
517 =head2 translate
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
518
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
519 Title : translate
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
520 Usage : $prot = $obj->translate(@args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
521 Function: obtain the protein sequence encoded by the underlying DNA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
522 sequence; same as $obj->cds()->translate(@args).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
523 Returns : a Bio::PrimarySeq object.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
524 Args : same as the translate() function of Bio::PrimarySeqI
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
525
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
526 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
527
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
528 sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
529
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
530 =head2 protseq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
531
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
532 Title : seq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
533 Usage : $protseq = $obj->protseq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
534 Function: obtain the raw protein sequence encoded by the underlying
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
535 DNA sequence; This is the same as calling
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
536 $obj->translate()->seq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
537 Returns : a string of single-letter amino acid codes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
538 Args : same as the seq() function of Bio::PrimarySeq; note that this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
539 function may not be used to set the protein sequence; see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
540 the dnaseq() function for that.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
541
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
542 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
543
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
544 sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
545
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
546 =head2 dnaseq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
547
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
548 Title : dnaseq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
549 Usage : $dnaseq = $obj->dnaseq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
550 $obj->dnaseq("ACGTGTCGT", "CCCCCCCCC");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
551 $obj->dnaseq(-seq => "ATG",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
552 -encoding => "CCC",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
553 -location => $loc );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
554 @introns = $obj->$dnaseq(-encoding => 'I')
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
555 Function: get/set the underlying DNA sequence; will overwrite any
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
556 current DNA and/or encoding information present.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
557 Returns : a string of single-letter nucleotide codes, including any
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
558 gaps implied by the encoding.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
559 Args : seq - the DNA sequence to be used as a replacement
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
560 encoding - the encoding of the DNA sequence (see the new()
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
561 constructor); defaults to all 'C' if setting a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
562 new DNA sequence. If no new DNA sequence is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
563 being provided, then the encoding is used as a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
564 "filter" for which to return fragments of
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
565 non-overlapping DNA that match the encoding.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
566 location - optional, the location of the DNA sequence to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
567 get/set; defaults to the entire sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
568
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
569 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
570
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
571 sub dnaseq {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
572
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
573 my ($self, @args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
574 my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
575
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
576 $self
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
577
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
578 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
579
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
580 # need to overload this so that we truncate both the seq and the encoding!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
581 sub trunc {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
582
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
583 my ($self, $start, $end) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
584 my $new = $self->SUPER::trunc($start, $end);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
585 $start--;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
586 my $enc = $self->{_encoding};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
587 $enc = reverse $enc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
588 $enc = substr($enc, $start, $end - $start);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
589 $enc = reverse $enc if $self->strand < 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
590 $new->encoding($enc);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
591 return $new;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
592 }