comparison variant_effect_predictor/Bio/SeqIO/gcg.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: gcg.pm,v 1.21 2002/10/25 16:22:01 jason Exp $
2 #
3 # BioPerl module for Bio::SeqIO::gcg
4 #
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
7 #
8 # Copyright Ewan Birney & Lincoln Stein
9 #
10 # You may distribute this module under the same terms as perl itself
11 #
12 # _history
13 # October 18, 1999 Largely rewritten by Lincoln Stein
14
15 # POD documentation - main docs before the code
16
17 =head1 NAME
18
19 Bio::SeqIO::gcg - GCG sequence input/output stream
20
21 =head1 SYNOPSIS
22
23 Do not use this module directly. Use it via the Bio::SeqIO class.
24
25 =head1 DESCRIPTION
26
27 This object can transform Bio::Seq objects to and from GCG flat
28 file databases.
29
30 =head1 FEEDBACK
31
32 =head2 Mailing Lists
33
34 User feedback is an integral part of the evolution of this
35 and other Bioperl modules. Send your comments and suggestions preferably
36 to one of the Bioperl mailing lists.
37 Your participation is much appreciated.
38
39 bioperl-l@bioperl.org - General discussion
40 http://www.bioperl.org/MailList.shtml - About the mailing lists
41
42 =head2 Reporting Bugs
43
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 the bugs and their resolution. Bug reports can be submitted via email
46 or the web:
47
48 bioperl-bugs@bio.perl.org
49 http://bugzilla.bioperl.org/
50
51 =head1 AUTHORS - Ewan Birney & Lincoln Stein
52
53 Email: E<lt>birney@ebi.ac.ukE<gt>
54 E<lt>lstein@cshl.orgE<gt>
55
56 =head1 CONTRIBUTORS
57
58 Jason Stajich, jason@bioperl.org
59
60 =head1 APPENDIX
61
62 The rest of the documentation details each of the object
63 methods. Internal methods are usually preceded with a _
64
65 =cut
66
67 # Let the code begin...
68
69 package Bio::SeqIO::gcg;
70 use vars qw(@ISA);
71 use strict;
72
73 use Bio::SeqIO;
74 use Bio::Seq::SeqFactory;
75
76 @ISA = qw(Bio::SeqIO);
77
78 sub _initialize {
79 my($self,@args) = @_;
80 $self->SUPER::_initialize(@args);
81 if( ! defined $self->sequence_factory ) {
82 $self->sequence_factory(new Bio::Seq::SeqFactory
83 (-verbose => $self->verbose(),
84 -type => 'Bio::Seq::RichSeq'));
85 }
86 }
87
88 =head2 next_seq
89
90 Title : next_seq
91 Usage : $seq = $stream->next_seq()
92 Function: returns the next sequence in the stream
93 Returns : Bio::Seq object
94 Args :
95
96 =cut
97
98 sub next_seq {
99 my ($self,@args) = @_;
100 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
101
102 while( defined($_ = $self->_readline()) ) {
103
104 ## Get the descriptive info (anything before the line with '..')
105 unless( /\.\.$/ ) { $desc.= $_; }
106 ## Pull ID, Checksum & Type from the line containing '..'
107 /\.\.$/ && do { $line = $_; chomp;
108 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
109 if(/Type:\s(\w)\s/) { $type = $1; }
110 if(/(\S+)\s+Length/)
111 { $id = $1; }
112 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
113 { $len = $1; $date = $2;}
114 last;
115 }
116 }
117 return if ( !defined $_);
118 chomp($desc); # remove last "\n"
119
120 while( defined($_ = $self->_readline()) ) {
121
122 ## This is where we grab the sequence info.
123
124 if( /\.\.$/ ) {
125 $self->throw("Looks like start of another sequence. See documentation. ");
126 }
127
128 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
129 s/[^a-zA-Z]//g; ## remove anything that is not alphabet char
130 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
131 $sequence .= $_;
132 }
133 ##If we parsed out a checksum, we might as well test it
134
135 if(defined $chksum) {
136 unless(_validate_checksum($sequence,$chksum)) {
137 $self->throw("Checksum failure on parsed sequence.");
138 }
139 }
140
141 ## Remove whitespace from identifier because the constructor
142 ## will throw a warning otherwise...
143 if(defined $id) { $id =~ s/\s+//g;}
144
145 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
146 ## keyword that the constructor expects...
147 if(defined $type) {
148 if($type eq "N") { $type = "dna"; }
149 if($type eq "P") { $type = "prot"; }
150 }
151
152 return $self->sequence_factory->create(-seq => $sequence,
153 -id => $id,
154 -desc => $desc,
155 -type => $type,
156 -dates => [ $date ]
157 );
158 }
159
160 =head2 write_seq
161
162 Title : write_seq
163 Usage : $stream->write_seq(@seq)
164 Function: writes the formatted $seq object into the stream
165 Returns : 1 for success and 0 for error
166 Args : array of Bio::PrimarySeqI object
167
168
169 =cut
170
171 sub write_seq {
172 my ($self,@seq) = @_;
173 for my $seq (@seq) {
174 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
175 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
176
177 my $str = $seq->seq;
178 my $comment = $seq->desc;
179 my $id = $seq->id;
180 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
181 my $timestamp;
182
183 if( $seq->can('get_dates') ) {
184 ($timestamp) = $seq->get_dates;
185 } else {
186 $timestamp = localtime(time);
187 }
188 my($sum,$offset,$len,$i,$j,$cnt,@out);
189
190 $len = length($str);
191 ## Set the offset if we have any non-standard numbering going on
192 $offset=1;
193 # checksum
194 $sum = $self->GCG_checksum($seq);
195
196 #Output the sequence header info
197 push(@out,"$comment\n");
198 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
199
200 #Format the sequence
201 $i = $#out + 1;
202 for($j = 0 ; $j < $len ; ) {
203 if( $j % 50 == 0) {
204 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
205 }
206 $out[$i] .= sprintf("%s",substr($str,$j,10));
207 $j += 10;
208 if( $j < $len && $j % 50 != 0 ) {
209 $out[$i] .= " ";
210 }elsif($j % 50 == 0 ) {
211 $out[$i++] .= "\n\n";
212 }
213 }
214 local($^W) = 0;
215 if($j % 50 != 0 ) {
216 $out[$i] .= "\n";
217 }
218 $out[$i] .= "\n";
219 return unless $self->_print(@out);
220 }
221
222 $self->flush if $self->_flush_on_write && defined $self->_fh;
223 return 1;
224 }
225
226 =head2 GCG_checksum
227
228 Title : GCG_checksum
229 Usage : $cksum = $gcgio->GCG_checksum($seq);
230 Function : returns a gcg checksum for the sequence specified
231
232 This method can also be called as a class method.
233 Example :
234 Returns : a GCG checksum string
235 Argument : a Bio::PrimarySeqI implementing object
236
237 =cut
238
239 sub GCG_checksum {
240 my ($self,$seqobj) = @_;
241 my $index = 0;
242 my $checksum = 0;
243 my $char;
244
245 my $seq = $seqobj->seq();
246 $seq =~ tr/a-z/A-Z/;
247
248 foreach $char ( split(/[\.\-]*/, $seq)) {
249 $index++;
250 $checksum += ($index * (unpack("c",$char) || 0) );
251 if( $index == 57 ) {
252 $index = 0;
253 }
254 }
255
256 return ($checksum % 10000);
257 }
258
259 =head2 _validate_checksum
260
261 Title : _validate_checksum
262 Usage : n/a - internal method
263 Function: if parsed gcg sequence contains a checksum field
264 : we compare it to a value computed here on the parsed
265 : sequence. A checksum mismatch would indicate some
266 : type of parsing failure occured.
267 :
268 Returns : 1 for success, 0 for failure
269 Args : string containing parsed seq, value of parsed cheksum
270
271
272 =cut
273
274 sub _validate_checksum {
275 my($seq,$parsed_sum) = @_;
276 my($i,$len,$computed_sum,$cnt);
277
278 $len = length($seq);
279
280 #Generate the GCG Checksum value
281
282 for($i=0; $i<$len ;$i++) {
283 $cnt++;
284 $computed_sum += $cnt * ord(substr($seq,$i,1));
285 ($cnt == 57) && ($cnt=0);
286 }
287 $computed_sum %= 10000;
288
289 ## Compare and decide if success or failure
290
291 if($parsed_sum == $computed_sum) {
292 return 1;
293 } else { return 0; }
294
295
296 }
297
298 1;