Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/gcg.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: gcg.pm,v 1.21 2002/10/25 16:22:01 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SeqIO::gcg | |
| 4 # | |
| 5 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
| 6 # and Lincoln Stein <lstein@cshl.org> | |
| 7 # | |
| 8 # Copyright Ewan Birney & Lincoln Stein | |
| 9 # | |
| 10 # You may distribute this module under the same terms as perl itself | |
| 11 # | |
| 12 # _history | |
| 13 # October 18, 1999 Largely rewritten by Lincoln Stein | |
| 14 | |
| 15 # POD documentation - main docs before the code | |
| 16 | |
| 17 =head1 NAME | |
| 18 | |
| 19 Bio::SeqIO::gcg - GCG sequence input/output stream | |
| 20 | |
| 21 =head1 SYNOPSIS | |
| 22 | |
| 23 Do not use this module directly. Use it via the Bio::SeqIO class. | |
| 24 | |
| 25 =head1 DESCRIPTION | |
| 26 | |
| 27 This object can transform Bio::Seq objects to and from GCG flat | |
| 28 file databases. | |
| 29 | |
| 30 =head1 FEEDBACK | |
| 31 | |
| 32 =head2 Mailing Lists | |
| 33 | |
| 34 User feedback is an integral part of the evolution of this | |
| 35 and other Bioperl modules. Send your comments and suggestions preferably | |
| 36 to one of the Bioperl mailing lists. | |
| 37 Your participation is much appreciated. | |
| 38 | |
| 39 bioperl-l@bioperl.org - General discussion | |
| 40 http://www.bioperl.org/MailList.shtml - About the mailing lists | |
| 41 | |
| 42 =head2 Reporting Bugs | |
| 43 | |
| 44 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 45 the bugs and their resolution. Bug reports can be submitted via email | |
| 46 or the web: | |
| 47 | |
| 48 bioperl-bugs@bio.perl.org | |
| 49 http://bugzilla.bioperl.org/ | |
| 50 | |
| 51 =head1 AUTHORS - Ewan Birney & Lincoln Stein | |
| 52 | |
| 53 Email: E<lt>birney@ebi.ac.ukE<gt> | |
| 54 E<lt>lstein@cshl.orgE<gt> | |
| 55 | |
| 56 =head1 CONTRIBUTORS | |
| 57 | |
| 58 Jason Stajich, jason@bioperl.org | |
| 59 | |
| 60 =head1 APPENDIX | |
| 61 | |
| 62 The rest of the documentation details each of the object | |
| 63 methods. Internal methods are usually preceded with a _ | |
| 64 | |
| 65 =cut | |
| 66 | |
| 67 # Let the code begin... | |
| 68 | |
| 69 package Bio::SeqIO::gcg; | |
| 70 use vars qw(@ISA); | |
| 71 use strict; | |
| 72 | |
| 73 use Bio::SeqIO; | |
| 74 use Bio::Seq::SeqFactory; | |
| 75 | |
| 76 @ISA = qw(Bio::SeqIO); | |
| 77 | |
| 78 sub _initialize { | |
| 79 my($self,@args) = @_; | |
| 80 $self->SUPER::_initialize(@args); | |
| 81 if( ! defined $self->sequence_factory ) { | |
| 82 $self->sequence_factory(new Bio::Seq::SeqFactory | |
| 83 (-verbose => $self->verbose(), | |
| 84 -type => 'Bio::Seq::RichSeq')); | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 =head2 next_seq | |
| 89 | |
| 90 Title : next_seq | |
| 91 Usage : $seq = $stream->next_seq() | |
| 92 Function: returns the next sequence in the stream | |
| 93 Returns : Bio::Seq object | |
| 94 Args : | |
| 95 | |
| 96 =cut | |
| 97 | |
| 98 sub next_seq { | |
| 99 my ($self,@args) = @_; | |
| 100 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len); | |
| 101 | |
| 102 while( defined($_ = $self->_readline()) ) { | |
| 103 | |
| 104 ## Get the descriptive info (anything before the line with '..') | |
| 105 unless( /\.\.$/ ) { $desc.= $_; } | |
| 106 ## Pull ID, Checksum & Type from the line containing '..' | |
| 107 /\.\.$/ && do { $line = $_; chomp; | |
| 108 if(/Check\:\s(\d+)\s/) { $chksum = $1; } | |
| 109 if(/Type:\s(\w)\s/) { $type = $1; } | |
| 110 if(/(\S+)\s+Length/) | |
| 111 { $id = $1; } | |
| 112 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ ) | |
| 113 { $len = $1; $date = $2;} | |
| 114 last; | |
| 115 } | |
| 116 } | |
| 117 return if ( !defined $_); | |
| 118 chomp($desc); # remove last "\n" | |
| 119 | |
| 120 while( defined($_ = $self->_readline()) ) { | |
| 121 | |
| 122 ## This is where we grab the sequence info. | |
| 123 | |
| 124 if( /\.\.$/ ) { | |
| 125 $self->throw("Looks like start of another sequence. See documentation. "); | |
| 126 } | |
| 127 | |
| 128 next if($_ eq "\n"); ## skip whitespace lines in formatted seq | |
| 129 s/[^a-zA-Z]//g; ## remove anything that is not alphabet char | |
| 130 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL | |
| 131 $sequence .= $_; | |
| 132 } | |
| 133 ##If we parsed out a checksum, we might as well test it | |
| 134 | |
| 135 if(defined $chksum) { | |
| 136 unless(_validate_checksum($sequence,$chksum)) { | |
| 137 $self->throw("Checksum failure on parsed sequence."); | |
| 138 } | |
| 139 } | |
| 140 | |
| 141 ## Remove whitespace from identifier because the constructor | |
| 142 ## will throw a warning otherwise... | |
| 143 if(defined $id) { $id =~ s/\s+//g;} | |
| 144 | |
| 145 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate | |
| 146 ## keyword that the constructor expects... | |
| 147 if(defined $type) { | |
| 148 if($type eq "N") { $type = "dna"; } | |
| 149 if($type eq "P") { $type = "prot"; } | |
| 150 } | |
| 151 | |
| 152 return $self->sequence_factory->create(-seq => $sequence, | |
| 153 -id => $id, | |
| 154 -desc => $desc, | |
| 155 -type => $type, | |
| 156 -dates => [ $date ] | |
| 157 ); | |
| 158 } | |
| 159 | |
| 160 =head2 write_seq | |
| 161 | |
| 162 Title : write_seq | |
| 163 Usage : $stream->write_seq(@seq) | |
| 164 Function: writes the formatted $seq object into the stream | |
| 165 Returns : 1 for success and 0 for error | |
| 166 Args : array of Bio::PrimarySeqI object | |
| 167 | |
| 168 | |
| 169 =cut | |
| 170 | |
| 171 sub write_seq { | |
| 172 my ($self,@seq) = @_; | |
| 173 for my $seq (@seq) { | |
| 174 $self->throw("Did not provide a valid Bio::PrimarySeqI object") | |
| 175 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); | |
| 176 | |
| 177 my $str = $seq->seq; | |
| 178 my $comment = $seq->desc; | |
| 179 my $id = $seq->id; | |
| 180 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P'; | |
| 181 my $timestamp; | |
| 182 | |
| 183 if( $seq->can('get_dates') ) { | |
| 184 ($timestamp) = $seq->get_dates; | |
| 185 } else { | |
| 186 $timestamp = localtime(time); | |
| 187 } | |
| 188 my($sum,$offset,$len,$i,$j,$cnt,@out); | |
| 189 | |
| 190 $len = length($str); | |
| 191 ## Set the offset if we have any non-standard numbering going on | |
| 192 $offset=1; | |
| 193 # checksum | |
| 194 $sum = $self->GCG_checksum($seq); | |
| 195 | |
| 196 #Output the sequence header info | |
| 197 push(@out,"$comment\n"); | |
| 198 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n"); | |
| 199 | |
| 200 #Format the sequence | |
| 201 $i = $#out + 1; | |
| 202 for($j = 0 ; $j < $len ; ) { | |
| 203 if( $j % 50 == 0) { | |
| 204 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering | |
| 205 } | |
| 206 $out[$i] .= sprintf("%s",substr($str,$j,10)); | |
| 207 $j += 10; | |
| 208 if( $j < $len && $j % 50 != 0 ) { | |
| 209 $out[$i] .= " "; | |
| 210 }elsif($j % 50 == 0 ) { | |
| 211 $out[$i++] .= "\n\n"; | |
| 212 } | |
| 213 } | |
| 214 local($^W) = 0; | |
| 215 if($j % 50 != 0 ) { | |
| 216 $out[$i] .= "\n"; | |
| 217 } | |
| 218 $out[$i] .= "\n"; | |
| 219 return unless $self->_print(@out); | |
| 220 } | |
| 221 | |
| 222 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 223 return 1; | |
| 224 } | |
| 225 | |
| 226 =head2 GCG_checksum | |
| 227 | |
| 228 Title : GCG_checksum | |
| 229 Usage : $cksum = $gcgio->GCG_checksum($seq); | |
| 230 Function : returns a gcg checksum for the sequence specified | |
| 231 | |
| 232 This method can also be called as a class method. | |
| 233 Example : | |
| 234 Returns : a GCG checksum string | |
| 235 Argument : a Bio::PrimarySeqI implementing object | |
| 236 | |
| 237 =cut | |
| 238 | |
| 239 sub GCG_checksum { | |
| 240 my ($self,$seqobj) = @_; | |
| 241 my $index = 0; | |
| 242 my $checksum = 0; | |
| 243 my $char; | |
| 244 | |
| 245 my $seq = $seqobj->seq(); | |
| 246 $seq =~ tr/a-z/A-Z/; | |
| 247 | |
| 248 foreach $char ( split(/[\.\-]*/, $seq)) { | |
| 249 $index++; | |
| 250 $checksum += ($index * (unpack("c",$char) || 0) ); | |
| 251 if( $index == 57 ) { | |
| 252 $index = 0; | |
| 253 } | |
| 254 } | |
| 255 | |
| 256 return ($checksum % 10000); | |
| 257 } | |
| 258 | |
| 259 =head2 _validate_checksum | |
| 260 | |
| 261 Title : _validate_checksum | |
| 262 Usage : n/a - internal method | |
| 263 Function: if parsed gcg sequence contains a checksum field | |
| 264 : we compare it to a value computed here on the parsed | |
| 265 : sequence. A checksum mismatch would indicate some | |
| 266 : type of parsing failure occured. | |
| 267 : | |
| 268 Returns : 1 for success, 0 for failure | |
| 269 Args : string containing parsed seq, value of parsed cheksum | |
| 270 | |
| 271 | |
| 272 =cut | |
| 273 | |
| 274 sub _validate_checksum { | |
| 275 my($seq,$parsed_sum) = @_; | |
| 276 my($i,$len,$computed_sum,$cnt); | |
| 277 | |
| 278 $len = length($seq); | |
| 279 | |
| 280 #Generate the GCG Checksum value | |
| 281 | |
| 282 for($i=0; $i<$len ;$i++) { | |
| 283 $cnt++; | |
| 284 $computed_sum += $cnt * ord(substr($seq,$i,1)); | |
| 285 ($cnt == 57) && ($cnt=0); | |
| 286 } | |
| 287 $computed_sum %= 10000; | |
| 288 | |
| 289 ## Compare and decide if success or failure | |
| 290 | |
| 291 if($parsed_sum == $computed_sum) { | |
| 292 return 1; | |
| 293 } else { return 0; } | |
| 294 | |
| 295 | |
| 296 } | |
| 297 | |
| 298 1; |
