Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/fasta.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: fasta.pm,v 1.41.2.4 2003/09/18 02:43:16 jason Exp $ | |
| 2 # BioPerl module for Bio::SeqIO::fasta | |
| 3 # | |
| 4 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
| 5 # and Lincoln Stein <lstein@cshl.org> | |
| 6 # | |
| 7 # Copyright Ewan Birney & Lincoln Stein | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # _history | |
| 11 # October 18, 1999 Largely rewritten by Lincoln Stein | |
| 12 | |
| 13 # POD documentation - main docs before the code | |
| 14 | |
| 15 =head1 NAME | |
| 16 | |
| 17 Bio::SeqIO::fasta - fasta sequence input/output stream | |
| 18 | |
| 19 =head1 SYNOPSIS | |
| 20 | |
| 21 Do not use this module directly. Use it via the Bio::SeqIO class. | |
| 22 | |
| 23 =head1 DESCRIPTION | |
| 24 | |
| 25 This object can transform Bio::Seq objects to and from fasta flat | |
| 26 file databases. | |
| 27 | |
| 28 A method L<preferred_id_type()> can be used to specify the type of ID | |
| 29 we would like to parse from the fasta line. By default 'display' is | |
| 30 used, which means it parses everything from the '>' to the first space | |
| 31 and makes that the 'display_id' for the sequence. | |
| 32 | |
| 33 Can be one of: | |
| 34 - accession | |
| 35 - accession.version | |
| 36 - display | |
| 37 - primary | |
| 38 | |
| 39 =head1 FEEDBACK | |
| 40 | |
| 41 =head2 Mailing Lists | |
| 42 | |
| 43 User feedback is an integral part of the evolution of this and other | |
| 44 Bioperl modules. Send your comments and suggestions preferably to one | |
| 45 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 46 | |
| 47 bioperl-l@bioperl.org - General discussion | |
| 48 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 49 | |
| 50 =head2 Reporting Bugs | |
| 51 | |
| 52 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 53 the bugs and their resolution. Bug reports can be submitted via the | |
| 54 web: | |
| 55 | |
| 56 http://bugzilla.bioperl.org/ | |
| 57 | |
| 58 =head1 AUTHORS - Ewan Birney & Lincoln Stein | |
| 59 | |
| 60 Email: birney@ebi.ac.uk | |
| 61 lstein@cshl.org | |
| 62 | |
| 63 =head1 CONTRIBUTORS | |
| 64 | |
| 65 Jason Stajich, jason-at-bioperl.org | |
| 66 | |
| 67 =head1 APPENDIX | |
| 68 | |
| 69 The rest of the documentation details each of the object | |
| 70 methods. Internal methods are usually preceded with a _ | |
| 71 | |
| 72 =cut | |
| 73 | |
| 74 # Let the code begin... | |
| 75 | |
| 76 package Bio::SeqIO::fasta; | |
| 77 use vars qw(@ISA $WIDTH @SEQ_ID_TYPES $DEFAULT_SEQ_ID_TYPE); | |
| 78 use strict; | |
| 79 # Object preamble - inherits from Bio::Root::Object | |
| 80 | |
| 81 use Bio::SeqIO; | |
| 82 use Bio::Seq::SeqFactory; | |
| 83 use Bio::Seq::SeqFastaSpeedFactory; | |
| 84 | |
| 85 @ISA = qw(Bio::SeqIO); | |
| 86 | |
| 87 @SEQ_ID_TYPES = qw(accession accession.version display primary); | |
| 88 $DEFAULT_SEQ_ID_TYPE = 'display'; | |
| 89 | |
| 90 BEGIN { $WIDTH = 60} | |
| 91 | |
| 92 sub _initialize { | |
| 93 my($self,@args) = @_; | |
| 94 $self->SUPER::_initialize(@args); | |
| 95 my ($width) = $self->_rearrange([qw(WIDTH)], @args); | |
| 96 $width && $self->width($width); | |
| 97 unless ( defined $self->sequence_factory ) { | |
| 98 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new()); | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 =head2 next_seq | |
| 103 | |
| 104 Title : next_seq | |
| 105 Usage : $seq = $stream->next_seq() | |
| 106 Function: returns the next sequence in the stream | |
| 107 Returns : Bio::Seq object | |
| 108 Args : NONE | |
| 109 | |
| 110 =cut | |
| 111 | |
| 112 sub next_seq { | |
| 113 my( $self ) = @_; | |
| 114 my $seq; | |
| 115 my $alphabet; | |
| 116 local $/ = "\n>"; | |
| 117 return unless my $entry = $self->_readline; | |
| 118 | |
| 119 chomp($entry); | |
| 120 if ($entry =~ m/\A\s*\Z/s) { # very first one | |
| 121 return unless $entry = $self->_readline; | |
| 122 chomp($entry); | |
| 123 } | |
| 124 $entry =~ s/^>//; | |
| 125 | |
| 126 my ($top,$sequence) = split(/\n/,$entry,2); | |
| 127 defined $sequence && $sequence =~ s/>//g; | |
| 128 # my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s | |
| 129 # or $self->throw("Can't parse fasta entry"); | |
| 130 | |
| 131 my ($id,$fulldesc); | |
| 132 if( $top =~ /^\s*(\S+)\s*(.*)/ ) { | |
| 133 ($id,$fulldesc) = ($1,$2); | |
| 134 } | |
| 135 | |
| 136 if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space | |
| 137 # between > and name \AE | |
| 138 defined $sequence && $sequence =~ s/\s//g; # Remove whitespace | |
| 139 | |
| 140 # for empty sequences we need to know the mol.type | |
| 141 $alphabet = $self->alphabet(); | |
| 142 if(defined $sequence && length($sequence) == 0) { | |
| 143 if(! defined($alphabet)) { | |
| 144 # let's default to dna | |
| 145 $alphabet = "dna"; | |
| 146 } | |
| 147 } else { | |
| 148 # we don't need it really, so disable | |
| 149 $alphabet = undef; | |
| 150 } | |
| 151 | |
| 152 $seq = $self->sequence_factory->create( | |
| 153 -seq => $sequence, | |
| 154 -id => $id, | |
| 155 # Ewan's note - I don't think this healthy | |
| 156 # but obviously to taste. | |
| 157 #-primary_id => $id, | |
| 158 -desc => $fulldesc, | |
| 159 -alphabet => $alphabet, | |
| 160 -direct => 1, | |
| 161 ); | |
| 162 | |
| 163 | |
| 164 | |
| 165 | |
| 166 # if there wasn't one before, set the guessed type | |
| 167 unless ( defined $alphabet ) { | |
| 168 $self->alphabet($seq->alphabet()); | |
| 169 } | |
| 170 return $seq; | |
| 171 | |
| 172 } | |
| 173 | |
| 174 =head2 write_seq | |
| 175 | |
| 176 Title : write_seq | |
| 177 Usage : $stream->write_seq(@seq) | |
| 178 Function: writes the $seq object into the stream | |
| 179 Returns : 1 for success and 0 for error | |
| 180 Args : array of 1 to n Bio::PrimarySeqI objects | |
| 181 | |
| 182 | |
| 183 =cut | |
| 184 | |
| 185 sub write_seq { | |
| 186 my ($self,@seq) = @_; | |
| 187 my $width = $self->width; | |
| 188 foreach my $seq (@seq) { | |
| 189 $self->throw("Did not provide a valid Bio::PrimarySeqI object") | |
| 190 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI'); | |
| 191 | |
| 192 my $str = $seq->seq; | |
| 193 my $top; | |
| 194 | |
| 195 # Allow for different ids | |
| 196 my $id_type = $self->preferred_id_type; | |
| 197 if( $id_type =~ /^acc/i ) { | |
| 198 $top = $seq->accession_number(); | |
| 199 if( $id_type =~ /vers/i ) { | |
| 200 $top .= "." . $seq->version(); | |
| 201 } | |
| 202 } elsif($id_type =~ /^displ/i ) { | |
| 203 $top = $seq->display_id(); | |
| 204 } elsif($id_type =~ /^pri/i ) { | |
| 205 $top = $seq->primary_id(); | |
| 206 } | |
| 207 | |
| 208 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
| 209 $desc =~ s/\n//g; | |
| 210 $top .= " $desc"; | |
| 211 } | |
| 212 if(length($str) > 0) { | |
| 213 $str =~ s/(.{1,$width})/$1\n/g; | |
| 214 } else { | |
| 215 $str = "\n"; | |
| 216 } | |
| 217 $self->_print (">",$top,"\n",$str) or return; | |
| 218 } | |
| 219 | |
| 220 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 221 return 1; | |
| 222 } | |
| 223 | |
| 224 =head2 width | |
| 225 | |
| 226 Title : width | |
| 227 Usage : $obj->width($newval) | |
| 228 Function: Get/Set the line width for FASTA output | |
| 229 Returns : value of width | |
| 230 Args : newvalue (optional) | |
| 231 | |
| 232 | |
| 233 =cut | |
| 234 | |
| 235 sub width{ | |
| 236 my ($self,$value) = @_; | |
| 237 if( defined $value) { | |
| 238 $self->{'width'} = $value; | |
| 239 } | |
| 240 return $self->{'width'} || $WIDTH; | |
| 241 } | |
| 242 | |
| 243 =head2 preferred_id_type | |
| 244 | |
| 245 Title : preferred_id_type | |
| 246 Usage : $obj->preferred_id_type('accession') | |
| 247 Function: Get/Set the preferred type of identifier to use in the ">ID" position | |
| 248 for FASTA output. | |
| 249 Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES. | |
| 250 Default = $Bio::SeqIO::fasta::DEFAULT_SEQ_ID_TYPE ('display'). | |
| 251 Args : string when setting. This must be one of values defined in | |
| 252 @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values: | |
| 253 accession, accession.version, display, primary | |
| 254 Throws : fatal exception if the supplied id type is not in @SEQ_ID_TYPES. | |
| 255 | |
| 256 =cut | |
| 257 | |
| 258 sub preferred_id_type { | |
| 259 my ($self,$type) = @_; | |
| 260 if( defined $type ) { | |
| 261 if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) { | |
| 262 $self->throw(-class=>'Bio::Root::BadParameter', | |
| 263 -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES"); | |
| 264 } | |
| 265 $self->{'_seq_id_type'} = lc($type); | |
| 266 # print STDERR "Setting preferred_id_type=$type\n"; | |
| 267 } | |
| 268 $self->{'_seq_id_type'} || $DEFAULT_SEQ_ID_TYPE; | |
| 269 } | |
| 270 | |
| 271 1; |
