annotate variant_effect_predictor/Bio/SeqIO/largefasta.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 # $Id: largefasta.pm,v 1.18 2002/12/27 19:42:32 birney Exp $
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 # BioPerl module for Bio::SeqIO::largefasta
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 # Cared for by Jason Stajich
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 # Copyright Jason Stajich
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 # You may distribute this module under the same terms as perl itself
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 # _history
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10 #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 # POD documentation - main docs before the code
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15 Bio::SeqIO::largefasta - method i/o on very large fasta sequence files
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 =head1 SYNOPSIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 Do not use this module directly. Use it via the Bio::SeqIO class.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 This object can transform Bio::Seq objects to and from fasta flat
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24 file databases.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26 This module handles very large sequence files by using the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 Bio::Seq::LargePrimarySeq module to store all the sequence data in
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 a file. This can be a problem if you have limited disk space on your
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 computer because this will effectively cause 2 copies of the sequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30 file to reside on disk for the life of the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 Bio::Seq::LargePrimarySeq object. The default location for this is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 specified by the L<File::Spec>-E<gt>tmpdir routine which is usually /tmp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 on UNIX. If a sequence file is larger than the swap space (capacity
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 of the /tmp dir) this could cause problems for the machine. It is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 possible to set the directory where the temporary file is located by
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 adding the following line to your code BEFORE calling next_seq. See
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 L<Bio::Seq::LargePrimarySeq> for more information.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 $Bio::Seq::LargePrimarySeq::DEFAULT_TEMP_DIR = 'newdir';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 =head1 FEEDBACK
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 =head2 Mailing Lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 User feedback is an integral part of the evolution of this and other
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 Bioperl modules. Send your comments and suggestions preferably to one
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 of the Bioperl mailing lists. Your participation is much appreciated.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49 bioperl-l@bioperl.org - General discussion
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 http://bioperl.org/MailList.shtml - About the mailing lists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 =head2 Reporting Bugs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 Report bugs to the Bioperl bug tracking system to help us keep track
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 of the bugs and their resolution. Bug reports can be submitted via email
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 or the web:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 bioperl-bugs@bio.perl.org
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 http://bugzilla.bioperl.org/
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61 =head1 AUTHORS - Jason Stajich
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 Email: jason@bioperl.org
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 =head1 APPENDIX
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 The rest of the documentation details each of the object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 methods. Internal methods are usually preceded with a _
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 # Let the code begin...
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 package Bio::SeqIO::largefasta;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75 use vars qw(@ISA $FASTALINELEN);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 # Object preamble - inherits from Bio::SeqIO
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79 use Bio::SeqIO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 use Bio::Seq::SeqFactory;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Number of residues write_seq puts on each sequence line of its output.
82 $FASTALINELEN = 60;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Inherit from Bio::SeqIO; _initialize below chains to it via SUPER.
83 @ISA = qw(Bio::SeqIO);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Initialisation hook: passes the arguments on to the parent class's
# _initialize, then installs a default sequence factory that builds
# Bio::Seq::LargePrimarySeq objects.  A factory that is already set is
# left untouched.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    if ( !defined $self->sequence_factory ) {
        # Direct method call rather than the indirect "new Class (...)"
        # form, which perl can mis-parse.
        $self->sequence_factory(
            Bio::Seq::SeqFactory->new(
                -verbose => $self->verbose(),
                -type    => 'Bio::Seq::LargePrimarySeq',
            )
        );
    }
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 =head2 next_seq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 Title : next_seq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 Usage : $seq = $stream->next_seq()
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 Function: returns the next sequence in the stream
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 Returns : Bio::Seq object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 Args : NONE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Read the next FASTA record from the stream.
#
# Returns the object created by the stream's sequence factory, with
# display_id, primary_id and desc taken from the header line and every
# following line (whitespace removed) appended as sequence data.
# Returns nothing (undef in scalar context, the empty list in list
# context) when the input ends before any header line has been seen.
sub next_seq {
    my ($self) = @_;
    my $largeseq = $self->sequence_factory->create();
    my $count    = 0;    # lines handled so far; only drives progress output
    my $seen     = 0;    # true once this record's header has been read

    while ( defined( my $entry = $self->_readline ) ) {

        # A header line after our own header starts the next record:
        # push it back for the following call and finish this record.
        if ( $seen && $entry =~ /^\s*>/ ) {
            $self->_pushback($entry);
            return $largeseq;
        }

        # Header with no id at all.  Matched with a regex so that a line
        # which still carries its trailing newline is recognised too;
        # a plain string comparison against '>' would miss it and the
        # '>' would end up in the sequence data.
        if ( $entry =~ /^\s*>$/ ) {
            $seen = 1;
            next;
        }
        elsif ( $entry =~ /\s*>(.+?)$/ ) {
            $seen = 1;
            my $header = $1;

            # First word is the id, the remainder is the description.
            if ( my ($id, $fulldesc) = $header =~ /^\s*(\S+)\s*(.*)$/ ) {
                $largeseq->display_id($id);
                $largeseq->primary_id($id);
                $largeseq->desc($fulldesc);
            }
            else {
                # Nothing usable in the header; leave the fields unset.
                $self->warn("Can't parse fasta header");
            }
        }
        else {
            # Sequence line: drop all whitespace, including the newline.
            $entry =~ s/\s+//g;
            $largeseq->add_sequence_as_string($entry);
        }

        # Progress report every 1000 lines when verbosity is enabled.
        $count++;
        if ( $count % 1000 == 0 && $self->verbose() > 0 ) {
            print "line $count\n";
        }
    }

    # End of input.  Bare return so that list-context callers receive an
    # empty list rather than a one-element list holding undef.
    return unless $seen;
    return $largeseq;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 =head2 write_seq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138 Title : write_seq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 Usage : $stream->write_seq(@seq)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 Function: writes the $seq object into the stream
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 Returns : 1 on success
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 Args : Bio::Seq object
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Write one or more sequence objects to the stream in FASTA format.
#
# For each object a ">id description" header line is printed (the
# description is appended only when the object has a desc method that
# returns a true value; newlines are removed from it), followed by the
# sequence in lines of at most $FASTALINELEN residues fetched with
# subseq.  The stream is flushed afterwards when flush-on-write is
# enabled and a filehandle is defined.
#
# Args    : list of sequence objects
# Returns : 1
sub write_seq {
    my ($self, @seq) = @_;

    foreach my $seq (@seq) {
        my $top = $seq->id();
        if ( $seq->can('desc') ) {
            my $desc = $seq->desc();
            if ($desc) {
                $desc =~ s/\n//g;
                $top .= " $desc";
            }
        }
        $self->_print( ">", $top, "\n" );

        my $end   = $seq->length();
        my $start = 1;

        # "<=" rather than "<": coordinates are 1-based and inclusive, so
        # $start == $end still leaves one residue to write.  With "<" a
        # sequence of length 1, 61, 121, ... lost its final residue.
        while ( $start <= $end ) {
            my $stop = $start + $FASTALINELEN - 1;
            $stop = $end if $stop > $end;
            $self->_print( $seq->subseq( $start, $stop ), "\n" );
            $start += $FASTALINELEN;
        }
    }

    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 1;