annotate variant_effect_predictor/Bio/DB/Fasta.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1 =head1 NAME
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
2
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
5 =head1 SYNOPSIS
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
6
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
7 use Bio::DB::Fasta;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
8
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
9 # create database from directory of fasta files
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
11
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
12 # simple access (for those without Bioperl)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
15 my @ids = $db->ids;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
16 my $length = $db->length('CHROMOSOME_I');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
17 my $alphabet = $db->alphabet('CHROMOSOME_I');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
18 my $header = $db->header('CHROMOSOME_I');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
19
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
20 # Bioperl-style access
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
22
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
24 my $seq = $obj->seq;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
25 my $subseq = $obj->subseq(4_000_000 => 4_100_000);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
26 my $length = $obj->length;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
27 # (etc)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
28
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
29 # Bio::SeqIO-style access
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
31 while (my $seq = $stream->next_seq) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
32 # Bio::PrimarySeqI stuff
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
33 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
34
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
36 while (my $seq = <$fh>) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
37 # Bio::PrimarySeqI stuff
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
38 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
39
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
40 # tied hash access
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
42 print $sequences{'CHROMOSOME_I:1,20000'};
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
43
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
44 =head1 DESCRIPTION
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
45
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
47 provides random access to each sequence entry, and to subsequences
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
48 within each entry, allowing you to retrieve portions of very large
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
49 sequences without bringing the entire sequence into memory.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
50
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
51 When you initialize the module, you point it at a single fasta file or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
52 a directory of multiple such files. The first time it is run, the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
53 module generates an index of the contents of the file or directory
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
54 using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
56 the file and offset for any requested sequence. If one of the source
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
57 fasta files is updated, the module reindexes just that one file. (You
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
58 can also force reindexing manually). For improved performance, the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
59 module keeps a cache of open filehandles, closing less-recently used
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
60 ones when the cache is full.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
61
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
62 The fasta files may contain any combination of nucleotide and protein
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
63 sequences; during indexing the module guesses the molecular type.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
64 Entries may have any line length, and different line lengths are
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
65 allowed in the same file. However, within a sequence entry, all lines
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
66 must be the same length except for the last.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
67
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
69 the Fasta header. During indexing, you may pass a callback routine to
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
70 modify this primary ID. For example, you may wish to extract a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
72 The original header line can be recovered later.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
73
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
74 This module was developed for use with the C. elegans and human
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
75 genomes, and has been tested with sequence segments as large as 20
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
76 megabases. Indexing the C. elegans genome (100 megabases of genomic
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
78 laptop. On the same system, average access time for any 200-mer within
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
79 the C. elegans genome was E<lt>0.02s.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
80
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
81 =head1 DATABASE CREATION AND INDEXING
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
82
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
83 The two constructors for this class are new() and newFh(). The former
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
84 creates a Bio::DB::Fasta object which is accessed via method calls.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
85 The latter creates a tied filehandle which can be used Bio::SeqIO
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
86 style to fetch sequence objects in a stream fashion. There is also a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
87 tied hash interface.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
88
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
89 =over 4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
90
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
92
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
93 Create a new Bio::DB::Fasta object from the Fasta file or files
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
94 indicated by $fasta_path. Indexing will be performed automatically if
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
95 needed. If successful, new() will return the database accessor
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
96 object. Otherwise it will return undef.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
97
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
98 $fasta_path may be an individual Fasta file, or may refer to a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
99 directory containing one or more of such files. Following the path,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
100 you may pass a series of name=E<gt>value options or a hash with these
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
101 same name=E<gt>value pairs. Valid options are:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
102
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
103 Option Name Description Default
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
104 ----------- ----------- -------
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
105
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
107 for searching for Fasta
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
108 files in directories.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
109
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
110 -makeid A code subroutine for None
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
111 transforming Fasta IDs.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
112
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
113 -maxopen Maximum size of 32
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
114 filehandle cache.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
115
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
116 -debug Turn on status 0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
117 messages.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
118
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
119 -reindex Force the index to be 0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
120 rebuilt.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
121
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
122 -dbmargs Additional arguments none
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
123 to pass to the DBM
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
124 routines when tied
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
125 (scalar or array ref).
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
126
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
127 -dbmargs can be used to control the format of the index. For example,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
128 you can pass $DB_BTREE to this argument so as to force the IDs to be
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
129 sorted and retrieved alphabetically. Note that you must use the same
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
130 arguments every time you open the index!
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
131
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
132 -reindex can be used to force the index to be recreated from scratch.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
133
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
135
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
138 Bio::SeqIO style.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
139
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
140 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
141
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
142 The -makeid option gives you a chance to modify sequence IDs during
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
143 indexing. The option's value should be a code reference that will
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
144 take a scalar argument and return a scalar result, like this:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
145
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
147
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
148 sub make_my_id {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
149 my $description_line = shift;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
150 # get a new id from the fasta header
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
151 return $new_id;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
152 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
153
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
154 make_my_id() will be called with the full fasta id line (including the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
155 "E<gt>" symbol!). For example:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
156
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
157 >A12345.3 Predicted C. elegans protein egl-2
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
158
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
159 By default, this module will use the regular expression /^E<gt>(\S+)/
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
160 to extract "A12345.3" for use as the ID. If you pass a -makeid
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
161 callback, you can extract any portion of this, such as the "egl-2"
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
162 symbol.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
163
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
164 The -makeid option is ignored after the index is constructed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
165
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
166 =head1 OBJECT METHODS
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
167
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
168 The following object methods are provided.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
169
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
170 =over 4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
171
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
173
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
174 Return the raw sequence (a string) given an ID and optionally a start
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
175 and stop position in the sequence. In the case of DNA sequence, if
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
176 $stop is less than $start, then the reverse complement of the sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
177 is returned (this violates Bio::Seq conventions).
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
178
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
179 For your convenience, subsequences can be indicated with this compound
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
180 ID:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
181
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
182 $db->seq("$id:$start,$stop")
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
183
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
184 =item $length = $db-E<gt>length($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
185
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
186 Return the length of the indicated sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
187
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
188 =item $header = $db-E<gt>header($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
189
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
190 Return the header line for the ID, including the initial "E<gt>".
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
191
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
192 =item $type = $db-E<gt>alphabet($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
193
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
194 Return the molecular type of the indicated sequence. One of "dna",
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
195 "rna" or "protein".
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
196
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
197 =item $filename = $db-E<gt>file($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
198
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
199 Return the name of the file in which the indicated sequence can be
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
200 found.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
201
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
202 =item $offset = $db-E<gt>offset($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
203
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
204 Return the offset of the indicated sequence from the beginning of the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
205 file in which it is located. The offset points to the beginning of
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
206 the sequence, not the beginning of the header line.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
207
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
208 =item $header_length = $db-E<gt>headerlen($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
209
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
210 Return the length of the header line for the indicated sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
211
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
212 =item $header_offset = $db-E<gt>header_offset($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
213
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
214 Return the offset of the header line for the indicated sequence from
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
215 the beginning of the file in which it is located.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
216
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
217 =item $index_name = $db-E<gt>index_name
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
218
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
219 Return the path to the index file.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
220
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
221 =item $path = $db-E<gt>path
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
222
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
223 Return the path to the Fasta file(s).
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
224
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
225 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
226
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
227 For BioPerl-style access, the following methods are provided:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
228
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
229 =over 4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
230
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
231 =item $seq = $db-E<gt>get_Seq_by_id($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
232
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
233 Return a Bio::PrimarySeq::Fasta object, which obeys the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
235 protein sequence, call $seq-E<gt>seq().
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
236
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
237 Note that get_Seq_by_id() does not bring the entire sequence into
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
238 memory until requested. Internally, the returned object uses the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
239 accessor to generate subsequences as needed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
240
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
241 =item $seq = $db-E<gt>get_Seq_by_acc($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
242
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
244
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
245 These methods all do the same thing as get_Seq_by_id().
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
246
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
247 =item $stream = $db-E<gt>get_PrimarySeq_stream()
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
248
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
249 Return a Bio::DB::Fasta::Stream object, which supports a single method
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
250 next_seq(). Each call to next_seq() returns a new
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
251 Bio::PrimarySeq::Fasta object, until no more sequences remain.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
252
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
253 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
254
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
256 returned from get_Seq_by_id() and get_PrimarySeq_stream().
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
257
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
258 =head1 TIED INTERFACES
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
259
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
260 This module provides two tied interfaces, one which allows you to
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
261 treat the sequence database as a hash, and the other which allows you
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
262 to treat the database as an I/O stream.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
263
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
264 =head2 Creating a Tied Hash
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
265
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
266 The tied hash interface is very straightforward.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
267
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
268 =over 4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
269
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
271
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
273 The optional @args list is the same set of named argument/value pairs
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
274 used by Bio::DB::Fasta-E<gt>new().
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
275
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
276 If successful, tie() will return the tied object. Otherwise it will
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
277 return undef.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
278
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
279 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
280
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
281 Once tied, you can use the hash to retrieve an individual sequence by
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
282 its ID, like this:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
283
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
284 my $seq = $db{CHROMOSOME_I};
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
285
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
286 You may select a subsequence by appending the comma-separated range to
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
287 the sequence ID in the format "$id:$start,$stop". For example, here
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
289
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
290 my $seq = $db{'CHROMOSOME_I:1,1000'};
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
291
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
292 (The regular expression used to parse this format allows sequence IDs
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
293 to contain colons.)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
294
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
295 When selecting subsequences, if $start E<gt> $stop, then the reverse
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
296 complement will be returned for DNA sequences.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
297
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
298 The keys() and values() functions will return the sequence IDs and
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
299 their sequences, respectively. In addition, each() can be used to
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
300 iterate over the entire data set:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
301
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
302 while (my ($id,$sequence) = each %db) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
303 print "$id => $sequence\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
304 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
305
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
306 When dealing with very large sequences, you can avoid bringing them
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
307 into memory by calling each() in a scalar context. This returns the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
309 object and call its methods.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
310
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
311 while (my $id = each %db) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
312 print "$id => ",$db{"$id:1,100"},"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
313 print "$id => ",tied(%db)->length($id),"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
314 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
315
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
316 You may, in addition, invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
317 hash methods directly.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
318
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
319 =over 4
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
320
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
321 =item $id = $db-E<gt>FIRSTKEY
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
322
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
323 Return the first ID in the database.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
324
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
325 =item $id = $db-E<gt>NEXTKEY($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
326
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
327 Given an ID, return the next ID in sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
328
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
329 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
330
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
331 This allows you to write the following iterative loop using just the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
332 object-oriented interface:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
333
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
336 # do something with sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
337 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
338
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
339 =head2 Creating a Tied Filehandle
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
340
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
342 which you can read Bio::PrimarySeq::Fasta sequence objects
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
343 sequentially. The following bit of code will iterate sequentially
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
344 over all sequences in the database:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
345
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
347 while (my $seq = <$fh>) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
348 print $seq->id,' => ',$seq->length,"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
349 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
350
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
351 When no more sequences remain to be retrieved, the stream will return
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
352 undef.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
353
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
354 =head1 BUGS
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
355
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
356 When a sequence is deleted from one of the Fasta files, this deletion
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
357 is not detected by the module and removed from the index. As a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
358 result, a "ghost" entry will remain in the index and will return
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
359 garbage results if accessed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
360
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
361 Currently, the only way to accommodate deletions is to rebuild the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
362 entire index, either by deleting it manually, or by passing
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
363 -reindex=E<gt>1 to new() when initializing the module.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
364
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
365 =head1 SEE ALSO
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
366
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
367 L<bioperl>
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
368
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
369 =head1 AUTHOR
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
370
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
372
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
374
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
375 This library is free software; you can redistribute it and/or modify
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
376 it under the same terms as Perl itself. See DISCLAIMER.txt for
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
377 disclaimers of warranty.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
378
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
379 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
380
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
381 #'
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
382 package Bio::DB::Fasta;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
383
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
384 BEGIN {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
386 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
387
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
388 use strict;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
389 use IO::File;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
390 use AnyDBM_File;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
391 use Fcntl;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
392 use File::Basename qw(basename dirname);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
393 use Bio::DB::SeqI;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
394 use Bio::Root::Root;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
395 use vars qw($VERSION @ISA);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
396
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
398
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
399 $VERSION = '1.03';
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
400
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
401 *seq = *sequence = \&subseq;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
402 *ids = \&get_all_ids;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
404
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
405 use constant STRUCT =>'NNnnCa*';
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
406 use constant DNA => 1;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
407 use constant RNA => 2;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
408 use constant PROTEIN => 3;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
409
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
410 # Bio::DB-like object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
411 # providing fast random access to a directory of FASTA files
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
412
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
413 =head2 new
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
414
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
415 Title : new
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
416 Usage : my $db = new Bio::DB::Fasta( $path, @options);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
417 Function: initialize a new Bio::DB::Fasta object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
418 Returns : new Bio::DB::Fasta object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
419 Args : path to dir of fasta files or a single filename
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
420
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
421 These are optional arguments to pass in as well.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
422
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
423 -glob Glob expression to use *.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
424 for searching for Fasta
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
425 files in directories.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
426
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
427 -makeid A code subroutine for None
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
428 transforming Fasta IDs.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
429
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
430 -maxopen Maximum size of 32
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
431 filehandle cache.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
432
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
433 -debug Turn on status 0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
434 messages.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
435
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
436 -reindex Force the index to be 0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
437 rebuilt.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
438
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
439 -dbmargs Additional arguments none
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
440 to pass to the DBM
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
441 routines when tied
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
442 (scalar or array ref).
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
443
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
444 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
445
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Constructor. Builds the object, then indexes $path (or reuses an existing
# index) via index_dir() for a directory or index_file() for a plain file.
#
# Args   : $path - directory of fasta files, or a single fasta file
#          %opts - -glob, -makeid, -maxopen (legacy alias: -maxfh),
#                  -debug, -reindex, -dbmargs
# Returns: the new object
# Throws : through $self->throw() when $path is neither a directory nor a file
sub new {
    my $class = shift;
    my $path  = shift;
    my %opts  = @_;

    # The POD advertises -maxopen, but only -maxfh used to be consulted, so
    # the documented spelling was silently ignored.  Accept both; the
    # documented name wins when both are supplied.
    my $maxopen = $opts{-maxopen} || $opts{-maxfh} || 32;

    my $self = bless {
        debug    => $opts{-debug},
        makeid   => $opts{-makeid},
        glob     => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}',
        maxopen  => $maxopen,
        dbmargs  => $opts{-dbmargs} || undef,
        fhcache  => {},
        cacheseq => {},
        curopen  => 0,
        openseq  => 1,
        dirname  => undef,
        offsets  => undef,
    }, $class;

    my ($offsets, $dirname);
    if (-d $path) {
        $offsets = $self->index_dir($path, $opts{-reindex});
        $dirname = $path;
    } elsif (-f _) {    # "_" reuses the stat buffer filled by the -d test
        $offsets = $self->index_file($path, $opts{-reindex});
        $dirname = dirname($path);
    } else {
        $self->throw( "$path: Invalid file or dirname");
    }
    @{$self}{qw(dirname offsets)} = ($dirname, $offsets);

    return $self;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
478
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
479 =head2 newFh
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
480
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
481 Title : newFh
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
482 Function: creates a tied filehandle from which sequence objects can be read sequentially
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
483 Example : internal method
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
484 Returns : GLOB
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
485 Args :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
486
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
487 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
488
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Class method.  Constructs a database object with new(), then returns a
# freshly generated glob tied to Bio::DB::Fasta::Stream around that object.
# Returns nothing when the glob cannot be created or the tie fails.
sub newFh {
    my ($class, @ctor_args) = @_;
    my $db = $class->new(@ctor_args);

    require Symbol;
    my $handle = Symbol::gensym();
    return unless $handle;

    return unless tie $$handle, 'Bio::DB::Fasta::Stream', $db;
    return $handle;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
497
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tie a hash to the DBM file named $index and return a reference to it.
#
# Args   : $index - path of the DBM file
#          $write - true to open read/write (creating the file if needed),
#                   false to open read-only
# Throws : through $self->throw() when the tie fails
sub _open_index {
    my ($self, $index, $write) = @_;

    my $mode = O_RDONLY;
    $mode = O_CREAT|O_RDWR if $write;

    # Extra, user-supplied arguments for the DBM layer (may be empty).
    my @extra = $self->dbmargs;

    my %offsets;
    tie(%offsets, 'AnyDBM_File', $index, $mode, 0644, @extra)
        or $self->throw( "Can't open cache file: $!");
    return \%offsets;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
507
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
508 =head2 index_dir
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
509
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
510 Title : index_dir
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
511 Usage : $db->index_dir($dir)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
512 Function: set the index dir and load all files in the dir
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
513 Returns : hashref of seq offsets in each file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
514 Args : dirname, boolean to force a reload of all files
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
515
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
516 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
517
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Index every file in $dir that matches the object's glob pattern.
#
# Args   : $dir           - directory to scan
#          $force_reindex - true to delete the index and rebuild it
# Returns: the (tied) offsets hashref, also stored in $self->{offsets}
# Throws : through $self->throw() when no file matches the glob
sub index_dir {
    my ($self, $dir, $force_reindex) = @_;

    # Collect the candidate fasta files.
    my @fasta = glob("$dir/$self->{glob}");
    $self->throw( "no fasta files in $dir") unless @fasta;

    # Locate the index; a forced rebuild starts by removing it.
    my $index = $self->index_name($dir,1);
    unlink $index if $force_reindex;

    # A missing index counts as "infinitely old" (mtime 0).
    my $indextime = (stat($index))[9] || 0;

    # Record each file's mtime and remember the newest one.
    my %mtime_of;
    my $newest = 0;
    for my $file (@fasta) {
        my $mtime = (stat($file))[9];
        $mtime_of{$file} = $mtime;
        $newest = $mtime if $newest < $mtime;
    }

    my $reindex = $force_reindex || $indextime < $newest;
    my $offsets = $self->_open_index($index,$reindex) or return;
    $self->{offsets} = $offsets;

    # Index already current: nothing more to do.
    return $offsets unless $reindex;

    # Re-read only the files modified after the index was written.
    # $_ is deliberately the loop variable: foreach localises it, which
    # shields the caller's $_ from whatever the indexing code does to it.
    $self->{indexing} = $index;
    foreach (@fasta) {
        next if defined $indextime && $mtime_of{$_} <= $indextime;
        $self->calculate_offsets($_,$offsets);
    }
    delete $self->{indexing};

    return $self->{offsets};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
562
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
563 =head2 get_Seq_by_id
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
564
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
565 Title : get_Seq_by_id
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
566 Usage : my $seq = $db->get_Seq_by_id($id)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
567 Function: Bio::DB::RandomAccessI method implemented
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
568 Returns : Bio::PrimarySeqI object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
569 Args : id
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
570
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
571 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
572
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Wrap the given id in a Bio::PrimarySeq::Fasta object bound to this
# database.  No lookup is performed here; the id is passed through as is.
sub get_Seq_by_id {
    my ($self, $id) = @_;
    return Bio::PrimarySeq::Fasta->new($self, $id);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
578
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
579 =head2 index_file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
580
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
581 Title : index_file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
582 Usage : $db->index_file($filename)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
583 Function: (re)loads a sequence file and indexes sequences offsets in the file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
584 Returns : seq offsets in the file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
585 Args : filename,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
586 boolean to force reloading a file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
587
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
588 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
589
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
590
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Index a single fasta file, rebuilding the index only when it is missing,
# older than the file, or a rebuild is forced.
#
# Args   : $file          - fasta file to index
#          $force_reindex - true to delete the index and rebuild it
# Returns: the (tied) offsets hashref, also stored in $self->{offsets}
sub index_file {
    my $self          = shift;
    my $file          = shift;
    my $force_reindex = shift;

    my $index = $self->index_name($file);
    # if caller has requested reindexing, then unlink the index
    unlink $index if $force_reindex;

    # Modification times.  stat() yields nothing for a missing file, so
    # default to 0 (as index_dir does) instead of comparing undef
    # numerically; a missing index is thereby treated as older than any file.
    my $indextime = (stat($index))[9] || 0;
    my $modtime   = (stat($file))[9]  || 0;

    my $reindex = $force_reindex || $indextime < $modtime;
    my $offsets = $self->_open_index($index,$reindex) or return;
    $self->{offsets} = $offsets;

    # Index already current: nothing more to do.
    return $self->{offsets} unless $reindex;

    # $self->{indexing} names the index file while it is being written.
    $self->{indexing} = $index;
    $self->calculate_offsets($file,$offsets);
    delete $self->{indexing};
    return $self->{offsets};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
615
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
616 =head2 dbmargs
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
617
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
618 Title : dbmargs
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
619 Usage : my @args = $db->dbmargs;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
620 Function: gets stored dbm arguments
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
621 Returns : array
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
622 Args : none
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
623
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
624
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
625 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
626
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the stored DBM arguments as a list: the elements of the array
# reference if one was stored, otherwise the single stored value.
# Returns nothing when no (true) value was stored.
sub dbmargs {
    my ($self) = @_;
    my $stored = $self->{dbmargs};
    return unless $stored;
    return @$stored if ref($stored) eq 'ARRAY';
    return $stored;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
632
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
633 =head2 index_name
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
634
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
635 Title : index_name
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
636 Usage : my $indexname = $db->index_name($path,$isdir);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
637 Function: returns the name of the index for a specific path
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
638 Returns : string
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
639 Args : path to check,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
640 boolean if it is a dir
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
641
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
642 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
643
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Compute the index file name for a path.
#
# Args   : $path  - file or directory; when false, the object's own
#                   dirname is used instead
#          $isdir - true if $path is a directory
# Returns: "$path/directory.index" for a directory, "$path.index" for a
#          file; nothing when no path is given and no dirname is stored
sub index_name {
    my ($self, $path, $isdir) = @_;

    if ($path) {
        return $isdir ? "$path/directory.index" : "$path.index";
    }

    # No path supplied: fall back to the directory this object was built on.
    my $dir = $self->{dirname};
    return unless $dir;
    return $self->index_name($dir, -d $dir);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
654
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
655 =head2 calculate_offsets
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
656
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
657 Title : calculate_offsets
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
658 Usage : $db->calculate_offsets($filename,$offsets);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
659 Function: calculates the sequence offsets in a file based on id
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
660 Returns : offset hash for each file
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
661 Args : file to process
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
662 $offsets - hashref of id to offset storage
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
663
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
664 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
665
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Scan a fasta file and store one packed index record per sequence in
# $offsets, keyed by sequence id.
#
# Args   : $file    - fasta file to scan
#          $offsets - hashref receiving id => packed record
# Returns: $offsets (the hashref that was filled in)
# Throws : through $self->throw() when $file cannot be opened
#
# Changes from the previous implementation:
#  * an empty record followed by another header no longer dies with
#    "Illegal division by zero"; it is stored with length 0, exactly as an
#    empty final record already was;
#  * the populated $offsets is returned instead of an always-empty hash;
#  * $_ is localised, so the caller's $_ is no longer clobbered;
#  * CORE::length is spelled out because this package defines its own
#    length() method.
sub calculate_offsets {
    my $self = shift;
    my ($file,$offsets) = @_;
    my $base = $self->path2fileno(basename($file));

    my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
    warn "indexing $file\n" if $self->{debug};

    my ($offset,$id,$linelength,$type,$firstline);
    my $count = 0;

    # Store the record for the sequence currently being read.  $end is the
    # file position just past the last byte belonging to that sequence.
    my $store_record = sub {
        my $end = shift;
        my $seqlength = $end - $offset - 1;
        if ($linelength) {
            # $linelength includes the line terminator; remove one
            # character per complete line.
            $seqlength -= int($seqlength/$linelength);
        } else {
            # header with no sequence lines: an empty record
            $seqlength = 0;
        }
        $offsets->{$id} = $self->_pack($offset,$seqlength,
                                       $linelength,$firstline,
                                       $type,$base);
    };

    # makeid callbacks receive the header line as their argument; $_ is kept
    # as the read variable in case a callback inspects it, but localised.
    local $_;
    while (<$fh>) {
        if (/^>(\S+)/) {
            my $raw_id = $1;
            print STDERR "indexed $count sequences...\n"
                if $self->{debug} && (++$count%1000) == 0;
            my $pos = tell($fh);

            # Close the previous record; it ended where this header began.
            $store_record->($pos - CORE::length($_)) if $id;

            $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $raw_id;
            ($offset,$firstline,$linelength) = ($pos,CORE::length($_),0);
        } else {
            # The first sequence line after a header fixes the line length;
            # the type is determined once, from the first sequence line seen.
            $linelength ||= CORE::length($_);
            $type       ||= $self->_type($_);
        }
    }

    # deal with last entry
    $store_record->(tell($fh)) if $id;

    return $offsets;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
712
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
713 =head2 get_all_ids
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
714
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
715 Title : get_all_ids
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
716 Usage : my @ids = $db->get_all_ids
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
717 Function: gets all the stored ids in all indexes
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
718 Returns : list of ids
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
719 Args : none
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
720
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
721 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
722
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# List every key of the offsets index except the bookkeeping entries,
# i.e. those whose name starts with "__".
sub get_all_ids {
    my $self = shift;
    return grep { $_ !~ /^__/ } keys %{ $self->{offsets} };
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
724
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Field 0 of the unpacked index record for $id.
# Returns nothing when $id has no (true) entry in the index.
sub offset {
    my ($self, $id) = @_;
    my $packed = $self->{offsets}{$id};
    return unless $packed;
    return ($self->_unpack($packed))[0];
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
731
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Field 1 of the unpacked index record for $id.
# Returns nothing when $id has no (true) entry in the index.
sub length {
    my ($self, $id) = @_;
    my $packed = $self->{offsets}{$id};
    return unless $packed;
    return ($self->_unpack($packed))[1];
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
738
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Field 2 of the unpacked index record for $id.
# Returns nothing when $id has no (true) entry in the index.
sub linelen {
    my ($self, $id) = @_;
    my $packed = $self->{offsets}{$id};
    return unless $packed;
    return ($self->_unpack($packed))[2];
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
745
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Field 3 of the unpacked index record for $id.
# Returns nothing when $id has no (true) entry in the index.
sub headerlen {
    my ($self, $id) = @_;
    my $packed = $self->{offsets}{$id};
    return unless $packed;
    return ($self->_unpack($packed))[3];
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
752
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Translate field 4 of the unpacked index record for $id into a name:
# 'dna' for DNA, 'rna' for RNA, and 'protein' for any other value.
# Returns nothing when $id has no (true) entry in the index.
sub alphabet {
    my ($self, $id) = @_;
    my $packed = $self->{offsets}{$id};
    return unless $packed;

    my $code = ($self->_unpack($packed))[4];
    return 'dna' if $code == DNA;
    return 'rna' if $code == RNA;
    return 'protein';
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
763
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the directory name stored on the object ({dirname}).
sub path {
  my ($self) = @_;
  return $self->{dirname};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
765
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the position of the header line of record $id, computed as the
# record's offset minus its header length.  Returns nothing when $id has
# no index entry.
sub header_offset {
  my ($self, $id) = @_;
  if ($self->{offsets}{$id}) {
    my $seq_start  = $self->offset($id);
    my $header_len = $self->headerlen($id);
    return $seq_start - $header_len;
  }
  return;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
772
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the file name that holds record $id: the sixth element of the
# index record is a file number, which fileno2path() maps back to a path.
# Returns nothing when $id has no index entry.
sub file {
  my ($self, $id) = @_;
  my $packed = $self->{offsets}{$id};
  return unless $packed;
  my $fileno = ( $self->_unpack($packed) )[5];
  return $self->fileno2path($fileno);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
779
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Look up the path registered for file number $no.  The mapping lives in
# the offsets hash under the key "__file_<no>" (written by path2fileno).
sub fileno2path {
  my ($self, $no) = @_;
  my $key = "__file_$no";
  return $self->{offsets}{$key};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
785
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the file number assigned to $path, allocating the next number from
# the {fileno} counter on first sight.  Both directions of the mapping are
# stored in the offsets hash: "__path_<path>" => number and
# "__file_<number>" => path.
sub path2fileno {
  my ($self, $path) = @_;
  my $path_key = "__path_$path";

  unless (defined $self->{offsets}{$path_key}) {
    my $fileno = 0 + $self->{fileno}++;
    $self->{offsets}{$path_key}        = $fileno;
    $self->{offsets}{"__file_$fileno"} = $path;
  }

  return $self->{offsets}{$path_key};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
795
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
=head2 subseq

 Title   : subseq
 Usage   : $seqdb->subseq($id,$start,$stop);
           $seqdb->subseq("$id:$start-$stop");
 Function: returns a subseq of a sequence in the db.  Coordinates are
           1-based and inclusive; $start defaults to 1 and $stop to the
           sequence length.  If $start > $stop the reverse complement
           of the region is returned.
 Returns : subsequence data (string), or nothing if $id is not in the
           index or the underlying file cannot be read
 Args    : id of sequence, starting point, ending point.  The id may
           also carry the range itself as "id:start-stop" or
           "id:start,stop"; underscores in the numbers are ignored.

=cut

sub subseq {
  my ($self,$id,$start,$stop) = @_;

  # Compound id of the form "name:start-stop" or "name:start,stop".
  if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) {
    ($id,$start,$stop) = ($1,$2,$3);
    $start =~ s/_//g;
    $stop  =~ s/_//g;
  }
  $start ||= 1;
  $stop  ||= $self->length($id);

  # A descending range means "reverse complement"; read it ascending.
  my $reversed;
  if ($start > $stop) {
    ($start,$stop) = ($stop,$start);
    $reversed++;
  }

  my $fh        = $self->fh($id) or return;
  my $filestart = $self->caloffset($id,$start);
  my $filestop  = $self->caloffset($id,$stop);

  # Bail out instead of post-processing an undefined buffer when the
  # seek or read fails.
  my $data;
  seek($fh,$filestart,0) or return;
  defined read($fh,$data,$filestop-$filestart+1) or return;
  $data =~ s/\n//g;

  if ($reversed) {
    $data = reverse $data;
    # RNA pairs a with u; the DNA table would turn a into t and leave u
    # untouched.  Everything else keeps the original DNA table.
    if (($self->alphabet($id) || '') eq 'rna') {
      $data =~ tr/gaucGAUC/cuagCUAG/;
    }
    else {
      $data =~ tr/gatcGATC/ctagCTAG/;
    }
  }
  return $data;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
837
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return an open filehandle (via fhcache) for the file that contains record
# $id.  Returns nothing when the id maps to no file; throws "Can't open
# file ..." when the file cannot be opened.
sub fh {
  my ($self, $id) = @_;
  my $file = $self->file($id);
  return unless $file;
  my $handle = $self->fhcache("$self->{dirname}/$file");
  return $handle || $self->throw( "Can't open file $file");
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
844
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the header line of record $id with the leading character and the
# trailing newline removed.  Returns nothing when $id has no index entry,
# when its file handle is unavailable, or when the seek/read fails.
sub header {
  my ($self, $id) = @_;

  # Guard before unpacking, like the sibling accessors do; unpacking a
  # missing (undef) record only produces warnings and undefined fields.
  my $packed = $self->{offsets}{$id} or return;
  my ($offset,$seqlength,$linelength,$firstline,$type,$file)
    = $self->_unpack($packed);

  my $fh = $self->fh($id) or return;

  # The header occupies the $firstline bytes immediately before $offset.
  seek($fh, $offset - $firstline, 0) or return;
  my $data;
  defined read($fh, $data, $firstline) or return;

  chomp $data;
  substr($data,0,1) = '';   # drop the first character of the header line
  return $data;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
859
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Convert a 1-based sequence position within record $id into a byte
# position in the file.  The index record supplies the offset of the first
# residue, the sequence length and the line length; each line is taken to
# hold ($linelength - 1) residues followed by one terminator byte.
# Positions outside the sequence are clamped to its first/last residue.
sub caloffset {
  my ($self, $id, $position) = @_;
  my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id});

  # Zero-based residue index.  Apply the upper clamp first so that a
  # zero-length sequence ends up at index 0 rather than -1.
  my $index = $position - 1;
  $index = $seqlength - 1 if $index >= $seqlength;
  $index = 0              if $index < 0;

  # A stored line length of 1 (or less) leaves no residues per line; avoid
  # dividing by zero and report the start of the sequence instead.
  my $per_line = $linelength - 1;
  return $offset if $per_line <= 0;

  return $offset + $linelength * int($index / $per_line) + $index % $per_line;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
869
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return a cached read handle for $path, opening it on first use.
#
# {curopen} counts the handles currently cached and {maxopen} is the cap.
# When the cap is reached, the handles with the lowest {cacheseq} request
# counts are dropped: int(maxopen / 3) of them, but at least one whenever
# maxopen is 1 or more, so a small cap is still enforced.
# Returns nothing when $path cannot be opened; a failed open is not
# recorded in the cache, so it cannot skew the {curopen} accounting.
sub fhcache {
  my ($self, $path) = @_;

  if (!$self->{fhcache}{$path}) {
    if ($self->{curopen} >= $self->{maxopen}) {
      my @lru = sort { $self->{cacheseq}{$a} <=> $self->{cacheseq}{$b} }
                keys %{$self->{fhcache}};
      my $evict = int($self->{maxopen} / 3);
      $evict = 1 if $evict < 1 && $self->{maxopen} >= 1;
      splice(@lru, $evict) if @lru > $evict;   # keep only the victims
      $self->{curopen} -= @lru;
      delete @{ $self->{fhcache} }{@lru};
    }

    # Explicit read mode: the path is always treated as a literal file
    # name, never as a 2-arg-open expression.
    my $fh = IO::File->new($path, 'r') or return;
    $self->{fhcache}{$path} = $fh;
    $self->{curopen}++;
  }

  $self->{cacheseq}{$path}++;
  return $self->{fhcache}{$path};
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
886
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Pack the given field list into an index record using the STRUCT template.
sub _pack {
  my ($self, @fields) = @_;
  return pack(STRUCT, @fields);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
891
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Unpack an index record with the STRUCT template.  Callers in this file
# destructure the result as
# ($offset,$seqlength,$linelength,$firstline,$type,$file).
sub _unpack {
  my ($self, $packed) = @_;
  return unpack(STRUCT, $packed);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
896
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Classify a string of residues: DNA when it consists only of g/a/t/c/n
# (either case), '*' or '-'; RNA when it consists only of g/a/u/c/n, '*'
# or '-'; PROTEIN otherwise.  The DNA test is tried first.
sub _type {
  my ($self, $string) = @_;
  if ($string =~ /^[gatcnGATCN*-]+$/) {
    return DNA;
  }
  if ($string =~ /^[gaucnGAUCN*-]+$/) {
    return RNA;
  }
  return PROTEIN;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
904
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
=head2 get_PrimarySeq_stream

 Title   : get_PrimarySeq_stream
 Usage   : my $stream = $db->get_PrimarySeq_stream;
 Function: creates a Bio::DB::Fasta::Stream over this database
 Example : while (my $seq = $stream->next_seq) { ... }
 Returns : a Bio::DB::Fasta::Stream object
 Args    : none

=cut

sub get_PrimarySeq_stream {
  my ($self) = @_;
  my $stream = Bio::DB::Fasta::Stream->new($self);
  return $stream;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
921
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# tie() constructor: forwards all tie arguments to new().
sub TIEHASH {
  my ($class, @args) = @_;
  return $class->new(@args);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
926
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash read: $hash{$key} is answered by subseq($key).
sub FETCH {
  my ($self, @key) = @_;
  return $self->subseq(@key);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash write: always refused, the database is read-only.
sub STORE {
  my ($self) = @_;
  return $self->throw("Read-only database");
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash delete: always refused, the database is read-only.
sub DELETE {
  my ($self) = @_;
  return $self->throw("Read-only database");
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash clear: always refused, the database is read-only.
sub CLEAR {
  my ($self) = @_;
  return $self->throw("Read-only database");
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash exists(): true when offset() yields a defined value for the key.
sub EXISTS {
  my ($self, @key) = @_;
  my $offset = $self->offset(@key);
  return defined $offset;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash iteration start: delegate to the object the offsets hash is
# tied to.
sub FIRSTKEY {
  my ($self, @args) = @_;
  my $index = tied %{ $self->{offsets} };
  return $index->FIRSTKEY(@args);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash iteration step: delegate to the object the offsets hash is
# tied to.
sub NEXTKEY {
  my ($self, @args) = @_;
  my $index = tied %{ $self->{offsets} };
  return $index->NEXTKEY(@args);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
944
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Destructor.  If {indexing} is still set, the object is going away while
# an index file was being built, so the incomplete index is removed (with
# a warning naming the file).
sub DESTROY {
  my ($self) = @_;
  my $partial_index = $self->{indexing};
  return unless $partial_index;  # killed prematurely, so index file is no good!
  warn "indexing was interrupted, so unlinking $partial_index";
  unlink $partial_index;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
952
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
953 #-------------------------------------------------------------
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
954 # Bio::PrimarySeqI compatibility
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
955 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
956 package Bio::PrimarySeq::Fasta;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
957 use overload '""' => 'display_id';
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
958
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
959 use vars '@ISA';
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
960 eval {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
961 require Bio::PrimarySeqI;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
962 require Bio::Root::Root;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
963 } && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI'));
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
964
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Build a lightweight sequence object that refers to region $start..$stop
# of record $id in database $db.  $start defaults to 1 and $stop to the
# database's length for $id.  May be called on a class name or on an
# existing instance (whose class is then reused).
sub new {
  my ($class, $db, $id, $start, $stop) = @_;
  $class = ref($class) || $class;

  return bless {
    db    => $db,
    id    => $id,
    start => $start || 1,
    # Force scalar context: a length() that returns an empty list (id not
    # indexed) would otherwise leave the hash constructor with an odd
    # number of elements.
    stop  => $stop || scalar($db->length($id)),
  }, $class;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
975
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Fetch this object's region from the database: db->seq(id, start, stop).
sub seq {
  my ($self) = @_;
  my $db = $self->{db};
  return $db->seq(@{$self}{qw(id start stop)});
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
980
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return a new object covering positions $start..$stop of this object,
# counted from 1 relative to this object's own start.  For a forward object
# (start <= stop) the new coordinates grow from {start}; for a reversed
# object they shrink from {start}.  Throws unless $start <= $stop.
sub subseq {
  my ($self, $start, $stop) = @_;
  $self->throw("Stop cannot be smaller than start") unless $start <= $stop;

  my $anchor = $self->{start};
  my ($from, $to) = ($start - 1, $stop - 1);
  my @range = $anchor <= $self->{stop}
            ? ($anchor + $from, $anchor + $to)
            : ($anchor - $from, $anchor - $to);

  return $self->new($self->{db}, $self->{id}, @range);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
996
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the sequence id; also used as the object's string overload.
sub display_id {
  my ($self) = @_;
  my $id = $self->{id};
  return $id;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1001
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# No accession number is tracked for these objects; always "unknown".
sub accession_number {
  my ($self) = @_;
  my $accession = "unknown";
  return $accession;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1006
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the object's plain reference string, bypassing the "" overload
# (which would otherwise yield the display id).
sub primary_id {
  my ($self) = @_;
  my $raw_ref = overload::StrVal($self);
  return $raw_ref;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1011
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Always false (0).
sub can_call_new {
  return 0;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1013
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Ask the database for the alphabet of this object's sequence id.
sub alphabet {
  my ($self) = @_;
  my ($db, $id) = @{$self}{qw(db id)};
  return $db->alphabet($id);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1018
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return a new object for the same region with start and stop swapped;
# the database returns the reverse complement when start > stop.
sub revcom {
  my ($self) = @_;
  my ($db, $id, $start, $stop) = @{$self}{qw(db id start stop)};
  return $self->new($db, $id, $stop, $start);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1023
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the number of residues this object represents, i.e. the size of
# its start..stop region rather than the size of the whole database entry,
# so that the value agrees with what seq() returns for a sub-range.
# The end points are clamped to 1..(entry length) -- the same bounds
# caloffset() applies; presumably that matches what db->seq() delivers
# (TODO confirm against Bio::DB::Fasta::seq).
# Returns nothing when the database has no length for the id.
sub length {
  my ($self) = @_;
  my $full = $self->{db}->length($self->{id});
  return unless defined $full;
  return 0 unless $full;

  my ($from, $to) = map { $_ < 1 ? 1 : $_ > $full ? $full : $_ }
                    @{$self}{qw(start stop)};
  return abs($to - $from) + 1;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1028
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# No description is provided for these objects; always the empty string.
sub desc {
  my ($self) = @_;
  my $description = '';
  return $description;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1033
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1034 #-------------------------------------------------------------
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1035 # stream-based access to the database
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1036 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1037 package Bio::DB::Fasta::Stream;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1038 use Tie::Handle;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1039 use vars qw(@ISA);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1040 @ISA = qw(Tie::Handle);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1041 eval {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1042 require Bio::DB::SeqI;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1043 } && (push @ISA,'Bio::DB::SeqI');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1044
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1045
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Create a stream over $db, positioned at the database's first key.
sub new {
  my ($class, $db) = @_;
  my $first_key = $db->FIRSTKEY;
  my %state = (db => $db, key => $first_key);
  return bless \%state, $class;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1052
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Return the sequence object for the stream's current key and advance to
# the next key.  Returns nothing once the keys are exhausted.
#
# The key space being walked also holds the "__file_<n>" / "__path_<path>"
# bookkeeping entries that path2fileno() stores next to the sequence ids;
# keys starting with "__" are skipped so they are never handed to
# get_Seq_by_id() as if they were sequence ids.
sub next_seq {
  my $self = shift;
  my ($key, $db) = @{$self}{'key','db'};

  while (defined $key && $key =~ /^__/) {
    $key = $db->NEXTKEY($key);
  }

  # End of stream: remember it and stop calling into the database.
  unless (defined $key) {
    $self->{key} = undef;
    return;
  }

  my $value = $db->get_Seq_by_id($key);
  $self->{key} = $db->NEXTKEY($key);
  return $value;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1060
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# tie() constructor for filehandles: builds a stream over $db via new().
sub TIEHANDLE {
  my ($class, $db) = @_;
  return $class->new($db);
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Tied-handle read: each <$fh> yields whatever next_seq() returns.
sub READLINE {
  my ($self) = @_;
  return $self->next_seq;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1070
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1071 1;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1072
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1073 __END__
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1074