annotate variant_effect_predictor/Bio/DB/Fasta.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 use Bio::DB::Fasta;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 # create database from directory of fasta files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12 # simple access (for those without Bioperl)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 my @ids = $db->ids;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 my $length = $db->length('CHROMOSOME_I');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 my $alphabet = $db->alphabet('CHROMOSOME_I');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 my $header = $db->header('CHROMOSOME_I');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 # Bioperl-style access
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24 my $seq = $obj->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 my $subseq = $obj->subseq(4_000_000 => 4_100_000);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26 my $length = $obj->length;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 # (etc)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 # Bio::SeqIO-style access
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 while (my $seq = $stream->next_seq) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 # Bio::PrimarySeqI stuff
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 while (my $seq = <$fh>) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 # Bio::PrimarySeqI stuff
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 # tied hash access
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 print $sequences{'CHROMOSOME_I:1,20000'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 provides random access to each sequence entry, and to subsequences
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 within each entry, allowing you to retrieve portions of very large
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49 sequences without bringing the entire sequence into memory.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51 When you initialize the module, you point it at a single fasta file or
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 a directory of multiple such files. The first time it is run, the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 module generates an index of the contents of the file or directory
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 using the AnyDBM_File module (Berkeley DB preferred, followed by GDBM_File,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 the file and offset for any requested sequence. If one of the source
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 fasta files is updated, the module reindexes just that one file. (You
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 can also force reindexing manually). For improved performance, the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 module keeps a cache of open filehandles, closing less-recently used
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 ones when the cache is full.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 The fasta files may contain any combination of nucleotide and protein
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 sequences; during indexing the module guesses the molecular type.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64 Entries may have any line length, and different line lengths are
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 allowed in the same file. However, within a sequence entry, all lines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 must be the same length except for the last.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 the Fasta header. During indexing, you may pass a callback routine to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 modify this primary ID. For example, you may wish to extract a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 The original header line can be recovered later.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 This module was developed for use with the C. elegans and human
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 genomes, and has been tested with sequence segments as large as 20
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 megabases. Indexing the C. elegans genome (100 megabases of genomic
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz Pentium
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 laptop. On the same system, average access time for any 200-mer within
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 the C. elegans genome was E<lt>0.02s.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 =head1 DATABASE CREATION AND INDEXING
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 The two constructors for this class are new() and newFh(). The former
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 creates a Bio::DB::Fasta object which is accessed via method calls.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 The latter creates a tied filehandle which can be used Bio::SeqIO
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 style to fetch sequence objects in a stream fashion. There is also a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 tied hash interface.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 =over 4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 Create a new Bio::DB::Fasta object from the Fasta file or files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 indicated by $fasta_path. Indexing will be performed automatically if
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 needed. If successful, new() will return the database accessor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 object. Otherwise it will return undef.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98 $fasta_path may be an individual Fasta file, or may refer to a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 directory containing one or more of such files. Following the path,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 you may pass a series of name=E<gt>value options or a hash with these
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 same name=E<gt>value pairs. Valid options are:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 Option Name Description Default
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 ----------- ----------- -------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107 for searching for Fasta
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 files in directories.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 -makeid A code subroutine for None
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 transforming Fasta IDs.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 -maxopen Maximum size of 32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114 filehandle cache.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116 -debug Turn on status 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 messages.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 -reindex Force the index to be 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 rebuilt.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 -dbmargs Additional arguments none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 to pass to the DBM
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 routines when tied
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125 (scalar or array ref).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 -dbmargs can be used to control the format of the index. For example,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128 you can pass $DB_BTREE to this argument so as to force the IDs to be
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129 sorted and retrieved alphabetically. Note that you must use the same
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130 arguments every time you open the index!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 -reindex can be used to force the index to be recreated from scratch.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138 Bio::SeqIO style.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 The -makeid option gives you a chance to modify sequence IDs during
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143 indexing. The option's value should be a code reference that will
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144 take a scalar argument and return a scalar result, like this:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148 sub make_my_id {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 my $description_line = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150 # get a new id from the fasta header
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 return $new_id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 make_my_id() will be called with the full fasta id line (including the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155 "E<gt>" symbol!). For example:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 >A12345.3 Predicted C. elegans protein egl-2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 By default, this module will use the regular expression /^E<gt>(\S+)/
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 to extract "A12345.3" for use as the ID. If you pass a -makeid
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 callback, you can extract any portion of this, such as the "egl-2"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 symbol.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164 The -makeid option is ignored after the index is constructed.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166 =head1 OBJECT METHODS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 The following object methods are provided.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170 =over 4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174 Return the raw sequence (a string) given an ID and optionally a start
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175 and stop position in the sequence. In the case of DNA sequence, if
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 $stop is less than $start, then the reverse complement of the sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177 is returned (this violates Bio::Seq conventions).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 For your convenience, subsequences can be indicated with this compound
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 ID:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 $db->seq("$id:$start,$stop")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 =item $length = $db-E<gt>length($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 Return the length of the indicated sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188 =item $header = $db-E<gt>header($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 Return the header line for the ID, including the initial "E<gt>".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 =item $type = $db-E<gt>alphabet($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 Return the molecular type of the indicated sequence. One of "dna",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 "rna" or "protein".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 =item $filename = $db-E<gt>file($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 Return the name of the file in which the indicated sequence can be
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 found.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 =item $offset = $db-E<gt>offset($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204 Return the offset of the indicated sequence from the beginning of the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 file in which it is located. The offset points to the beginning of
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 the sequence, not the beginning of the header line.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 =item $header_length = $db-E<gt>headerlen($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 Return the length of the header line for the indicated sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 =item $header_offset = $db-E<gt>header_offset($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 Return the offset of the header line for the indicated sequence from
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215 the beginning of the file in which it is located.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 =item $index_name = $db-E<gt>index_name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 Return the path to the index file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 =item $path = $db-E<gt>path
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223 Return the path to the Fasta file(s).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 For BioPerl-style access, the following methods are provided:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229 =over 4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 =item $seq = $db-E<gt>get_Seq_by_id($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 Return a Bio::PrimarySeq::Fasta object, which obeys the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 protein sequence, call $seq-E<gt>seq().
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 Note that get_Seq_by_id() does not bring the entire sequence into
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 memory until requested. Internally, the returned object uses the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 accessor to generate subsequences as needed.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 =item $seq = $db-E<gt>get_Seq_by_acc($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 These methods all do the same thing as get_Seq_by_id().
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 =item $stream = $db-E<gt>get_PrimarySeq_stream()
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 Return a Bio::DB::Fasta::Stream object, which supports a single method
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250 next_seq(). Each call to next_seq() returns a new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 Bio::PrimarySeq::Fasta object, until no more sequences remain.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256 returned from get_Seq_by_id() and get_PrimarySeq_stream().
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258 =head1 TIED INTERFACES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260 This module provides two tied interfaces, one which allows you to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261 treat the sequence database as a hash, and the other which allows you
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262 to treat the database as an I/O stream.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264 =head2 Creating a Tied Hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266 The tied hash interface is very straightforward.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 =over 4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273 The optional @args list is the same set of named argument/value pairs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274 used by Bio::DB::Fasta-E<gt>new().
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 If successful, tie() will return the tied object. Otherwise it will
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277 return undef.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281 Once tied, you can use the hash to retrieve an individual sequence by
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282 its ID, like this:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284 my $seq = $db{CHROMOSOME_I};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 You may select a subsequence by appending the comma-separated range to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287 the sequence ID in the format "$id:$start,$stop". For example, here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290 my $seq = $db{'CHROMOSOME_I:1,1000'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292 (The regular expression used to parse this format allows sequence IDs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 to contain colons.)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295 When selecting subsequences, if $start E<gt> $stop, then the reverse
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296 complement will be returned for DNA sequences.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298 The keys() and values() functions will return the sequence IDs and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299 their sequences, respectively. In addition, each() can be used to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300 iterate over the entire data set:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302 while (my ($id,$sequence) = each %db) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303 print "$id => $sequence\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 When dealing with very large sequences, you can avoid bringing them
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307 into memory by calling each() in a scalar context. This returns the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 object and call its methods.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311 while (my $id = each %db) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312 print "$id => ",$db{"$id:1,100"},"\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313 print "$id => ",tied(%db)->length($id),"\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316 You may, in addition, invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317 hash methods directly.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319 =over 4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321 =item $id = $db-E<gt>FIRSTKEY
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323 Return the first ID in the database.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325 =item $id = $db-E<gt>NEXTKEY($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 Given an ID, return the next ID in sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331 This allows you to write the following iterative loop using just the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332 object-oriented interface:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 # do something with sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
338
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
339 =head2 Creating a Tied Filehandle
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
340
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
342 which you can read Bio::PrimarySeq::Fasta sequence objects
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343 sequentially. The following bit of code will iterate sequentially
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344 over all sequences in the database:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 while (my $seq = <$fh>) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348 print $seq->id,' => ',$seq->length,"\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351 When no more sequences remain to be retrieved, the stream will return
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
352 undef.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
353
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
354 =head1 BUGS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
355
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
356 When a sequence is deleted from one of the Fasta files, this deletion
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
357 is not detected by the module, so its entry is not removed from the index. As a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
358 result, a "ghost" entry will remain in the index and will return
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
359 garbage results if accessed.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
360
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
361 Currently, the only way to accommodate deletions is to rebuild the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
362 entire index, either by deleting it manually, or by passing
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
363 -reindex=E<gt>1 to new() when initializing the module.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
364
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
365 =head1 SEE ALSO
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
366
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
367 L<bioperl>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
368
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
369 =head1 AUTHOR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
370
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
372
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
374
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
375 This library is free software; you can redistribute it and/or modify
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
376 it under the same terms as Perl itself. See DISCLAIMER.txt for
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
377 disclaimers of warranty.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
378
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
379 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
380
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
381 #'
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
382 package Bio::DB::Fasta;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
383
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
384 BEGIN {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
386 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
387
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
388 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
389 use IO::File;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
390 use AnyDBM_File;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
391 use Fcntl;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
392 use File::Basename qw(basename dirname);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
393 use Bio::DB::SeqI;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
394 use Bio::Root::Root;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
395 use vars qw($VERSION @ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
396
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
398
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
399 $VERSION = '1.03';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
400
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
401 *seq = *sequence = \&subseq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
402 *ids = \&get_all_ids;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
404
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
405 use constant STRUCT =>'NNnnCa*';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
406 use constant DNA => 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
407 use constant RNA => 2;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
408 use constant PROTEIN => 3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
409
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
410 # Bio::DB-like object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
411 # providing fast random access to a directory of FASTA files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
412
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
413 =head2 new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
414
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
415 Title : new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
416 Usage : my $db = Bio::DB::Fasta->new( $path, @options);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
417 Function: initialize a new Bio::DB::Fasta object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
418 Returns : new Bio::DB::Fasta object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
419 Args : path to dir of fasta files or a single filename
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
420
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
421 These are optional arguments to pass in as well.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
422
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
423 -glob Glob expression to use *.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
424 for searching for Fasta
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
425 files in directories.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
426
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
427 -makeid A code subroutine for None
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
428 transforming Fasta IDs.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
429
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
430 -maxfh Maximum size of 32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
431 filehandle cache.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
432
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
433 -debug Turn on status 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
434 messages.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
435
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
436 -reindex Force the index to be 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
437 rebuilt.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
438
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
439 -dbmargs Additional arguments none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
440 to pass to the DBM
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
441 routines when tied
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
442 (scalar or array ref).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
443
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
444 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
445
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
446 sub new {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
447 my $class = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
448 my $path = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
449 my %opts = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
450
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
451 my $self = bless { debug => $opts{-debug},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
452 makeid => $opts{-makeid},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
453 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
454 maxopen => $opts{-maxfh} || 32,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
455 dbmargs => $opts{-dbmargs} || undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
456 fhcache => {},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
457 cacheseq => {},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
458 curopen => 0,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
459 openseq => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
460 dirname => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
461 offsets => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
462 }, $class;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
463 my ($offsets,$dirname);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
464
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
465 if (-d $path) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
466 $offsets = $self->index_dir($path,$opts{-reindex});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
467 $dirname = $path;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
468 } elsif (-f _) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
469 $offsets = $self->index_file($path,$opts{-reindex});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
470 $dirname = dirname($path);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
471 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
472 $self->throw( "$path: Invalid file or dirname");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
473 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
474 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
475
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
476 $self;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
477 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
478
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
479 =head2 newFh
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
480
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
481 Title : newFh
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
482 Function: builds a database object via new() and returns a filehandle tied to Bio::DB::Fasta::Stream
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
483 Example : my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
484 Returns : GLOB
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
485 Args : same arguments as new()
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
486
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
487 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
488
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
489 sub newFh {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
490 my $class = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
491 my $self = $class->new(@_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
492 require Symbol;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
493 my $fh = Symbol::gensym or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
494 tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
495 $fh;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
496 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
497
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
498 sub _open_index {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
499 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
500 my ($index,$write) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
501 my %offsets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
502 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
503 my @dbmargs = $self->dbmargs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
504 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
505 return \%offsets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
506 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
507
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
508 =head2 index_dir
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
509
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
510 Title : index_dir
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
511 Usage : $db->index_dir($dir)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
512 Function: set the index dir and load all files in the dir
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
513 Returns : hashref of seq offsets in each file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
514 Args : dirname, boolean to force a reload of all files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
515
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
516 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
517
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
518 sub index_dir {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
519 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
520 my $dir = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
521 my $force_reindex = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
522
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
523 # find all fasta files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
524 my @files = glob("$dir/$self->{glob}");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
525 $self->throw( "no fasta files in $dir") unless @files;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
526
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
527 # get name of index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
528 my $index = $self->index_name($dir,1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
529
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
530 # if caller has requested reindexing, then unlink
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
531 # the index file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
532 unlink $index if $force_reindex;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
533
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
534 # get the modification time of the index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
535 my $indextime = (stat($index))[9] || 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
536
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
537 # get the most recent modification time of any of the contents
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
538 my $modtime = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
539 my %modtime;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
540 foreach (@files) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
541 my $m = (stat($_))[9];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
542 $modtime{$_} = $m;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
543 $modtime = $m if $modtime < $m;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
544 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
545
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
546 my $reindex = $force_reindex || $indextime < $modtime;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
547 my $offsets = $self->_open_index($index,$reindex) or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
548 $self->{offsets} = $offsets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
549
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
550 # no indexing needed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
551 return $offsets unless $reindex;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
552
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
553 # otherwise reindex contents of changed files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
554 $self->{indexing} = $index;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
555 foreach (@files) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
556 next if( defined $indextime && $modtime{$_} <= $indextime);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
557 $self->calculate_offsets($_,$offsets);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
558 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
559 delete $self->{indexing};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
560 return $self->{offsets};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
561 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
562
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
563 =head2 get_Seq_by_id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
564
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
565 Title : get_Seq_by_id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
566 Usage : my $seq = $db->get_Seq_by_id($id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
567 Function: Bio::DB::RandomAccessI method implemented
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
568 Returns : Bio::PrimarySeqI object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
569 Args : id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
570
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
571 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
572
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
573 sub get_Seq_by_id {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
574 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
575 my $id = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
576 return Bio::PrimarySeq::Fasta->new($self,$id);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
577 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
578
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
579 =head2 index_file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
580
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
581 Title : index_file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
582 Usage : $db->index_file($filename)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
583 Function: (re)loads a sequence file and indexes the sequence offsets in the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
584 Returns : seq offsets in the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
585 Args : filename,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
586 boolean to force reloading a file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
587
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
588 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
589
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
590
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
591 sub index_file {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
592 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
593 my $file = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
594 my $force_reindex = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
595
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
596 my $index = $self->index_name($file);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
597 # if caller has requested reindexing, then unlink the index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
598 unlink $index if $force_reindex;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
599
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
600 # get the modification time of the index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
601 my $indextime = (stat($index))[9];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
602 my $modtime = (stat($file))[9];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
603
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
604 my $reindex = $force_reindex || $indextime < $modtime;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
605 my $offsets = $self->_open_index($index,$reindex) or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
606 $self->{offsets} = $offsets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
607
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
608 return $self->{offsets} unless $reindex;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
609
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
610 $self->{indexing} = $index;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
611 $self->calculate_offsets($file,$offsets);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
612 delete $self->{indexing};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
613 return $self->{offsets};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
614 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
615
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
616 =head2 dbmargs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
617
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
618 Title : dbmargs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
619 Usage : my @args = $db->dbmargs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
620 Function: gets stored dbm arguments
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
621 Returns : array
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
622 Args : none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
623
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
624
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
625 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
626
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
627 sub dbmargs {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
628 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
629 my $args = $self->{dbmargs} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
630 return ref($args) eq 'ARRAY' ? @$args : $args;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
631 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
632
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
633 =head2 index_name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
634
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
635 Title : index_name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
636 Usage : my $indexname = $db->index_name($path,$isdir);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
637 Function: returns the name of the index for a specific path
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
638 Returns : string
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
639 Args : path to check,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
640 boolean if it is a dir
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
641
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
642 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
643
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
644 sub index_name {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
645 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
646 my ($path,$isdir) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
647 unless ($path) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
648 my $dir = $self->{dirname} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
649 return $self->index_name($dir,-d $dir);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
650 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
651 return "$path/directory.index" if $isdir;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
652 return "$path.index";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
653 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
654
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
655 =head2 calculate_offsets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
656
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
657 Title : calculate_offsets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
658 Usage : $db->calculate_offsets($filename,$offsets);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
659 Function: calculates the sequence offsets in a file based on id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
660 Returns : reference to a local hash that is never filled; the computed offsets are stored in $offsets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
661 Args : file to process
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
662 $offsets - hashref of id to offset storage
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
663
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
664 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
665
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
666 sub calculate_offsets {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
667 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
668 my ($file,$offsets) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
669 my $base = $self->path2fileno(basename($file));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
670
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
671 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
672 warn "indexing $file\n" if $self->{debug};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
673 my ($offset,$id,$linelength,$type,$firstline,$count,%offsets);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
674 while (<$fh>) { # don't try this at home
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
675 if (/^>(\S+)/) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
676 print STDERR "indexed $count sequences...\n"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
677 if $self->{debug} && (++$count%1000) == 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
678 my $pos = tell($fh);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
679 if ($id) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
680 my $seqlength = $pos - $offset - length($_) - 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
681 $seqlength -= int($seqlength/$linelength);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
682 $offsets->{$id} = $self->_pack($offset,$seqlength,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
683 $linelength,$firstline,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
684 $type,$base);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
685 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
686 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
687 ($offset,$firstline,$linelength) = ($pos,length($_),0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
688 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
689 $linelength ||= length($_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
690 $type ||= $self->_type($_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
691 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
692 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
693 # deal with last entry
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
694 if ($id) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
695 my $pos = tell($fh);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
696
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
697 # my $seqlength = $pos - $offset - length($_) - 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
698 # $_ is always null should not be part of this calculation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
699 my $seqlength = $pos - $offset - 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
700
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
701 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
702 $seqlength = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
703 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
704 $seqlength -= int($seqlength/$linelength);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
705 };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
706 $offsets->{$id} = $self->_pack($offset,$seqlength,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
707 $linelength,$firstline,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
708 $type,$base);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
709 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
710 return \%offsets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
711 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
712
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
713 =head2 get_all_ids
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
714
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
715 Title : get_all_ids
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
716 Usage : my @ids = $db->get_all_ids
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
717 Function: gets all the stored ids in all indexes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
718 Returns : list of ids
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
719 Args : none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
720
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
721 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
722
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
723 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
724
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
725 sub offset {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
726 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
727 my $id = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
728 my $offset = $self->{offsets}{$id} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
729 ($self->_unpack($offset))[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
730 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
731
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
732 sub length {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
733 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
734 my $id = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
735 my $offset = $self->{offsets}{$id} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
736 ($self->_unpack($offset))[1];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
737 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
738
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
739 sub linelen {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
740 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
741 my $id = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
742 my $offset = $self->{offsets}{$id} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
743 ($self->_unpack($offset))[2];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
744 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
745
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
746 sub headerlen {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
747 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
748 my $id = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
749 my $offset = $self->{offsets}{$id} or return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
750 ($self->_unpack($offset))[3];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
751 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
752
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Translate the numeric type code stored in the index record for $id
# (field 4 of the unpacked record) into the string 'dna', 'rna' or
# 'protein'.  Any code other than DNA or RNA is reported as 'protein'.
# Returns nothing when $id has no entry in the offsets table.
sub alphabet {
  my ($self, $id) = @_;
  my $record = $self->{offsets}{$id};
  return unless $record;
  my $type = ($self->_unpack($record))[4];
  if ($type == DNA) {
    return 'dna';
  }
  if ($type == RNA) {
    return 'rna';
  }
  return 'protein';
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
763
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return the value stored under the object's 'dirname' key.
sub path {
  my $self = shift;
  return $self->{dirname};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
765
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return the file position at which the header line of $id begins:
# the sequence offset minus the header length.  Returns nothing when
# $id has no entry in the offsets table.
sub header_offset {
  my ($self, $id) = @_;
  if ($self->{offsets}{$id}) {
    my $seq_offset = $self->offset($id);
    my $header_len = $self->headerlen($id);
    return $seq_offset - $header_len;
  }
  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
772
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return the path of the file that holds sequence $id.  The index
# record stores a file number (field 5 of the unpacked record), which
# fileno2path() turns back into a path.  Returns nothing when $id has
# no entry in the offsets table.
sub file {
  my ($self, $id) = @_;
  my $record = $self->{offsets}{$id};
  return unless $record;
  my $fileno = ($self->_unpack($record))[5];
  return $self->fileno2path($fileno);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
779
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Look up the path registered for file number $no.  Paths are kept in
# the offsets table under the key "__file_<number>".
sub fileno2path {
  my ($self, $no) = @_;
  my $key = "__file_$no";
  return $self->{offsets}{$key};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
785
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return the file number assigned to $path, allocating the next free
# number on first sight.  A new assignment is recorded in the offsets
# table in both directions: "__path_<path>" => number and
# "__file_<number>" => path.  The counter lives in $self->{fileno}.
sub path2fileno {
  my ($self, $path) = @_;
  my $path_key = "__path_$path";
  unless (defined $self->{offsets}{$path_key}) {
    my $fileno = 0 + $self->{fileno}++;
    $self->{offsets}{$path_key}        = $fileno;
    $self->{offsets}{"__file_$fileno"} = $path;
  }
  return $self->{offsets}{$path_key};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
795
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
796 =head2 subseq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
797
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
798 Title : subseq
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
799 Usage : $seqdb->subseq($id,$start,$stop);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
800 Function: returns a subseq of a sequence in the db
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
801 Returns : subsequence data
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
802 Args : id of sequence, starting point, ending point
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
803
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
804 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
805
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Fetch the residues of sequence $id between $start and $stop.
#
# $id may also be a compound string of the form "id:start-stop" or
# "id:start,stop" (underscores allowed inside the numbers), in which
# case the coordinates are taken from the string.  A false $start
# means 1, and a false $stop means the full length of the sequence.
# When $start is greater than $stop the two are swapped and the result
# is reversed and g/a/t/c are complemented.
#
# Returns the sequence string with newlines removed, or nothing when
# no filehandle can be obtained for $id.
sub subseq {
  my ($self,$id,$start,$stop) = @_;

  if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) {
    ($id,$start,$stop) = ($1,$2,$3);
    for my $coord ($start,$stop) {   # aliases, so both variables are edited
      $coord =~ s/_//g;
    }
  }
  $start = 1                  unless $start;
  $stop  = $self->length($id) unless $stop;

  my $reversed = $start > $stop;
  ($start,$stop) = ($stop,$start) if $reversed;

  my $fh = $self->fh($id) or return;
  my $first_byte = $self->caloffset($id,$start);
  my $last_byte  = $self->caloffset($id,$stop);

  # The byte range still contains the line terminators of every line
  # it crosses; they are stripped after reading.
  my $data;
  seek($fh,$first_byte,0);
  read($fh,$data,$last_byte-$first_byte+1);
  $data =~ s/\n//g;

  if ($reversed) {
    $data = scalar reverse $data;
    $data =~ tr/gatcGATC/ctagCTAG/;
  }
  return $data;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
837
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return an open filehandle for the file containing sequence $id.
# The handle comes from fhcache(), keyed by "<dirname>/<file>".
# Returns nothing when file() finds no file for $id; calls throw()
# when the cache cannot supply a handle.
sub fh {
  my ($self, $id) = @_;
  my $file = $self->file($id) or return;
  my $handle = $self->fhcache("$self->{dirname}/$file");
  return $handle if $handle;
  return $self->throw( "Can't open file $file");
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
844
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return the header line of sequence $id with its first character and
# its trailing newline removed.
#
# The header is the $firstline bytes (field 3 of the index record) that
# end where the sequence data begins (field 0).
#
# Returns nothing when $id has no entry in the offsets table, when no
# filehandle can be obtained for it, or when the seek or read fails.
sub header {
  my ($self, $id) = @_;

  # Guard the lookup the same way the sibling accessors do, so that an
  # unknown id never reaches _unpack() as undef.
  my $record = $self->{offsets}{$id} or return;
  my ($offset, $firstline) = ($self->_unpack($record))[0, 3];

  my $fh = $self->fh($id) or return;

  my $data;
  seek($fh, $offset - $firstline, 0)     or return;
  defined read($fh, $data, $firstline)   or return;

  chomp $data;
  substr($data, 0, 1) = '';   # drop the first character of the header line
  return $data;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
859
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Convert a 1-based sequence position into a byte offset in the file.
#
# The position is first made 0-based and clamped to the range
# 0 .. seqlength-1.  Each line of the file holds ($linelength - 1)
# residues and occupies $linelength bytes, so the result is the
# sequence start plus whole lines skipped plus the column within the
# final line.
sub caloffset {
  my ($self, $id, $position) = @_;
  my $index = $position - 1;
  my ($offset,$seqlength,$linelength) = $self->_unpack($self->{offsets}{$id});

  $index = 0              if $index < 0;
  $index = $seqlength - 1 if $index >= $seqlength;

  my $per_line   = $linelength - 1;
  my $full_lines = int($index / $per_line);
  my $column     = $index % $per_line;
  return $offset + $linelength * $full_lines + $column;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
869
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return an open IO::File handle for $path, opening it on first use and
# remembering it in $self->{fhcache}.
#
# $self->{curopen} counts the cached handles.  When it has reached
# $self->{maxopen}, the least-used entries (lowest per-path use counter
# in $self->{cacheseq}) are dropped before the new file is opened:
# about a third of maxopen, and always at least one.
#
# Returns nothing when the file cannot be opened.  A failed open leaves
# no entry behind in the cache, so it cannot later be counted as an
# open handle during eviction.
sub fhcache {
  my ($self, $path) = @_;

  if (!$self->{fhcache}{$path}) {
    if ($self->{curopen} >= $self->{maxopen}) {
      my @lru = sort { $self->{cacheseq}{$a} <=> $self->{cacheseq}{$b} }
                keys %{$self->{fhcache}};

      # int(maxopen/3) is 0 for maxopen < 3; evicting nothing would let
      # the cache grow without bound, so always evict at least one.
      my $evict = int($self->{maxopen} / 3);
      $evict = 1 if $evict < 1;
      splice(@lru, $evict) if @lru > $evict;   # keep only the entries to drop

      $self->{curopen} -= @lru;
      delete $self->{fhcache}{$_} for @lru;
    }

    # Open into a lexical first so a failure does not store undef.
    my $handle = IO::File->new($path) or return;
    $self->{fhcache}{$path} = $handle;
    $self->{curopen}++;
  }

  $self->{cacheseq}{$path}++;
  return $self->{fhcache}{$path};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
886
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Pack a list of index fields into a single string using the STRUCT
# template.  The invocant is ignored.
sub _pack {
  my ($self, @fields) = @_;
  return pack(STRUCT, @fields);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
891
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Unpack an index record string into its list of fields using the
# STRUCT template.  The invocant is ignored.
sub _unpack {
  my ($self, $record) = @_;
  return unpack(STRUCT, $record);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
896
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Classify a sequence string by the characters it contains.
# Strings made only of g/a/t/c/n (either case), '*' and '-' are DNA;
# strings made only of g/a/u/c/n (either case), '*' and '-' are RNA;
# everything else is PROTEIN.  DNA is tested first, so a string that
# fits both patterns is reported as DNA.  The invocant is ignored.
sub _type {
  my ($self, $string) = @_;
  if ($string =~ /^[gatcnGATCN*-]+$/) {
    return DNA;
  }
  if ($string =~ /^[gaucnGAUCN*-]+$/) {
    return RNA;
  }
  return PROTEIN;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
904
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
905 =head2 get_PrimarySeq_stream
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
906
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
907 Title : get_PrimarySeq_stream
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
908 Usage :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
909 Function:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
910 Example :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
911 Returns :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
912 Args :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
913
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
914
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
915 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
916
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Return a new Bio::DB::Fasta::Stream object constructed over this
# database.
sub get_PrimarySeq_stream {
  my ($self) = @_;
  return Bio::DB::Fasta::Stream->new($self);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
921
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# tie() constructor: forwards all arguments to new().
sub TIEHASH {
  my ($class, @args) = @_;
  return $class->new(@args);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
926
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash read: delegates to subseq() with the given key.
sub FETCH {
  my ($self, @args) = @_;
  return $self->subseq(@args);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash write: always rejected through throw().
sub STORE {
  my $self = shift;
  return $self->throw("Read-only database");
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash delete: always rejected through throw().
sub DELETE {
  my $self = shift;
  return $self->throw("Read-only database");
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash clear: always rejected through throw().
sub CLEAR {
  my $self = shift;
  return $self->throw("Read-only database");
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash exists(): true when offset() returns a defined value for
# the given key.
sub EXISTS {
  my ($self, @args) = @_;
  my $offset = $self->offset(@args);
  return defined $offset;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash iteration start: delegates to the FIRSTKEY method of the
# object that the offsets hash is tied to.
sub FIRSTKEY {
  my ($self, @args) = @_;
  my $backend = tied %{ $self->{offsets} };
  return $backend->FIRSTKEY(@args);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Tied-hash iteration step: delegates to the NEXTKEY method of the
# object that the offsets hash is tied to.
sub NEXTKEY {
  my ($self, @args) = @_;
  my $backend = tied %{ $self->{offsets} };
  return $backend->NEXTKEY(@args);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
944
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Destructor.  $self->{indexing} still being set at destruction time
# means indexing did not finish, so the file it names is treated as
# unusable: a warning is issued and the file is unlinked.
sub DESTROY {
  my ($self) = @_;
  return unless $self->{indexing};
  warn "indexing was interrupted, so unlinking $self->{indexing}";
  unlink $self->{indexing};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
952
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
953 #-------------------------------------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
954 # Bio::PrimarySeqI compatibility
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
955 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
package Bio::PrimarySeq::Fasta;
use overload '""' => 'display_id';

use vars '@ISA';
# Inherit from the Bioperl base classes only when both can be loaded.
eval {
  require Bio::PrimarySeqI;
  require Bio::Root::Root;
} && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI'));

# Construct a lightweight view onto (part of) one database entry.
# Arguments: the database object, the sequence id, and optional start
# and stop positions.  A false start means 1; a false stop means the
# length the database reports for the id.  start greater than stop
# denotes a reversed range.  May be called on a class or an instance.
sub new {
  my $class = shift;
  $class = ref($class) if ref $class;
  my ($db,$id,$start,$stop) = @_;
  return bless { db    => $db,
                 id    => $id,
                 start => $start || 1,
                 stop  => $stop  || $db->length($id)
               },$class;
}

# Return the sequence string for this object's range, fetched from the
# database on every call.
sub seq {
  my $self = shift;
  return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
}

# Return a new object covering positions $start..$stop of this object,
# counted from 1 relative to this object's own start.  For a reversed
# object the positions count down from its start.  Calls throw() unless
# $start <= $stop.
sub subseq {
  my $self = shift;
  my ($start,$stop) = @_;
  $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
  return $self->{start} <= $self->{stop} ?  $self->new($self->{db},
                                                       $self->{id},
                                                       $self->{start}+$start-1,
                                                       $self->{start}+$stop-1)
                                         :  $self->new($self->{db},
                                                       $self->{id},
                                                       $self->{start}-($start-1),
                                                       $self->{start}-($stop-1)
                                                      );
}

# Return the sequence id; also used for string overloading.
sub display_id {
  my $self = shift;
  return $self->{id};
}

# No accession number is available; always returns "unknown".
sub accession_number {
  my $self = shift;
  return "unknown";
}

# Return the non-overloaded string form of the object reference.
sub primary_id {
  my $self = shift;
  return overload::StrVal($self);
}

sub can_call_new { return 0 }

# Return the alphabet the database reports for this id.
sub alphabet {
  my $self = shift;
  return $self->{db}->alphabet($self->{id});
}

# Return a new object over the same id with start and stop exchanged.
sub revcom {
  my $self = shift;
  return $self->new(@{$self}{'db','id','stop','start'});
}

# Return the number of positions spanned by this object's own range,
# so that the value agrees with what seq() fetches for sub-ranges and
# reversed ranges.  When start or stop is unset or zero (for example
# the database reported no length for the id) the database's length
# for the id is returned unchanged.
sub length {
  my $self = shift;
  my ($start,$stop) = @{$self}{'start','stop'};
  return $self->{db}->length($self->{id}) unless $start && $stop;
  return abs($stop - $start) + 1;
}

# No description is stored; always returns the empty string.
sub desc {
  my $self = shift;
  return '';
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1033
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1034 #-------------------------------------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1035 # stream-based access to the database
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1036 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
package Bio::DB::Fasta::Stream;
use Tie::Handle;
use vars qw(@ISA);
@ISA = qw(Tie::Handle);
# Add the Bioperl interface class only when it can be loaded.
eval {
  require Bio::DB::SeqI;
} && (push @ISA,'Bio::DB::SeqI');


# Create a stream over database $db, positioned at the database's
# first key.
sub new {
  my $class = shift;
  my $db    = shift;
  my $key = $db->FIRSTKEY;
  return bless { db=>$db,key=>$key },$class;
}

# Return the sequence object for the current key and advance to the
# next one.  Returns nothing once the keys are exhausted.
#
# Keys beginning with "__" are skipped: the database's key iteration
# also yields the "__file_<n>" / "__path_<path>" bookkeeping entries
# kept in the same table, and those are not sequence ids.
sub next_seq {
  my $self = shift;
  my ($key,$db) = @{$self}{'key','db'};

  while (defined $key && $key =~ /^__/) {
    $key = $db->NEXTKEY($key);
  }
  unless (defined $key) {
    $self->{key} = undef;   # stay exhausted on further calls
    return;
  }

  my $value = $db->get_Seq_by_id($key);
  $self->{key} = $db->NEXTKEY($key);
  return $value;
}

# tie() constructor for use as a tied filehandle.
sub TIEHANDLE {
  my $class = shift;
  my $db    = shift;
  return $class->new($db);
}

# Reading a "line" from the tied handle yields the next sequence.
sub READLINE {
  my $self = shift;
  $self->next_seq;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1070
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1071 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1072
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1073 __END__
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1074