Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: OBDAIndex.pm,v 1.12.2.1 2003/06/28 20:47:16 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::DB::Flat::OBDAIndex | |
4 # | |
5 # Cared for by Michele Clamp <michele@sanger.ac.uk>> | |
6 # | |
7 # You may distribute this module under the same terms as perl itself | |
8 | |
9 # POD documentation - main docs before the code | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::DB::Flat::OBDAIndex - Binary search indexing system for sequence files | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 This module can be used both to index sequence files and also to retrieve | |
18 sequences from existing sequence files. | |
19 | |
20 =head2 Index creation | |
21 | |
22 my $sequencefile; # Some fasta sequence file | |
23 | |
24 Patterns have to be entered to define where the keys are to be | |
25 indexed and also where the start of each record. E.g. for fasta | |
26 | |
27 my $start_pattern = "^>"; | |
28 my $primary_pattern = "^>(\\S+)"; | |
29 | |
30 | |
31 So the start of a record is a line starting with a E<gt> and the primary | |
32 key is all characters up to the first space afterf the E<gt> | |
33 | |
34 A string also has to be entered to defined what the primary key | |
35 (primary_namespace) is called. | |
36 | |
37 The index can now be created using | |
38 | |
39 my $index = new Bio::DB::Flat::OBDAIndex( | |
40 -start_pattern => $start_pattern, | |
41 -primary_pattern => $primary_pattern, | |
42 -primary_namespace => "ACC", | |
43 ); | |
44 | |
45 To actually write it out to disk we need to enter a directory where the | |
46 indices will live, a database name and an array of sequence files to index. | |
47 | |
48 my @files = ("file1","file2","file3"); | |
49 | |
50 $index->make_index("/Users/michele/indices","mydatabase",@files); | |
51 | |
52 The index is now ready to use. For large sequence files the perl | |
53 way of indexing takes a *long* time and a *huge* amount of memory. | |
54 For indexing things like dbEST I recommend using the C indexer. | |
55 | |
56 =head2 Creating indices with secondary keys | |
57 | |
58 Sometimes just indexing files with one id per entry is not enough. For | |
59 instance you may want to retrieve sequences from swissprot using | |
60 their accessions as well as their ids. | |
61 | |
62 To be able to do this when creating your index you need to pass in | |
63 a hash of secondary_patterns which have their namespaces as the keys | |
64 to the hash. | |
65 | |
66 e.g. For Indexing something like | |
67 | |
68 ID 1433_CAEEL STANDARD; PRT; 248 AA. | |
69 AC P41932; | |
70 DT 01-NOV-1995 (Rel. 32, Created) | |
71 DT 01-NOV-1995 (Rel. 32, Last sequence update) | |
72 DT 15-DEC-1998 (Rel. 37, Last annotation update) | |
73 DE 14-3-3-LIKE PROTEIN 1. | |
74 GN FTT-1 OR M117.2. | |
75 OS Caenorhabditis elegans. | |
76 OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; | |
77 OC Rhabditidae; Peloderinae; Caenorhabditis. | |
78 OX NCBI_TaxID=6239; | |
79 RN [1] | |
80 | |
81 where we want to index the accession (P41932) as the primary key and the | |
82 id (1433_CAEEL) as the secondary id. The index is created as follows | |
83 | |
84 my %secondary_patterns; | |
85 | |
86 my $start_pattern = "^ID (\\S+)"; | |
87 my $primary_pattern = "^AC (\\S+)\;"; | |
88 | |
89 $secondary_patterns{"ID"} = "^ID (\\S+)"; | |
90 | |
91 my $index = new Bio::DB::Flat::OBDAIndex( | |
92 -start_pattern => $start_pattern, | |
93 -primary_pattern => $primary_pattern, | |
94 -primary_namespace => 'ACC', | |
95 -secondary_patterns => \%secondary_patterns); | |
96 | |
97 $index->make_index("/Users/michele/indices","mydb",($seqfile)); | |
98 | |
99 Of course having secondary indices makes indexing slower and more | |
100 of a memory hog. | |
101 | |
102 | |
103 =head2 Index reading | |
104 | |
105 To fetch sequences using an existing index first of all create your sequence | |
106 object | |
107 | |
108 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_directory, | |
109 -dbname => 'swissprot'); | |
110 | |
111 Now you can happily fetch sequences either by the primary key or | |
112 by the secondary keys. | |
113 | |
114 my $entry = $index->get_entry_by_id('HBA_HUMAN'); | |
115 | |
116 This returns just a string containing the whole entry. This is | |
117 useful is you just want to print the sequence to screen or write it to a file. | |
118 | |
119 Other ways of getting sequences are | |
120 | |
121 my $fh = $index->get_stream_by_id('HBA_HUMAN'); | |
122 | |
123 This can then be passed to a seqio object for output or converting | |
124 into objects. | |
125 | |
126 my $seq = new Bio::SeqIO(-fh => $fh, | |
127 -format => 'fasta'); | |
128 | |
129 The last way is to retrieve a sequence directly. This is the | |
130 slowest way of extracting as the sequence objects need to be made. | |
131 | |
132 my $seq = $index->get_Seq_by_id('HBA_HUMAN'); | |
133 | |
134 To access the secondary indices the secondary namespace needs to be known | |
135 (use $index-E<gt>secondary_namespaces) and the following call used | |
136 | |
137 my $seq = $index->get_Seq_by_secondary('ACC','Q21973'); | |
138 my $fh = $index->get_stream_by_secondary('ACC','Q21973'); | |
139 my $entry = $index->get_entry_by_secondary('ACC','Q21973'); | |
140 | |
141 =head1 DESCRIPTION | |
142 | |
143 This object allows indexing of sequence files both by a primary key | |
144 (say accession) and multiple secondary keys (say ids). This is | |
145 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>) | |
146 which uses DBM files as storage. This module uses a binary search to | |
147 retrieve sequences which is more efficient for large datasets. | |
148 | |
149 | |
150 =head1 FEEDBACK | |
151 | |
152 =head2 Mailing Lists | |
153 | |
154 User feedback is an integral part of the evolution of this and other | |
155 Bioperl modules. Send your comments and suggestions preferably to one | |
156 of the Bioperl mailing lists. Your participation is much appreciated. | |
157 | |
158 bioperl-l@bioperl.org - General discussion | |
159 http://bioperl.org/MailList.shtml - About the mailing lists | |
160 | |
161 =head2 Reporting Bugs | |
162 | |
163 Report bugs to the Bioperl bug tracking system to help us keep track | |
164 the bugs and their resolution. Bug reports can be submitted via | |
165 email or the web: | |
166 | |
167 bioperl-bugs@bio.perl.org | |
168 http://bugzilla.bioperl.org/ | |
169 | |
170 =head1 AUTHOR - Michele Clamp | |
171 | |
172 Email - michele@sanger.ac.uk | |
173 | |
174 =head1 APPENDIX | |
175 | |
176 The rest of the documentation details each of the object methods. Internal | |
177 methods are usually preceded with an "_" (underscore). | |
178 | |
179 =cut | |
180 | |
181 package Bio::DB::Flat::OBDAIndex; | |
182 | |
183 use strict; | |
184 use vars qw(@ISA); | |
185 | |
186 use Fcntl qw(SEEK_END SEEK_CUR); | |
187 # rather than using tell which might be buffered | |
188 sub systell{ sysseek($_[0], 0, SEEK_CUR) } | |
189 sub syseof{ sysseek($_[0], 0, SEEK_END) } | |
190 | |
191 | |
192 use Bio::DB::RandomAccessI; | |
193 use Bio::Root::RootI; | |
194 use Bio::SeqIO; | |
195 use Bio::Seq; | |
196 | |
197 @ISA = qw(Bio::DB::RandomAccessI); | |
198 | |
199 use constant CONFIG_FILE_NAME => 'config.dat'; | |
200 use constant HEADER_SIZE => 4; | |
201 | |
202 my @formats = ['FASTA','SWISSPROT','EMBL']; | |
203 | |
204 =head2 new | |
205 | |
206 Title : new | |
207 Usage : For reading | |
208 my $index = new Bio::DB::Flat::OBDAIndex( | |
209 -index_dir => '/Users/michele/indices/', | |
210 -dbname => 'dbEST', | |
211 -format => 'fasta'); | |
212 | |
213 For writing | |
214 | |
215 my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"} | |
216 my $index = new Bio::DB::Flat::OBDAIndex( | |
217 -index_dir => '/Users/michele/indices', | |
218 -primary_pattern => "^>(\\S+)", | |
219 -secondary_patterns => \%secondary_patterns, | |
220 -primary_namespace => "ID"); | |
221 | |
222 my @files = ('file1','file2','file3'); | |
223 | |
224 $index->make_index('mydbname',@files); | |
225 | |
226 | |
227 Function: create a new Bio::DB::Flat::OBDAIndex object | |
228 Returns : new Bio::DB::Flat::OBDAIndex | |
229 Args : -index_dir Directory containing the indices | |
230 -primary_pattern Regexp defining the primary id | |
231 -secondary_patterns A hash ref containing the secondary | |
232 patterns with the namespaces as keys | |
233 -primary_namespace A string defining what the primary key | |
234 is | |
235 | |
236 Status : Public | |
237 | |
238 =cut | |
239 | |
240 sub new { | |
241 my($class, @args) = @_; | |
242 | |
243 my $self = $class->SUPER::new(@args); | |
244 | |
245 bless $self, $class; | |
246 | |
247 my ($index_dir,$dbname,$format,$primary_pattern,$primary_namespace, | |
248 $start_pattern,$secondary_patterns) = | |
249 $self->_rearrange([qw(INDEX_DIR | |
250 DBNAME | |
251 FORMAT | |
252 PRIMARY_PATTERN | |
253 PRIMARY_NAMESPACE | |
254 START_PATTERN | |
255 SECONDARY_PATTERNS)], @args); | |
256 | |
257 $self->index_directory($index_dir); | |
258 $self->database_name ($dbname); | |
259 | |
260 if ($self->index_directory && $dbname) { | |
261 | |
262 $self->read_config_file; | |
263 | |
264 my $fh = $self->primary_index_filehandle; | |
265 my $record_width = $self->read_header($fh); | |
266 | |
267 $self->record_size($record_width); | |
268 } | |
269 | |
270 | |
271 $self->format ($format); | |
272 $self->primary_pattern ($primary_pattern); | |
273 $self->primary_namespace ($primary_namespace); | |
274 $self->start_pattern ($start_pattern); | |
275 $self->secondary_patterns($secondary_patterns); | |
276 | |
277 return $self; | |
278 } | |
279 | |
280 sub new_from_registry { | |
281 my ($self,%config) = @_; | |
282 | |
283 my $dbname = $config{'dbname'}; | |
284 my $location = $config{'location'}; | |
285 | |
286 my $index = new Bio::DB::Flat::OBDAIndex(-dbname => $dbname, | |
287 -index_dir => $location, | |
288 ); | |
289 } | |
290 | |
291 =head2 get_Seq_by_id | |
292 | |
293 Title : get_Seq_by_id | |
294 Usage : $obj->get_Seq_by_id($newval) | |
295 Function: | |
296 Example : | |
297 Returns : value of get_Seq_by_id | |
298 Args : newvalue (optional) | |
299 | |
300 =cut | |
301 | |
302 sub get_Seq_by_id { | |
303 my ($self,$id) = @_; | |
304 | |
305 my ($fh,$length) = $self->get_stream_by_id($id); | |
306 | |
307 if (!defined($self->format)) { | |
308 $self->throw("Can't create sequence - format is not defined"); | |
309 } | |
310 | |
311 if(!$fh){ | |
312 return; | |
313 } | |
314 if (!defined($self->{_seqio})) { | |
315 | |
316 $self->{_seqio} = new Bio::SeqIO(-fh => $fh, | |
317 -format => $self->format); | |
318 } else { | |
319 | |
320 $self->{_seqio}->fh($fh); | |
321 } | |
322 | |
323 return $self->{_seqio}->next_seq; | |
324 | |
325 } | |
326 | |
327 =head2 get_entry_by_id | |
328 | |
329 Title : get_entry_by_id | |
330 Usage : $obj->get_entry_by_id($newval) | |
331 Function: | |
332 Example : | |
333 Returns : | |
334 Args : | |
335 | |
336 | |
337 =cut | |
338 | |
339 sub get_entry_by_id { | |
340 my ($self,$id) = @_; | |
341 | |
342 my ($fh,$length) = $self->get_stream_by_id($id); | |
343 | |
344 my $entry; | |
345 | |
346 sysread($fh,$entry,$length); | |
347 | |
348 return $entry; | |
349 } | |
350 | |
351 | |
352 =head2 get_stream_by_id | |
353 | |
354 Title : get_stream_by_id | |
355 Usage : $obj->get_stream_by_id($newval) | |
356 Function: | |
357 Example : | |
358 Returns : value of get_stream_by_id | |
359 Args : newvalue (optional) | |
360 | |
361 | |
362 =cut | |
363 | |
364 sub get_stream_by_id { | |
365 my ($self,$id) = @_; | |
366 | |
367 my $indexfh = $self->primary_index_filehandle; | |
368 | |
369 syseof ($indexfh); | |
370 | |
371 my $filesize = systell($indexfh); | |
372 | |
373 my $end = ($filesize-$self->{_start_pos})/$self->record_size; | |
374 | |
375 my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size); | |
376 | |
377 | |
378 my ($fileid,$pos,$length) = split(/\t/,$rest); | |
379 | |
380 #print STDERR "OBDAIndex Found id entry $newid $fileid $pos $length:$rest\n"; | |
381 | |
382 if (!$newid) { | |
383 return; | |
384 } | |
385 | |
386 my $fh = $self->get_filehandle_by_fileid($fileid); | |
387 my $file = $self->{_file}{$fileid}; | |
388 | |
389 open (IN,"<$file"); | |
390 $fh = \*IN; | |
391 | |
392 my $entry; | |
393 | |
394 sysseek($fh,$pos,0); | |
395 | |
396 return ($fh,$length); | |
397 } | |
398 | |
399 =head2 get_Seq_by_acc | |
400 | |
401 Title : get_Seq_by_acc | |
402 Usage : $obj->get_Seq_by_acc($newval) | |
403 Function: | |
404 Example : | |
405 Returns : value of get_Seq_by_acc | |
406 Args : newvalue (optional) | |
407 | |
408 | |
409 =cut | |
410 | |
411 sub get_Seq_by_acc { | |
412 my ($self,$acc) = @_; | |
413 | |
414 if ($self->primary_namespace eq "ACC") { | |
415 return $self->get_Seq_by_id($acc); | |
416 } else { | |
417 return $self->get_Seq_by_secondary("ACC",$acc); | |
418 } | |
419 } | |
420 | |
421 =head2 get_Seq_by_secondary | |
422 | |
423 Title : get_Seq_by_secondary | |
424 Usage : $obj->get_Seq_by_secondary($newval) | |
425 Function: | |
426 Example : | |
427 Returns : value of get_Seq_by_secondary | |
428 Args : newvalue (optional) | |
429 | |
430 | |
431 =cut | |
432 | |
433 sub get_Seq_by_secondary { | |
434 my ($self,$name,$id) = @_; | |
435 | |
436 my @names = $self->secondary_namespaces; | |
437 | |
438 my $found = 0; | |
439 foreach my $tmpname (@names) { | |
440 if ($name eq $tmpname) { | |
441 $found = 1; | |
442 } | |
443 } | |
444 | |
445 if ($found == 0) { | |
446 $self->throw("Secondary index for $name doesn't exist\n"); | |
447 } | |
448 | |
449 my $fh = $self->open_secondary_index($name); | |
450 | |
451 syseof ($fh); | |
452 | |
453 my $filesize = systell($fh); | |
454 | |
455 my $recsize = $self->{_secondary_record_size}{$name}; | |
456 # print "Name " . $recsize . "\n"; | |
457 | |
458 my $end = ($filesize-$self->{_start_pos})/$recsize; | |
459 | |
460 # print "End $end $filesize\n"; | |
461 | |
462 my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize); | |
463 | |
464 sysseek($fh,$pos,0); | |
465 | |
466 # print "Found new id $newid $primary_id\n"; | |
467 # We now need to shuffle up the index file to find the top secondary entry | |
468 | |
469 my $record = $newid; | |
470 | |
471 while ($record =~ /^$newid/ && $pos >= 0) { | |
472 | |
473 $record = $self->read_record($fh,$pos,$recsize); | |
474 $pos = $pos - $recsize; | |
475 # print "Up record = $record:$newid\n"; | |
476 } | |
477 | |
478 $pos += $recsize; | |
479 | |
480 # print "Top position is $pos\n"; | |
481 | |
482 # Now we have to shuffle back down again to read all the secondary entries | |
483 | |
484 my $current_id = $newid; | |
485 my %primary_id; | |
486 | |
487 $primary_id{$primary_id} = 1; | |
488 | |
489 while ($current_id eq $newid) { | |
490 $record = $self->read_record($fh,$pos,$recsize); | |
491 print "Record is :$record:\n"; | |
492 my ($secid,$primary_id) = split(/\t/,$record,2); | |
493 $current_id = $secid; | |
494 | |
495 if ($current_id eq $newid) { | |
496 $primary_id =~ s/ //g; | |
497 # print "Primary $primary_id\n"; | |
498 $primary_id{$primary_id} = 1; | |
499 | |
500 $pos = $pos + $recsize; | |
501 # print "Down record = $record\n"; | |
502 } | |
503 } | |
504 | |
505 if (!defined($newid)) { | |
506 return; | |
507 } | |
508 | |
509 my $entry; | |
510 | |
511 foreach my $id (keys %primary_id) { | |
512 $entry .= $self->get_Seq_by_id($id); | |
513 } | |
514 return $entry; | |
515 | |
516 } | |
517 | |
518 =head2 read_header | |
519 | |
520 Title : read_header | |
521 Usage : $obj->read_header($newval) | |
522 Function: | |
523 Example : | |
524 Returns : value of read_header | |
525 Args : newvalue (optional) | |
526 | |
527 | |
528 =cut | |
529 | |
530 sub read_header { | |
531 my ($self,$fh) = @_; | |
532 | |
533 my $record_width; | |
534 | |
535 sysread($fh,$record_width,HEADER_SIZE); | |
536 | |
537 $self->{_start_pos} = HEADER_SIZE; | |
538 $record_width =~ s/ //g; | |
539 $record_width = $record_width * 1; | |
540 | |
541 return $record_width; | |
542 } | |
543 | |
544 =head2 read_record | |
545 | |
546 Title : read_record | |
547 Usage : $obj->read_record($newval) | |
548 Function: | |
549 Example : | |
550 Returns : value of read_record | |
551 Args : newvalue (optional) | |
552 | |
553 | |
554 =cut | |
555 | |
556 sub read_record { | |
557 my ($self,$fh,$pos,$len) = @_; | |
558 | |
559 sysseek($fh,$pos,0); | |
560 | |
561 my $record; | |
562 | |
563 sysread($fh,$record,$len); | |
564 | |
565 return $record; | |
566 | |
567 } | |
568 | |
569 | |
570 =head2 find_entry | |
571 | |
572 Title : find_entry | |
573 Usage : $obj->find_entry($newval) | |
574 Function: | |
575 Example : | |
576 Returns : value of find_entry | |
577 Args : newvalue (optional) | |
578 | |
579 | |
580 =cut | |
581 | |
582 sub find_entry { | |
583 my ($self,$fh,$start,$end,$id,$recsize) = @_; | |
584 | |
585 my $mid = int(($end+1+$start)/2); | |
586 my $pos = ($mid-1)*$recsize + $self->{_start_pos}; | |
587 | |
588 my ($record) = $self->read_record($fh,$pos,$recsize); | |
589 my ($entryid,$rest) = split(/\t/,$record,2); | |
590 | |
591 # print "Mid $recsize $mid $pos:$entryid:$rest:$record\n"; | |
592 # print "Entry :$id:$entryid:$rest\n"; | |
593 | |
594 | |
595 my ($first,$second) = sort { $a cmp $b} ($id,$entryid); | |
596 | |
597 if ($id eq $entryid) { | |
598 | |
599 return ($id,$rest,$pos-$recsize); | |
600 | |
601 } elsif ($first eq $id) { | |
602 | |
603 if ($end-$start <= 1) { | |
604 return; | |
605 } | |
606 my $end = $mid; | |
607 # print "Moving up $entryid $id\n"; | |
608 $self->find_entry($fh,$start,$end,$id,$recsize); | |
609 | |
610 } elsif ($second eq $id ) { | |
611 # print "Moving down $entryid $id\n"; | |
612 if ($end-$start <= 1) { | |
613 return; | |
614 } | |
615 | |
616 $start = $mid; | |
617 | |
618 $self->find_entry($fh,$start,$end,$id,$recsize); | |
619 } | |
620 | |
621 } | |
622 | |
623 | |
624 =head2 make_index | |
625 | |
626 Title : make_index | |
627 Usage : $obj->make_index($newval) | |
628 Function: | |
629 Example : | |
630 Returns : value of make_index | |
631 Args : newvalue (optional) | |
632 | |
633 | |
634 =cut | |
635 | |
636 sub make_index { | |
637 my ($self,$dbname,@files) = @_;; | |
638 | |
639 my $rootdir = $self->index_directory; | |
640 | |
641 if (!defined($rootdir)) { | |
642 $self->throw("No index directory set - can't build indices"); | |
643 } | |
644 | |
645 if (! -d $rootdir) { | |
646 $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices"); | |
647 } | |
648 if (!(@files)) { | |
649 $self->throw("Must enter an array of filenames to index"); | |
650 } | |
651 | |
652 if (!defined($dbname)) { | |
653 $self->throw("Must enter an index name for your files"); | |
654 } | |
655 | |
656 my $pwd = `pwd`; chomp($pwd); | |
657 | |
658 foreach my $file (@files) { | |
659 if ($file !~ /^\//) { | |
660 $file = $pwd . "/$file"; | |
661 } | |
662 if (! -e $file) { | |
663 $self->throw("Can't index file [$file] as it doesn't exist"); | |
664 } | |
665 } | |
666 | |
667 $self->database_name($dbname); | |
668 $self->make_indexdir($rootdir);; | |
669 $self->make_config_file(\@files); | |
670 | |
671 # Finally lets index | |
672 foreach my $file (@files) { | |
673 $self->_index_file($file); | |
674 } | |
675 | |
676 # And finally write out the indices | |
677 $self->write_primary_index; | |
678 $self->write_secondary_indices; | |
679 } | |
680 | |
681 =head2 _index_file | |
682 | |
683 Title : _index_file | |
684 Usage : $obj->_index_file($newval) | |
685 Function: | |
686 Example : | |
687 Returns : value of _index_file | |
688 Args : newvalue (optional) | |
689 | |
690 | |
691 =cut | |
692 | |
693 sub _index_file { | |
694 my ($self,$file) = @_; | |
695 | |
696 open(FILE,"<$file") || $self->throw("Can't open file [$file]"); | |
697 | |
698 my $recstart = 0; | |
699 my $fileid = $self->get_fileid_by_filename($file); | |
700 my $found = 0; | |
701 my $id; | |
702 my $count; | |
703 | |
704 my $primary = $self->primary_pattern; | |
705 my $start_pattern = $self->start_pattern; | |
706 | |
707 my $pos = 0; | |
708 | |
709 my $new_primary_entry; | |
710 | |
711 my $length; | |
712 #my $pos = 0; | |
713 my $fh = \*FILE; | |
714 | |
715 my $done = -1; | |
716 | |
717 my @secondary_names = $self->secondary_namespaces; | |
718 my %secondary_id; | |
719 | |
720 while (<$fh>) { | |
721 if ($_ =~ /$start_pattern/) { | |
722 if ($done == 0) { | |
723 $id = $new_primary_entry; | |
724 | |
725 my $tmplen = tell($fh) - length($_); | |
726 | |
727 $length = $tmplen - $pos; | |
728 | |
729 if (!defined($id)) { | |
730 $self->throw("No id defined for sequence"); | |
731 } | |
732 if (!defined($fileid)) { | |
733 $self->throw("No fileid defined for file $file"); | |
734 } | |
735 if (!defined($pos)) { | |
736 $self->throw("No position defined for " . $id . "\n"); | |
737 } | |
738 if (!defined($length)) { | |
739 $self->throw("No length defined for " . $id . "\n"); | |
740 } | |
741 | |
742 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id); | |
743 | |
744 $pos = $tmplen; | |
745 | |
746 if ($count%1000 == 0) { | |
747 print STDERR "Indexed $count ids\n"; | |
748 } | |
749 | |
750 $count++; | |
751 } else { | |
752 $done = 0; | |
753 } | |
754 } | |
755 | |
756 if ($_ =~ /$primary/) { | |
757 $new_primary_entry = $1; | |
758 } | |
759 | |
760 my $secondary_patterns = $self->secondary_patterns; | |
761 | |
762 foreach my $sec (@secondary_names) { | |
763 my $pattern = $secondary_patterns->{$sec}; | |
764 | |
765 if ($_ =~ /$pattern/) { | |
766 $secondary_id{$sec} = $1; | |
767 } | |
768 } | |
769 | |
770 } | |
771 | |
772 # Remeber to add in the last one | |
773 | |
774 $id = $new_primary_entry; | |
775 | |
776 my $tmplen = tell($fh) - length($_); | |
777 | |
778 $length = $tmplen - $pos; | |
779 | |
780 if (!defined($id)) { | |
781 $self->throw("No id defined for sequence"); | |
782 } | |
783 if (!defined($fileid)) { | |
784 $self->throw("No fileid defined for file $file"); | |
785 } | |
786 if (!defined($pos)) { | |
787 $self->throw("No position defined for " . $id . "\n"); | |
788 } | |
789 if (!defined($length)) { | |
790 $self->throw("No length defined for " . $id . "\n"); | |
791 } | |
792 | |
793 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id); | |
794 | |
795 close(FILE); | |
796 } | |
797 | |
798 =head2 write_primary_index | |
799 | |
800 Title : write_primary_index | |
801 Usage : $obj->write_primary_index($newval) | |
802 Function: | |
803 Example : | |
804 Returns : value of write_primary_index | |
805 Args : newvalue (optional) | |
806 | |
807 | |
808 =cut | |
809 | |
810 sub write_primary_index { | |
811 my ($self) = @_; | |
812 | |
813 my @ids = keys %{$self->{_id}}; | |
814 | |
815 @ids = sort {$a cmp $b} @ids; | |
816 | |
817 print STDERR "Number of ids = " . scalar(@ids) . "\n"; | |
818 | |
819 open (INDEX,">" . $self->primary_index_file) || $self->throw("Can't open primary index file [" . $self->primary_index_file . "]"); | |
820 | |
821 my $recordlength = $self->{_maxidlength} + | |
822 $self->{_maxfileidlength} + | |
823 $self->{_maxposlength} + | |
824 $self->{_maxlengthlength} + 3; | |
825 | |
826 | |
827 print INDEX sprintf("%4d",$recordlength); | |
828 | |
829 foreach my $id (@ids) { | |
830 | |
831 if (!defined($self->{_id}{$id}{_fileid})) { | |
832 $self->throw("No fileid for $id\n"); | |
833 } | |
834 if (!defined($self->{_id}{$id}{_pos})) { | |
835 $self->throw("No position for $id\n"); | |
836 } | |
837 if (!defined($self->{_id}{$id}{_length})) { | |
838 $self->throw("No length for $id"); | |
839 } | |
840 | |
841 my $record = $id . "\t" . | |
842 $self->{_id}{$id}{_fileid} . "\t" . | |
843 $self->{_id}{$id}{_pos} . "\t" . | |
844 $self->{_id}{$id}{_length}; | |
845 | |
846 print INDEX sprintf("%-${recordlength}s",$record); | |
847 | |
848 } | |
849 close(INDEX); | |
850 } | |
851 | |
852 =head2 write_secondary_indices | |
853 | |
854 Title : write_secondary_indices | |
855 Usage : $obj->write_secondary_indices($newval) | |
856 Function: | |
857 Example : | |
858 Returns : value of write_secondary_indices | |
859 Args : newvalue (optional) | |
860 | |
861 | |
862 =cut | |
863 | |
864 sub write_secondary_indices { | |
865 my ($self) = @_; | |
866 | |
867 # These are the different | |
868 my @names = keys (%{$self->{_secondary_id}}); | |
869 | |
870 | |
871 foreach my $name (@names) { | |
872 | |
873 my @seconds = keys %{$self->{_secondary_id}{$name}}; | |
874 | |
875 # First we need to loop over to get the longest record. | |
876 my $length = 0; | |
877 | |
878 foreach my $second (@seconds) { | |
879 my $tmplen = length($second) + 1; | |
880 my @prims = keys %{$self->{_secondary_id}{$name}{$second}}; | |
881 | |
882 foreach my $prim (@prims) { | |
883 my $recordlen = $tmplen + length($prim); | |
884 | |
885 if ($recordlen > $length) { | |
886 $length = $recordlen; | |
887 } | |
888 } | |
889 } | |
890 | |
891 # Now we can print the index | |
892 | |
893 my $fh = $self->new_secondary_filehandle($name); | |
894 | |
895 print $fh sprintf("%4d",$length); | |
896 @seconds = sort @seconds; | |
897 | |
898 foreach my $second (@seconds) { | |
899 | |
900 my @prims = keys %{$self->{_secondary_id}{$name}{$second}}; | |
901 my $tmp = $second; | |
902 | |
903 foreach my $prim (@prims) { | |
904 my $record = $tmp . "\t" . $prim; | |
905 if (length($record) > $length) { | |
906 $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n"); | |
907 } else { | |
908 print $fh sprintf("%-${length}s",$record); | |
909 print $fh sprintf("%-${length}s",$record); | |
910 } | |
911 } | |
912 } | |
913 | |
914 close($fh); | |
915 } | |
916 } | |
917 | |
918 =head2 new_secondary_filehandle | |
919 | |
920 Title : new_secondary_filehandle | |
921 Usage : $obj->new_secondary_filehandle($newval) | |
922 Function: | |
923 Example : | |
924 Returns : value of new_secondary_filehandle | |
925 Args : newvalue (optional) | |
926 | |
927 | |
928 =cut | |
929 | |
930 sub new_secondary_filehandle { | |
931 my ($self,$name) = @_; | |
932 | |
933 my $indexdir = $self->index_directory; | |
934 | |
935 my $secindex = $indexdir . $self->database_name . "/id_$name.index"; | |
936 | |
937 my $fh = new FileHandle(">$secindex"); | |
938 | |
939 return $fh; | |
940 } | |
941 | |
942 =head2 open_secondary_index | |
943 | |
944 Title : open_secondary_index | |
945 Usage : $obj->open_secondary_index($newval) | |
946 Function: | |
947 Example : | |
948 Returns : value of open_secondary_index | |
949 Args : newvalue (optional) | |
950 | |
951 | |
952 =cut | |
953 | |
954 sub open_secondary_index { | |
955 my ($self,$name) = @_; | |
956 | |
957 if (!defined($self->{_secondary_filehandle}{$name})) { | |
958 | |
959 my $indexdir = $self->index_directory; | |
960 my $secindex = $indexdir . $self->database_name . "/id_$name.index"; | |
961 | |
962 if (! -e $secindex) { | |
963 $self->throw("Index is not present for namespace [$name]\n"); | |
964 } | |
965 | |
966 my $newfh = new FileHandle("<$secindex"); | |
967 my $reclen = $self->read_header($newfh); | |
968 | |
969 $self->{_secondary_filehandle} {$name} = $newfh; | |
970 $self->{_secondary_record_size}{$name} = $reclen; | |
971 } | |
972 | |
973 return $self->{_secondary_filehandle}{$name}; | |
974 | |
975 } | |
976 | |
977 =head2 _add_id_position | |
978 | |
979 Title : _add_id_position | |
980 Usage : $obj->_add_id_position($newval) | |
981 Function: | |
982 Example : | |
983 Returns : value of _add_id_position | |
984 Args : newvalue (optional) | |
985 | |
986 | |
987 =cut | |
988 | |
989 sub _add_id_position { | |
990 my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_; | |
991 | |
992 if (!defined($id)) { | |
993 $self->throw("No id defined. Can't add id position"); | |
994 } | |
995 if (!defined($pos)) { | |
996 v $self->throw("No position defined. Can't add id position"); | |
997 } | |
998 if (!defined($fileid)) { | |
999 $self->throw("No fileid defined. Can't add id position"); | |
1000 } | |
1001 if (!defined($length) || $length <= 0) { | |
1002 $self->throw("No length defined or <= 0 [$length]. Can't add id position"); | |
1003 } | |
1004 | |
1005 $self->{_id}{$id}{_pos} = $pos; | |
1006 $self->{_id}{$id}{_length} = $length; | |
1007 $self->{_id}{$id}{_fileid} = $fileid; | |
1008 | |
1009 # Now the secondary ids | |
1010 | |
1011 foreach my $sec (keys (%$secondary_id)) { | |
1012 my $value = $secondary_id->{$sec}; | |
1013 | |
1014 $self->{_secondary_id}{$sec}{$value}{$id} = 1; | |
1015 } | |
1016 | |
1017 if (length($id) >= $self->{_maxidlength}) { | |
1018 $self->{_maxidlength} = length($id); | |
1019 } | |
1020 | |
1021 if (length($fileid) >= $self->{_maxfileidlength}) { | |
1022 $self->{_maxfileidlength} = length($fileid); | |
1023 } | |
1024 | |
1025 if (length($pos) >= $self->{_maxposlength}) { | |
1026 $self->{_maxposlength} = length($pos); | |
1027 } | |
1028 | |
1029 if (length($length) >= $self->{_maxlengthlength}) { | |
1030 $self->{_maxlengthlength} = length($length); | |
1031 } | |
1032 } | |
1033 | |
1034 =head2 make_indexdir | |
1035 | |
1036 Title : make_indexdir | |
1037 Usage : $obj->make_indexdir($newval) | |
1038 Function: | |
1039 Example : | |
1040 Returns : value of make_indexdir | |
1041 Args : newvalue (optional) | |
1042 | |
1043 | |
1044 =cut | |
1045 | |
1046 sub make_indexdir { | |
1047 my ($self,$rootdir) = @_; | |
1048 | |
1049 if (!defined($rootdir)) { | |
1050 $self->throw("Must enter an index directory name for make_indexdir"); | |
1051 } | |
1052 if (! -e $rootdir) { | |
1053 $self->throw("Root index directory [$rootdir] doesn't exist"); | |
1054 } | |
1055 | |
1056 if (! -d $rootdir) { | |
1057 $self->throw("[$rootdir] exists but is not a directory"); | |
1058 } | |
1059 | |
1060 if ($rootdir !~ /\/$/) { | |
1061 $rootdir .= "/"; | |
1062 } | |
1063 | |
1064 my $indexdir = $rootdir . $self->database_name; | |
1065 | |
1066 if (! -e $indexdir) { | |
1067 mkdir $indexdir,0755; | |
1068 } else { | |
1069 $self->throw("Index directory " . $indexdir . " already exists. Exiting\n"); | |
1070 } | |
1071 | |
1072 } | |
1073 | |
1074 =head2 make_config_file | |
1075 | |
1076 Title : make_config_file | |
1077 Usage : $obj->make_config_file($newval) | |
1078 Function: | |
1079 Example : | |
1080 Returns : value of make_config_file | |
1081 Args : newvalue (optional) | |
1082 | |
1083 =cut | |
1084 | |
1085 sub make_config_file { | |
1086 my ($self,$files) = @_; | |
1087 | |
1088 my @files = @$files; | |
1089 | |
1090 my $dir = $self->index_directory; | |
1091 | |
1092 my $configfile = $dir . $self->database_name . "/" .CONFIG_FILE_NAME; | |
1093 | |
1094 open(CON,">$configfile") || $self->throw("Can't create config file [$configfile]"); | |
1095 | |
1096 # First line must be the type of index - in this case flat | |
1097 | |
1098 print CON "index\tflat/1\n"; | |
1099 | |
1100 # Now the fileids | |
1101 | |
1102 my $count = 0; | |
1103 | |
1104 foreach my $file (@files) { | |
1105 | |
1106 my $size = -s $file; | |
1107 | |
1108 print CON "fileid_$count\t$file\t$size\n"; | |
1109 | |
1110 my $fh = new FileHandle("<$file"); | |
1111 $self->{_fileid}{$count} = $fh; | |
1112 $self->{_file} {$count} = $file; | |
1113 $self->{_dbfile}{$file} = $count; | |
1114 $self->{_size}{$count} = $size; | |
1115 | |
1116 $count++; | |
1117 } | |
1118 | |
1119 # Now the namespaces | |
1120 | |
1121 print CON "primary_namespace\t" .$self->primary_namespace. "\n"; | |
1122 | |
1123 # Needs fixing for the secondary stuff | |
1124 | |
1125 my $second_patterns = $self->secondary_patterns; | |
1126 | |
1127 my @second = keys %$second_patterns; | |
1128 | |
1129 if ((@second)) { | |
1130 print CON "secondary_namespaces"; | |
1131 | |
1132 foreach my $second (@second) { | |
1133 print CON "\t$second"; | |
1134 } | |
1135 print CON "\n"; | |
1136 } | |
1137 | |
1138 # Now the config format | |
1139 | |
1140 if (!defined($self->format)) { | |
1141 $self->throw("Format does not exist in module - can't write config file"); | |
1142 } else { | |
1143 print CON "format\t" . $self->format . "\n"; | |
1144 } | |
1145 | |
1146 | |
1147 close(CON); | |
1148 } | |
1149 | |
1150 =head2 read_config_file | |
1151 | |
1152 Title : read_config_file | |
1153 Usage : $obj->read_config_file($newval) | |
1154 Function: | |
1155 Example : | |
1156 Returns : value of read_config_file | |
1157 Args : newvalue (optional) | |
1158 | |
1159 | |
1160 =cut | |
1161 | |
1162 sub read_config_file { | |
1163 my ($self) = @_; | |
1164 | |
1165 my $dir = $self->index_directory . $self->database_name . "/";; | |
1166 | |
1167 if (! -d $dir) { | |
1168 $self->throw("No index directory [" . $dir . "]. Can't read ". CONFIG_FILE_NAME); | |
1169 } | |
1170 | |
1171 my $configfile = $dir . CONFIG_FILE_NAME; | |
1172 | |
1173 if (! -e $configfile) { | |
1174 $self->throw("No config file [$configfile]. Can't read namespace"); | |
1175 } | |
1176 | |
1177 open(CON,"<$configfile") || $self->throw("Can't open configfile [$configfile]"); | |
1178 | |
1179 # First line must be type | |
1180 | |
1181 my $line = <CON>; chomp($line); | |
1182 my $version; | |
1183 | |
1184 # This is hard coded as we only index flatfiles here | |
1185 if ($line =~ /index\tflat\/(\d+)/) { | |
1186 $version = $1; | |
1187 } else { | |
1188 $self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"); | |
1189 } | |
1190 | |
1191 $self->index_type("flat"); | |
1192 $self->index_version($version); | |
1193 | |
1194 while (<CON>) { | |
1195 chomp; | |
1196 | |
1197 # Look for fileid lines | |
1198 if ($_ =~ /^fileid_(\d+)\t(\S+)\t(\d+)/) { | |
1199 my $fileid = $1; | |
1200 my $filename = $2; | |
1201 my $filesize = $3; | |
1202 | |
1203 if (! -e $filename) { | |
1204 $self->throw("File [$filename] does not exist!"); | |
1205 } | |
1206 if (-s $filename != $filesize) { | |
1207 $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize . "]"); | |
1208 } | |
1209 | |
1210 my $fh = new FileHandle("<$filename"); | |
1211 | |
1212 $self->{_fileid}{$fileid} = $fh; | |
1213 $self->{_file} {$fileid} = $filename; | |
1214 $self->{_dbfile}{$filename} = $fileid; | |
1215 $self->{_size} {$fileid} = $filesize; | |
1216 | |
1217 } | |
1218 | |
1219 # Look for namespace lines | |
1220 if ($_ =~ /(.*)_namespace.*\t(\S+)/) { | |
1221 if ($1 eq "primary") { | |
1222 $self->primary_namespace($2); | |
1223 } elsif ($1 eq "secondary") { | |
1224 $self->secondary_namespaces($2); | |
1225 } else { | |
1226 $self->throw("Unknown namespace name in config file [$1"); | |
1227 } | |
1228 } | |
1229 | |
1230 # Look for format lines | |
1231 | |
1232 if ($_ =~ /format\t(\S+)/) { | |
1233 | |
1234 # Check the format here? | |
1235 | |
1236 $self->format($1); | |
1237 } | |
1238 } | |
1239 close(CON); | |
1240 | |
1241 # Now check we have all that we need | |
1242 | |
1243 my @fileid_keys = keys (%{$self->{_fileid}}); | |
1244 | |
1245 if (!(@fileid_keys)) { | |
1246 $self->throw("No flatfile fileid files in config - check the index has been made correctly"); | |
1247 } | |
1248 | |
1249 if (!defined($self->primary_namespace)) { | |
1250 $self->throw("No primary namespace exists"); | |
1251 } | |
1252 | |
1253 if (! -e $self->primary_index_file) { | |
1254 $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist"); | |
1255 } | |
1256 } | |
1257 | |
1258 =head2 get_fileid_by_filename | |
1259 | |
1260 Title : get_fileid_by_filename | |
1261 Usage : $obj->get_fileid_by_filename($newval) | |
1262 Function: | |
1263 Example : | |
1264 Returns : value of get_fileid_by_filename | |
1265 Args : newvalue (optional) | |
1266 | |
1267 | |
1268 =cut | |
1269 | |
1270 sub get_fileid_by_filename { | |
1271 my ($self,$file) = @_; | |
1272 | |
1273 if (!defined($self->{_dbfile})) { | |
1274 $self->throw("No file to fileid mapping present. Has the fileid file been read?"); | |
1275 } | |
1276 | |
1277 | |
1278 return $self->{_dbfile}{$file}; | |
1279 } | |
1280 | |
1281 =head2 get_filehandle_by_fileid | |
1282 | |
1283 Title : get_filehandle_by_fileid | |
1284 Usage : $obj->get_filehandle_by_fileid($newval) | |
1285 Function: | |
1286 Example : | |
1287 Returns : value of get_filehandle_by_fileid | |
1288 Args : newvalue (optional) | |
1289 | |
1290 | |
1291 =cut | |
1292 | |
1293 sub get_filehandle_by_fileid { | |
1294 my ($self,$fileid) = @_; | |
1295 | |
1296 if (!defined($self->{_fileid}{$fileid})) { | |
1297 $self->throw("ERROR: undefined fileid in index [$fileid]"); | |
1298 } | |
1299 | |
1300 return $self->{_fileid}{$fileid}; | |
1301 } | |
1302 | |
1303 =head2 primary_index_file | |
1304 | |
1305 Title : primary_index_file | |
1306 Usage : $obj->primary_index_file($newval) | |
1307 Function: | |
1308 Example : | |
1309 Returns : value of primary_index_file | |
1310 Args : newvalue (optional) | |
1311 | |
1312 | |
1313 =cut | |
1314 | |
1315 sub primary_index_file { | |
1316 my ($self) = @_; | |
1317 | |
1318 return $self->index_directory . $self->database_name . "/key_" . $self->primary_namespace . ".key"; | |
1319 } | |
1320 | |
1321 =head2 primary_index_filehandle | |
1322 | |
1323 Title : primary_index_filehandle | |
1324 Usage : $obj->primary_index_filehandle($newval) | |
1325 Function: | |
1326 Example : | |
1327 Returns : value of primary_index_filehandle | |
1328 Args : newvalue (optional) | |
1329 | |
1330 | |
1331 =cut | |
1332 | |
1333 sub primary_index_filehandle { | |
1334 my ($self) = @_; | |
1335 | |
1336 if (!defined ($self->{_primary_index_handle})) { | |
1337 $self->{_primary_index_handle} = new FileHandle("<" . $self->primary_index_file); | |
1338 } | |
1339 return $self->{_primary_index_handle}; | |
1340 } | |
1341 | |
1342 =head2 database_name | |
1343 | |
1344 Title : database_name | |
1345 Usage : $obj->database_name($newval) | |
1346 Function: | |
1347 Example : | |
1348 Returns : value of database_name | |
1349 Args : newvalue (optional) | |
1350 | |
1351 | |
1352 =cut | |
1353 | |
1354 | |
1355 sub database_name { | |
1356 my ($self,$arg) = @_; | |
1357 | |
1358 if (defined($arg)) { | |
1359 $self->{_database_name} = $arg; | |
1360 } | |
1361 return $self->{_database_name}; | |
1362 | |
1363 } | |
1364 | |
1365 =head2 format | |
1366 | |
1367 Title : format | |
1368 Usage : $obj->format($newval) | |
1369 Function: | |
1370 Example : | |
1371 Returns : value of format | |
1372 Args : newvalue (optional) | |
1373 | |
1374 | |
1375 =cut | |
1376 | |
1377 sub format{ | |
1378 my ($obj,$value) = @_; | |
1379 if( defined $value) { | |
1380 $obj->{'format'} = $value; | |
1381 } | |
1382 return $obj->{'format'}; | |
1383 | |
1384 } | |
1385 | |
1386 =head2 index_directory | |
1387 | |
1388 Title : index_directory | |
1389 Usage : $obj->index_directory($newval) | |
1390 Function: | |
1391 Example : | |
1392 Returns : value of index_directory | |
1393 Args : newvalue (optional) | |
1394 | |
1395 | |
1396 =cut | |
1397 | |
1398 sub index_directory { | |
1399 my ($self,$arg) = @_; | |
1400 | |
1401 if (defined($arg)) { | |
1402 if ($arg !~ /\/$/) { | |
1403 $arg .= "/"; | |
1404 } | |
1405 $self->{_index_directory} = $arg; | |
1406 } | |
1407 return $self->{_index_directory}; | |
1408 | |
1409 } | |
1410 | |
1411 =head2 record_size | |
1412 | |
1413 Title : record_size | |
1414 Usage : $obj->record_size($newval) | |
1415 Function: | |
1416 Example : | |
1417 Returns : value of record_size | |
1418 Args : newvalue (optional) | |
1419 | |
1420 | |
1421 =cut | |
1422 | |
1423 sub record_size { | |
1424 my ($self,$arg) = @_; | |
1425 | |
1426 if (defined($arg)) { | |
1427 $self->{_record_size} = $arg; | |
1428 } | |
1429 return $self->{_record_size}; | |
1430 } | |
1431 | |
1432 =head2 primary_namespace | |
1433 | |
1434 Title : primary_namespace | |
1435 Usage : $obj->primary_namespace($newval) | |
1436 Function: | |
1437 Example : | |
1438 Returns : value of primary_namespace | |
1439 Args : newvalue (optional) | |
1440 | |
1441 =cut | |
1442 | |
1443 sub primary_namespace { | |
1444 my ($self,$arg) = @_; | |
1445 | |
1446 if (defined($arg)) { | |
1447 $self->{_primary_namespace} = $arg; | |
1448 } | |
1449 return $self->{_primary_namespace}; | |
1450 } | |
1451 | |
1452 =head2 index_type | |
1453 | |
1454 Title : index_type | |
1455 Usage : $obj->index_type($newval) | |
1456 Function: | |
1457 Example : | |
1458 Returns : value of index_type | |
1459 Args : newvalue (optional) | |
1460 | |
1461 | |
1462 =cut | |
1463 | |
1464 sub index_type { | |
1465 my ($self,$arg) = @_; | |
1466 | |
1467 if (defined($arg)) { | |
1468 $self->{_index_type} = $arg; | |
1469 } | |
1470 return $self->{_index_type}; | |
1471 } | |
1472 | |
1473 =head2 index_version | |
1474 | |
1475 Title : index_version | |
1476 Usage : $obj->index_version($newval) | |
1477 Function: | |
1478 Example : | |
1479 Returns : value of index_version | |
1480 Args : newvalue (optional) | |
1481 | |
1482 | |
1483 =cut | |
1484 | |
1485 sub index_version { | |
1486 my ($self,$arg) = @_; | |
1487 | |
1488 if (defined($arg)) { | |
1489 $self->{_index_version} = $arg; | |
1490 } | |
1491 return $self->{_index_version}; | |
1492 } | |
1493 | |
1494 =head2 primary_pattern | |
1495 | |
1496 Title : primary_pattern | |
1497 Usage : $obj->primary_pattern($newval) | |
1498 Function: | |
1499 Example : | |
1500 Returns : value of primary_pattern | |
1501 Args : newvalue (optional) | |
1502 | |
1503 | |
1504 =cut | |
1505 | |
1506 sub primary_pattern{ | |
1507 my ($obj,$value) = @_; | |
1508 if( defined $value) { | |
1509 $obj->{'primary_pattern'} = $value; | |
1510 } | |
1511 | |
1512 return $obj->{'primary_pattern'}; | |
1513 | |
1514 } | |
1515 =head2 start_pattern | |
1516 | |
1517 Title : start_pattern | |
1518 Usage : $obj->start_pattern($newval) | |
1519 Function: | |
1520 Example : | |
1521 Returns : value of start_pattern | |
1522 Args : newvalue (optional) | |
1523 | |
1524 | |
1525 =cut | |
1526 | |
1527 sub start_pattern{ | |
1528 my ($obj,$value) = @_; | |
1529 if( defined $value) { | |
1530 $obj->{'start_pattern'} = $value; | |
1531 } | |
1532 return $obj->{'start_pattern'}; | |
1533 | |
1534 } | |
1535 | |
1536 =head2 secondary_patterns | |
1537 | |
1538 Title : secondary_patterns | |
1539 Usage : $obj->secondary_patterns($newval) | |
1540 Function: | |
1541 Example : | |
1542 Returns : value of secondary_patterns | |
1543 Args : newvalue (optional) | |
1544 | |
1545 | |
1546 =cut | |
1547 | |
1548 sub secondary_patterns{ | |
1549 my ($obj,$value) = @_; | |
1550 if( defined $value) { | |
1551 $obj->{'secondary_patterns'} = $value; | |
1552 | |
1553 my @names = keys %$value; | |
1554 | |
1555 foreach my $name (@names) { | |
1556 $obj->secondary_namespaces($name); | |
1557 } | |
1558 } | |
1559 return $obj->{'secondary_patterns'}; | |
1560 | |
1561 } | |
1562 | |
1563 =head2 secondary_namespaces | |
1564 | |
1565 Title : secondary_namespaces | |
1566 Usage : $obj->secondary_namespaces($newval) | |
1567 Function: | |
1568 Example : | |
1569 Returns : value of secondary_namespaces | |
1570 Args : newvalue (optional) | |
1571 | |
1572 | |
1573 =cut | |
1574 | |
1575 sub secondary_namespaces{ | |
1576 my ($obj,$value) = @_; | |
1577 | |
1578 if (!defined($obj->{secondary_namespaces})) { | |
1579 $obj->{secondary_namespaces} = []; | |
1580 } | |
1581 if( defined $value) { | |
1582 push(@{$obj->{'secondary_namespaces'}},$value); | |
1583 } | |
1584 return @{$obj->{'secondary_namespaces'}}; | |
1585 | |
1586 } | |
1587 | |
1588 | |
1589 | |
1590 ## These are indexing routines to index commonly used format - fasta | |
1591 ## swissprot and embl | |
1592 | |
1593 sub new_SWISSPROT_index { | |
1594 my ($self,$index_dir,$dbname,@files) = @_; | |
1595 | |
1596 my %secondary_patterns; | |
1597 | |
1598 my $start_pattern = "^ID (\\S+)"; | |
1599 my $primary_pattern = "^AC (\\S+)\\;"; | |
1600 | |
1601 $secondary_patterns{"ID"} = $start_pattern; | |
1602 | |
1603 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, | |
1604 -format => 'swiss', | |
1605 -primary_pattern => $primary_pattern, | |
1606 -primary_namespace => "ACC", | |
1607 -start_pattern => $start_pattern, | |
1608 -secondary_patterns => \%secondary_patterns); | |
1609 | |
1610 $index->make_index($dbname,@files); | |
1611 } | |
1612 | |
1613 sub new_EMBL_index { | |
1614 my ($self,$index_dir,$dbname,@files) = @_; | |
1615 | |
1616 my %secondary_patterns; | |
1617 | |
1618 my $start_pattern = "^ID (\\S+)"; | |
1619 my $primary_pattern = "^AC (\\S+)\\;"; | |
1620 my $primary_namespace = "ACC"; | |
1621 | |
1622 $secondary_patterns{"ID"} = $start_pattern; | |
1623 | |
1624 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, | |
1625 -format => 'embl', | |
1626 -primary_pattern => $primary_pattern, | |
1627 -primary_namespace => "ACC", | |
1628 -start_pattern => $start_pattern, | |
1629 -secondary_patterns => \%secondary_patterns); | |
1630 | |
1631 $index->make_index($dbname,@files); | |
1632 | |
1633 return $index; | |
1634 } | |
1635 | |
1636 sub new_FASTA_index { | |
1637 my ($self,$index_dir,$dbname,@files) = @_; | |
1638 | |
1639 my %secondary_patterns; | |
1640 | |
1641 my $start_pattern = "^>"; | |
1642 my $primary_pattern = "^>(\\S+)"; | |
1643 my $primary_namespace = "ACC"; | |
1644 | |
1645 $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)"; | |
1646 | |
1647 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir, | |
1648 -format => 'fasta', | |
1649 -primary_pattern => $primary_pattern, | |
1650 -primary_namespace => "ACC", | |
1651 -start_pattern => $start_pattern, | |
1652 -secondary_patterns => \%secondary_patterns); | |
1653 | |
1654 $index->make_index($dbname,@files); | |
1655 | |
1656 return $index; | |
1657 | |
1658 } | |
1659 | |
1660 | |
1661 | |
1662 1; | |
1663 | |
1664 | |
1665 | |
1666 | |
1667 | |
1668 | |
1669 | |
1670 | |
1671 | |
1672 | |
1673 | |
1674 | |
1675 | |
1676 | |
1677 | |
1678 | |
1679 | |
1680 | |
1681 | |
1682 | |
1683 | |
1684 | |
1685 | |
1686 | |
1687 | |
1688 | |
1689 | |
1690 | |
1691 | |
1692 |