comparison variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: OBDAIndex.pm,v 1.12.2.1 2003/06/28 20:47:16 jason Exp $
2 #
3 # BioPerl module for Bio::DB::Flat::OBDAIndex
4 #
5 # Cared for by Michele Clamp <michele@sanger.ac.uk>>
6 #
7 # You may distribute this module under the same terms as perl itself
8
9 # POD documentation - main docs before the code
10
11 =head1 NAME
12
13 Bio::DB::Flat::OBDAIndex - Binary search indexing system for sequence files
14
15 =head1 SYNOPSIS
16
17 This module can be used both to index sequence files and also to retrieve
18 sequences from existing sequence files.
19
20 =head2 Index creation
21
22 my $sequencefile; # Some fasta sequence file
23
24 Patterns have to be entered to define where the keys are to be
25 indexed and also where the start of each record. E.g. for fasta
26
27 my $start_pattern = "^>";
28 my $primary_pattern = "^>(\\S+)";
29
30
31 So the start of a record is a line starting with a E<gt> and the primary
32 key is all characters up to the first space afterf the E<gt>
33
34 A string also has to be entered to defined what the primary key
35 (primary_namespace) is called.
36
37 The index can now be created using
38
39 my $index = new Bio::DB::Flat::OBDAIndex(
40 -start_pattern => $start_pattern,
41 -primary_pattern => $primary_pattern,
42 -primary_namespace => "ACC",
43 );
44
45 To actually write it out to disk we need to enter a directory where the
46 indices will live, a database name and an array of sequence files to index.
47
48 my @files = ("file1","file2","file3");
49
50 $index->make_index("/Users/michele/indices","mydatabase",@files);
51
52 The index is now ready to use. For large sequence files the perl
53 way of indexing takes a *long* time and a *huge* amount of memory.
54 For indexing things like dbEST I recommend using the C indexer.
55
56 =head2 Creating indices with secondary keys
57
58 Sometimes just indexing files with one id per entry is not enough. For
59 instance you may want to retrieve sequences from swissprot using
60 their accessions as well as their ids.
61
62 To be able to do this when creating your index you need to pass in
63 a hash of secondary_patterns which have their namespaces as the keys
64 to the hash.
65
66 e.g. For Indexing something like
67
68 ID 1433_CAEEL STANDARD; PRT; 248 AA.
69 AC P41932;
70 DT 01-NOV-1995 (Rel. 32, Created)
71 DT 01-NOV-1995 (Rel. 32, Last sequence update)
72 DT 15-DEC-1998 (Rel. 37, Last annotation update)
73 DE 14-3-3-LIKE PROTEIN 1.
74 GN FTT-1 OR M117.2.
75 OS Caenorhabditis elegans.
76 OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
77 OC Rhabditidae; Peloderinae; Caenorhabditis.
78 OX NCBI_TaxID=6239;
79 RN [1]
80
81 where we want to index the accession (P41932) as the primary key and the
82 id (1433_CAEEL) as the secondary id. The index is created as follows
83
84 my %secondary_patterns;
85
86 my $start_pattern = "^ID (\\S+)";
87 my $primary_pattern = "^AC (\\S+)\;";
88
89 $secondary_patterns{"ID"} = "^ID (\\S+)";
90
91 my $index = new Bio::DB::Flat::OBDAIndex(
92 -start_pattern => $start_pattern,
93 -primary_pattern => $primary_pattern,
94 -primary_namespace => 'ACC',
95 -secondary_patterns => \%secondary_patterns);
96
97 $index->make_index("/Users/michele/indices","mydb",($seqfile));
98
99 Of course having secondary indices makes indexing slower and more
100 of a memory hog.
101
102
103 =head2 Index reading
104
105 To fetch sequences using an existing index first of all create your sequence
106 object
107
108 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_directory,
109 -dbname => 'swissprot');
110
111 Now you can happily fetch sequences either by the primary key or
112 by the secondary keys.
113
114 my $entry = $index->get_entry_by_id('HBA_HUMAN');
115
116 This returns just a string containing the whole entry. This is
117 useful is you just want to print the sequence to screen or write it to a file.
118
119 Other ways of getting sequences are
120
121 my $fh = $index->get_stream_by_id('HBA_HUMAN');
122
123 This can then be passed to a seqio object for output or converting
124 into objects.
125
126 my $seq = new Bio::SeqIO(-fh => $fh,
127 -format => 'fasta');
128
129 The last way is to retrieve a sequence directly. This is the
130 slowest way of extracting as the sequence objects need to be made.
131
132 my $seq = $index->get_Seq_by_id('HBA_HUMAN');
133
134 To access the secondary indices the secondary namespace needs to be known
135 (use $index-E<gt>secondary_namespaces) and the following call used
136
137 my $seq = $index->get_Seq_by_secondary('ACC','Q21973');
138 my $fh = $index->get_stream_by_secondary('ACC','Q21973');
139 my $entry = $index->get_entry_by_secondary('ACC','Q21973');
140
141 =head1 DESCRIPTION
142
143 This object allows indexing of sequence files both by a primary key
144 (say accession) and multiple secondary keys (say ids). This is
145 different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
146 which uses DBM files as storage. This module uses a binary search to
147 retrieve sequences which is more efficient for large datasets.
148
149
150 =head1 FEEDBACK
151
152 =head2 Mailing Lists
153
154 User feedback is an integral part of the evolution of this and other
155 Bioperl modules. Send your comments and suggestions preferably to one
156 of the Bioperl mailing lists. Your participation is much appreciated.
157
158 bioperl-l@bioperl.org - General discussion
159 http://bioperl.org/MailList.shtml - About the mailing lists
160
161 =head2 Reporting Bugs
162
163 Report bugs to the Bioperl bug tracking system to help us keep track
164 the bugs and their resolution. Bug reports can be submitted via
165 email or the web:
166
167 bioperl-bugs@bio.perl.org
168 http://bugzilla.bioperl.org/
169
170 =head1 AUTHOR - Michele Clamp
171
172 Email - michele@sanger.ac.uk
173
174 =head1 APPENDIX
175
176 The rest of the documentation details each of the object methods. Internal
177 methods are usually preceded with an "_" (underscore).
178
179 =cut
180
181 package Bio::DB::Flat::OBDAIndex;
182
183 use strict;
184 use vars qw(@ISA);
185
186 use Fcntl qw(SEEK_END SEEK_CUR);
187 # rather than using tell which might be buffered
188 sub systell{ sysseek($_[0], 0, SEEK_CUR) }
189 sub syseof{ sysseek($_[0], 0, SEEK_END) }
190
191
192 use Bio::DB::RandomAccessI;
193 use Bio::Root::RootI;
194 use Bio::SeqIO;
195 use Bio::Seq;
196
197 @ISA = qw(Bio::DB::RandomAccessI);
198
199 use constant CONFIG_FILE_NAME => 'config.dat';
200 use constant HEADER_SIZE => 4;
201
202 my @formats = ['FASTA','SWISSPROT','EMBL'];
203
204 =head2 new
205
206 Title : new
207 Usage : For reading
208 my $index = new Bio::DB::Flat::OBDAIndex(
209 -index_dir => '/Users/michele/indices/',
210 -dbname => 'dbEST',
211 -format => 'fasta');
212
213 For writing
214
215 my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
216 my $index = new Bio::DB::Flat::OBDAIndex(
217 -index_dir => '/Users/michele/indices',
218 -primary_pattern => "^>(\\S+)",
219 -secondary_patterns => \%secondary_patterns,
220 -primary_namespace => "ID");
221
222 my @files = ('file1','file2','file3');
223
224 $index->make_index('mydbname',@files);
225
226
227 Function: create a new Bio::DB::Flat::OBDAIndex object
228 Returns : new Bio::DB::Flat::OBDAIndex
229 Args : -index_dir Directory containing the indices
230 -primary_pattern Regexp defining the primary id
231 -secondary_patterns A hash ref containing the secondary
232 patterns with the namespaces as keys
233 -primary_namespace A string defining what the primary key
234 is
235
236 Status : Public
237
238 =cut
239
240 sub new {
241 my($class, @args) = @_;
242
243 my $self = $class->SUPER::new(@args);
244
245 bless $self, $class;
246
247 my ($index_dir,$dbname,$format,$primary_pattern,$primary_namespace,
248 $start_pattern,$secondary_patterns) =
249 $self->_rearrange([qw(INDEX_DIR
250 DBNAME
251 FORMAT
252 PRIMARY_PATTERN
253 PRIMARY_NAMESPACE
254 START_PATTERN
255 SECONDARY_PATTERNS)], @args);
256
257 $self->index_directory($index_dir);
258 $self->database_name ($dbname);
259
260 if ($self->index_directory && $dbname) {
261
262 $self->read_config_file;
263
264 my $fh = $self->primary_index_filehandle;
265 my $record_width = $self->read_header($fh);
266
267 $self->record_size($record_width);
268 }
269
270
271 $self->format ($format);
272 $self->primary_pattern ($primary_pattern);
273 $self->primary_namespace ($primary_namespace);
274 $self->start_pattern ($start_pattern);
275 $self->secondary_patterns($secondary_patterns);
276
277 return $self;
278 }
279
280 sub new_from_registry {
281 my ($self,%config) = @_;
282
283 my $dbname = $config{'dbname'};
284 my $location = $config{'location'};
285
286 my $index = new Bio::DB::Flat::OBDAIndex(-dbname => $dbname,
287 -index_dir => $location,
288 );
289 }
290
291 =head2 get_Seq_by_id
292
293 Title : get_Seq_by_id
294 Usage : $obj->get_Seq_by_id($newval)
295 Function:
296 Example :
297 Returns : value of get_Seq_by_id
298 Args : newvalue (optional)
299
300 =cut
301
302 sub get_Seq_by_id {
303 my ($self,$id) = @_;
304
305 my ($fh,$length) = $self->get_stream_by_id($id);
306
307 if (!defined($self->format)) {
308 $self->throw("Can't create sequence - format is not defined");
309 }
310
311 if(!$fh){
312 return;
313 }
314 if (!defined($self->{_seqio})) {
315
316 $self->{_seqio} = new Bio::SeqIO(-fh => $fh,
317 -format => $self->format);
318 } else {
319
320 $self->{_seqio}->fh($fh);
321 }
322
323 return $self->{_seqio}->next_seq;
324
325 }
326
327 =head2 get_entry_by_id
328
329 Title : get_entry_by_id
330 Usage : $obj->get_entry_by_id($newval)
331 Function:
332 Example :
333 Returns :
334 Args :
335
336
337 =cut
338
339 sub get_entry_by_id {
340 my ($self,$id) = @_;
341
342 my ($fh,$length) = $self->get_stream_by_id($id);
343
344 my $entry;
345
346 sysread($fh,$entry,$length);
347
348 return $entry;
349 }
350
351
352 =head2 get_stream_by_id
353
354 Title : get_stream_by_id
355 Usage : $obj->get_stream_by_id($newval)
356 Function:
357 Example :
358 Returns : value of get_stream_by_id
359 Args : newvalue (optional)
360
361
362 =cut
363
364 sub get_stream_by_id {
365 my ($self,$id) = @_;
366
367 my $indexfh = $self->primary_index_filehandle;
368
369 syseof ($indexfh);
370
371 my $filesize = systell($indexfh);
372
373 my $end = ($filesize-$self->{_start_pos})/$self->record_size;
374
375 my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
376
377
378 my ($fileid,$pos,$length) = split(/\t/,$rest);
379
380 #print STDERR "OBDAIndex Found id entry $newid $fileid $pos $length:$rest\n";
381
382 if (!$newid) {
383 return;
384 }
385
386 my $fh = $self->get_filehandle_by_fileid($fileid);
387 my $file = $self->{_file}{$fileid};
388
389 open (IN,"<$file");
390 $fh = \*IN;
391
392 my $entry;
393
394 sysseek($fh,$pos,0);
395
396 return ($fh,$length);
397 }
398
399 =head2 get_Seq_by_acc
400
401 Title : get_Seq_by_acc
402 Usage : $obj->get_Seq_by_acc($newval)
403 Function:
404 Example :
405 Returns : value of get_Seq_by_acc
406 Args : newvalue (optional)
407
408
409 =cut
410
411 sub get_Seq_by_acc {
412 my ($self,$acc) = @_;
413
414 if ($self->primary_namespace eq "ACC") {
415 return $self->get_Seq_by_id($acc);
416 } else {
417 return $self->get_Seq_by_secondary("ACC",$acc);
418 }
419 }
420
421 =head2 get_Seq_by_secondary
422
423 Title : get_Seq_by_secondary
424 Usage : $obj->get_Seq_by_secondary($newval)
425 Function:
426 Example :
427 Returns : value of get_Seq_by_secondary
428 Args : newvalue (optional)
429
430
431 =cut
432
433 sub get_Seq_by_secondary {
434 my ($self,$name,$id) = @_;
435
436 my @names = $self->secondary_namespaces;
437
438 my $found = 0;
439 foreach my $tmpname (@names) {
440 if ($name eq $tmpname) {
441 $found = 1;
442 }
443 }
444
445 if ($found == 0) {
446 $self->throw("Secondary index for $name doesn't exist\n");
447 }
448
449 my $fh = $self->open_secondary_index($name);
450
451 syseof ($fh);
452
453 my $filesize = systell($fh);
454
455 my $recsize = $self->{_secondary_record_size}{$name};
456 # print "Name " . $recsize . "\n";
457
458 my $end = ($filesize-$self->{_start_pos})/$recsize;
459
460 # print "End $end $filesize\n";
461
462 my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
463
464 sysseek($fh,$pos,0);
465
466 # print "Found new id $newid $primary_id\n";
467 # We now need to shuffle up the index file to find the top secondary entry
468
469 my $record = $newid;
470
471 while ($record =~ /^$newid/ && $pos >= 0) {
472
473 $record = $self->read_record($fh,$pos,$recsize);
474 $pos = $pos - $recsize;
475 # print "Up record = $record:$newid\n";
476 }
477
478 $pos += $recsize;
479
480 # print "Top position is $pos\n";
481
482 # Now we have to shuffle back down again to read all the secondary entries
483
484 my $current_id = $newid;
485 my %primary_id;
486
487 $primary_id{$primary_id} = 1;
488
489 while ($current_id eq $newid) {
490 $record = $self->read_record($fh,$pos,$recsize);
491 print "Record is :$record:\n";
492 my ($secid,$primary_id) = split(/\t/,$record,2);
493 $current_id = $secid;
494
495 if ($current_id eq $newid) {
496 $primary_id =~ s/ //g;
497 # print "Primary $primary_id\n";
498 $primary_id{$primary_id} = 1;
499
500 $pos = $pos + $recsize;
501 # print "Down record = $record\n";
502 }
503 }
504
505 if (!defined($newid)) {
506 return;
507 }
508
509 my $entry;
510
511 foreach my $id (keys %primary_id) {
512 $entry .= $self->get_Seq_by_id($id);
513 }
514 return $entry;
515
516 }
517
518 =head2 read_header
519
520 Title : read_header
521 Usage : $obj->read_header($newval)
522 Function:
523 Example :
524 Returns : value of read_header
525 Args : newvalue (optional)
526
527
528 =cut
529
530 sub read_header {
531 my ($self,$fh) = @_;
532
533 my $record_width;
534
535 sysread($fh,$record_width,HEADER_SIZE);
536
537 $self->{_start_pos} = HEADER_SIZE;
538 $record_width =~ s/ //g;
539 $record_width = $record_width * 1;
540
541 return $record_width;
542 }
543
544 =head2 read_record
545
546 Title : read_record
547 Usage : $obj->read_record($newval)
548 Function:
549 Example :
550 Returns : value of read_record
551 Args : newvalue (optional)
552
553
554 =cut
555
556 sub read_record {
557 my ($self,$fh,$pos,$len) = @_;
558
559 sysseek($fh,$pos,0);
560
561 my $record;
562
563 sysread($fh,$record,$len);
564
565 return $record;
566
567 }
568
569
570 =head2 find_entry
571
572 Title : find_entry
573 Usage : $obj->find_entry($newval)
574 Function:
575 Example :
576 Returns : value of find_entry
577 Args : newvalue (optional)
578
579
580 =cut
581
582 sub find_entry {
583 my ($self,$fh,$start,$end,$id,$recsize) = @_;
584
585 my $mid = int(($end+1+$start)/2);
586 my $pos = ($mid-1)*$recsize + $self->{_start_pos};
587
588 my ($record) = $self->read_record($fh,$pos,$recsize);
589 my ($entryid,$rest) = split(/\t/,$record,2);
590
591 # print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
592 # print "Entry :$id:$entryid:$rest\n";
593
594
595 my ($first,$second) = sort { $a cmp $b} ($id,$entryid);
596
597 if ($id eq $entryid) {
598
599 return ($id,$rest,$pos-$recsize);
600
601 } elsif ($first eq $id) {
602
603 if ($end-$start <= 1) {
604 return;
605 }
606 my $end = $mid;
607 # print "Moving up $entryid $id\n";
608 $self->find_entry($fh,$start,$end,$id,$recsize);
609
610 } elsif ($second eq $id ) {
611 # print "Moving down $entryid $id\n";
612 if ($end-$start <= 1) {
613 return;
614 }
615
616 $start = $mid;
617
618 $self->find_entry($fh,$start,$end,$id,$recsize);
619 }
620
621 }
622
623
624 =head2 make_index
625
626 Title : make_index
627 Usage : $obj->make_index($newval)
628 Function:
629 Example :
630 Returns : value of make_index
631 Args : newvalue (optional)
632
633
634 =cut
635
636 sub make_index {
637 my ($self,$dbname,@files) = @_;;
638
639 my $rootdir = $self->index_directory;
640
641 if (!defined($rootdir)) {
642 $self->throw("No index directory set - can't build indices");
643 }
644
645 if (! -d $rootdir) {
646 $self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
647 }
648 if (!(@files)) {
649 $self->throw("Must enter an array of filenames to index");
650 }
651
652 if (!defined($dbname)) {
653 $self->throw("Must enter an index name for your files");
654 }
655
656 my $pwd = `pwd`; chomp($pwd);
657
658 foreach my $file (@files) {
659 if ($file !~ /^\//) {
660 $file = $pwd . "/$file";
661 }
662 if (! -e $file) {
663 $self->throw("Can't index file [$file] as it doesn't exist");
664 }
665 }
666
667 $self->database_name($dbname);
668 $self->make_indexdir($rootdir);;
669 $self->make_config_file(\@files);
670
671 # Finally lets index
672 foreach my $file (@files) {
673 $self->_index_file($file);
674 }
675
676 # And finally write out the indices
677 $self->write_primary_index;
678 $self->write_secondary_indices;
679 }
680
681 =head2 _index_file
682
683 Title : _index_file
684 Usage : $obj->_index_file($newval)
685 Function:
686 Example :
687 Returns : value of _index_file
688 Args : newvalue (optional)
689
690
691 =cut
692
693 sub _index_file {
694 my ($self,$file) = @_;
695
696 open(FILE,"<$file") || $self->throw("Can't open file [$file]");
697
698 my $recstart = 0;
699 my $fileid = $self->get_fileid_by_filename($file);
700 my $found = 0;
701 my $id;
702 my $count;
703
704 my $primary = $self->primary_pattern;
705 my $start_pattern = $self->start_pattern;
706
707 my $pos = 0;
708
709 my $new_primary_entry;
710
711 my $length;
712 #my $pos = 0;
713 my $fh = \*FILE;
714
715 my $done = -1;
716
717 my @secondary_names = $self->secondary_namespaces;
718 my %secondary_id;
719
720 while (<$fh>) {
721 if ($_ =~ /$start_pattern/) {
722 if ($done == 0) {
723 $id = $new_primary_entry;
724
725 my $tmplen = tell($fh) - length($_);
726
727 $length = $tmplen - $pos;
728
729 if (!defined($id)) {
730 $self->throw("No id defined for sequence");
731 }
732 if (!defined($fileid)) {
733 $self->throw("No fileid defined for file $file");
734 }
735 if (!defined($pos)) {
736 $self->throw("No position defined for " . $id . "\n");
737 }
738 if (!defined($length)) {
739 $self->throw("No length defined for " . $id . "\n");
740 }
741
742 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
743
744 $pos = $tmplen;
745
746 if ($count%1000 == 0) {
747 print STDERR "Indexed $count ids\n";
748 }
749
750 $count++;
751 } else {
752 $done = 0;
753 }
754 }
755
756 if ($_ =~ /$primary/) {
757 $new_primary_entry = $1;
758 }
759
760 my $secondary_patterns = $self->secondary_patterns;
761
762 foreach my $sec (@secondary_names) {
763 my $pattern = $secondary_patterns->{$sec};
764
765 if ($_ =~ /$pattern/) {
766 $secondary_id{$sec} = $1;
767 }
768 }
769
770 }
771
772 # Remeber to add in the last one
773
774 $id = $new_primary_entry;
775
776 my $tmplen = tell($fh) - length($_);
777
778 $length = $tmplen - $pos;
779
780 if (!defined($id)) {
781 $self->throw("No id defined for sequence");
782 }
783 if (!defined($fileid)) {
784 $self->throw("No fileid defined for file $file");
785 }
786 if (!defined($pos)) {
787 $self->throw("No position defined for " . $id . "\n");
788 }
789 if (!defined($length)) {
790 $self->throw("No length defined for " . $id . "\n");
791 }
792
793 $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
794
795 close(FILE);
796 }
797
798 =head2 write_primary_index
799
800 Title : write_primary_index
801 Usage : $obj->write_primary_index($newval)
802 Function:
803 Example :
804 Returns : value of write_primary_index
805 Args : newvalue (optional)
806
807
808 =cut
809
810 sub write_primary_index {
811 my ($self) = @_;
812
813 my @ids = keys %{$self->{_id}};
814
815 @ids = sort {$a cmp $b} @ids;
816
817 print STDERR "Number of ids = " . scalar(@ids) . "\n";
818
819 open (INDEX,">" . $self->primary_index_file) || $self->throw("Can't open primary index file [" . $self->primary_index_file . "]");
820
821 my $recordlength = $self->{_maxidlength} +
822 $self->{_maxfileidlength} +
823 $self->{_maxposlength} +
824 $self->{_maxlengthlength} + 3;
825
826
827 print INDEX sprintf("%4d",$recordlength);
828
829 foreach my $id (@ids) {
830
831 if (!defined($self->{_id}{$id}{_fileid})) {
832 $self->throw("No fileid for $id\n");
833 }
834 if (!defined($self->{_id}{$id}{_pos})) {
835 $self->throw("No position for $id\n");
836 }
837 if (!defined($self->{_id}{$id}{_length})) {
838 $self->throw("No length for $id");
839 }
840
841 my $record = $id . "\t" .
842 $self->{_id}{$id}{_fileid} . "\t" .
843 $self->{_id}{$id}{_pos} . "\t" .
844 $self->{_id}{$id}{_length};
845
846 print INDEX sprintf("%-${recordlength}s",$record);
847
848 }
849 close(INDEX);
850 }
851
852 =head2 write_secondary_indices
853
854 Title : write_secondary_indices
855 Usage : $obj->write_secondary_indices($newval)
856 Function:
857 Example :
858 Returns : value of write_secondary_indices
859 Args : newvalue (optional)
860
861
862 =cut
863
864 sub write_secondary_indices {
865 my ($self) = @_;
866
867 # These are the different
868 my @names = keys (%{$self->{_secondary_id}});
869
870
871 foreach my $name (@names) {
872
873 my @seconds = keys %{$self->{_secondary_id}{$name}};
874
875 # First we need to loop over to get the longest record.
876 my $length = 0;
877
878 foreach my $second (@seconds) {
879 my $tmplen = length($second) + 1;
880 my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
881
882 foreach my $prim (@prims) {
883 my $recordlen = $tmplen + length($prim);
884
885 if ($recordlen > $length) {
886 $length = $recordlen;
887 }
888 }
889 }
890
891 # Now we can print the index
892
893 my $fh = $self->new_secondary_filehandle($name);
894
895 print $fh sprintf("%4d",$length);
896 @seconds = sort @seconds;
897
898 foreach my $second (@seconds) {
899
900 my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
901 my $tmp = $second;
902
903 foreach my $prim (@prims) {
904 my $record = $tmp . "\t" . $prim;
905 if (length($record) > $length) {
906 $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
907 } else {
908 print $fh sprintf("%-${length}s",$record);
909 print $fh sprintf("%-${length}s",$record);
910 }
911 }
912 }
913
914 close($fh);
915 }
916 }
917
918 =head2 new_secondary_filehandle
919
920 Title : new_secondary_filehandle
921 Usage : $obj->new_secondary_filehandle($newval)
922 Function:
923 Example :
924 Returns : value of new_secondary_filehandle
925 Args : newvalue (optional)
926
927
928 =cut
929
930 sub new_secondary_filehandle {
931 my ($self,$name) = @_;
932
933 my $indexdir = $self->index_directory;
934
935 my $secindex = $indexdir . $self->database_name . "/id_$name.index";
936
937 my $fh = new FileHandle(">$secindex");
938
939 return $fh;
940 }
941
942 =head2 open_secondary_index
943
944 Title : open_secondary_index
945 Usage : $obj->open_secondary_index($newval)
946 Function:
947 Example :
948 Returns : value of open_secondary_index
949 Args : newvalue (optional)
950
951
952 =cut
953
954 sub open_secondary_index {
955 my ($self,$name) = @_;
956
957 if (!defined($self->{_secondary_filehandle}{$name})) {
958
959 my $indexdir = $self->index_directory;
960 my $secindex = $indexdir . $self->database_name . "/id_$name.index";
961
962 if (! -e $secindex) {
963 $self->throw("Index is not present for namespace [$name]\n");
964 }
965
966 my $newfh = new FileHandle("<$secindex");
967 my $reclen = $self->read_header($newfh);
968
969 $self->{_secondary_filehandle} {$name} = $newfh;
970 $self->{_secondary_record_size}{$name} = $reclen;
971 }
972
973 return $self->{_secondary_filehandle}{$name};
974
975 }
976
977 =head2 _add_id_position
978
979 Title : _add_id_position
980 Usage : $obj->_add_id_position($newval)
981 Function:
982 Example :
983 Returns : value of _add_id_position
984 Args : newvalue (optional)
985
986
987 =cut
988
989 sub _add_id_position {
990 my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
991
992 if (!defined($id)) {
993 $self->throw("No id defined. Can't add id position");
994 }
995 if (!defined($pos)) {
996 v $self->throw("No position defined. Can't add id position");
997 }
998 if (!defined($fileid)) {
999 $self->throw("No fileid defined. Can't add id position");
1000 }
1001 if (!defined($length) || $length <= 0) {
1002 $self->throw("No length defined or <= 0 [$length]. Can't add id position");
1003 }
1004
1005 $self->{_id}{$id}{_pos} = $pos;
1006 $self->{_id}{$id}{_length} = $length;
1007 $self->{_id}{$id}{_fileid} = $fileid;
1008
1009 # Now the secondary ids
1010
1011 foreach my $sec (keys (%$secondary_id)) {
1012 my $value = $secondary_id->{$sec};
1013
1014 $self->{_secondary_id}{$sec}{$value}{$id} = 1;
1015 }
1016
1017 if (length($id) >= $self->{_maxidlength}) {
1018 $self->{_maxidlength} = length($id);
1019 }
1020
1021 if (length($fileid) >= $self->{_maxfileidlength}) {
1022 $self->{_maxfileidlength} = length($fileid);
1023 }
1024
1025 if (length($pos) >= $self->{_maxposlength}) {
1026 $self->{_maxposlength} = length($pos);
1027 }
1028
1029 if (length($length) >= $self->{_maxlengthlength}) {
1030 $self->{_maxlengthlength} = length($length);
1031 }
1032 }
1033
1034 =head2 make_indexdir
1035
1036 Title : make_indexdir
1037 Usage : $obj->make_indexdir($newval)
1038 Function:
1039 Example :
1040 Returns : value of make_indexdir
1041 Args : newvalue (optional)
1042
1043
1044 =cut
1045
1046 sub make_indexdir {
1047 my ($self,$rootdir) = @_;
1048
1049 if (!defined($rootdir)) {
1050 $self->throw("Must enter an index directory name for make_indexdir");
1051 }
1052 if (! -e $rootdir) {
1053 $self->throw("Root index directory [$rootdir] doesn't exist");
1054 }
1055
1056 if (! -d $rootdir) {
1057 $self->throw("[$rootdir] exists but is not a directory");
1058 }
1059
1060 if ($rootdir !~ /\/$/) {
1061 $rootdir .= "/";
1062 }
1063
1064 my $indexdir = $rootdir . $self->database_name;
1065
1066 if (! -e $indexdir) {
1067 mkdir $indexdir,0755;
1068 } else {
1069 $self->throw("Index directory " . $indexdir . " already exists. Exiting\n");
1070 }
1071
1072 }
1073
1074 =head2 make_config_file
1075
1076 Title : make_config_file
1077 Usage : $obj->make_config_file($newval)
1078 Function:
1079 Example :
1080 Returns : value of make_config_file
1081 Args : newvalue (optional)
1082
1083 =cut
1084
1085 sub make_config_file {
1086 my ($self,$files) = @_;
1087
1088 my @files = @$files;
1089
1090 my $dir = $self->index_directory;
1091
1092 my $configfile = $dir . $self->database_name . "/" .CONFIG_FILE_NAME;
1093
1094 open(CON,">$configfile") || $self->throw("Can't create config file [$configfile]");
1095
1096 # First line must be the type of index - in this case flat
1097
1098 print CON "index\tflat/1\n";
1099
1100 # Now the fileids
1101
1102 my $count = 0;
1103
1104 foreach my $file (@files) {
1105
1106 my $size = -s $file;
1107
1108 print CON "fileid_$count\t$file\t$size\n";
1109
1110 my $fh = new FileHandle("<$file");
1111 $self->{_fileid}{$count} = $fh;
1112 $self->{_file} {$count} = $file;
1113 $self->{_dbfile}{$file} = $count;
1114 $self->{_size}{$count} = $size;
1115
1116 $count++;
1117 }
1118
1119 # Now the namespaces
1120
1121 print CON "primary_namespace\t" .$self->primary_namespace. "\n";
1122
1123 # Needs fixing for the secondary stuff
1124
1125 my $second_patterns = $self->secondary_patterns;
1126
1127 my @second = keys %$second_patterns;
1128
1129 if ((@second)) {
1130 print CON "secondary_namespaces";
1131
1132 foreach my $second (@second) {
1133 print CON "\t$second";
1134 }
1135 print CON "\n";
1136 }
1137
1138 # Now the config format
1139
1140 if (!defined($self->format)) {
1141 $self->throw("Format does not exist in module - can't write config file");
1142 } else {
1143 print CON "format\t" . $self->format . "\n";
1144 }
1145
1146
1147 close(CON);
1148 }
1149
1150 =head2 read_config_file
1151
1152 Title : read_config_file
1153 Usage : $obj->read_config_file($newval)
1154 Function:
1155 Example :
1156 Returns : value of read_config_file
1157 Args : newvalue (optional)
1158
1159
1160 =cut
1161
1162 sub read_config_file {
1163 my ($self) = @_;
1164
1165 my $dir = $self->index_directory . $self->database_name . "/";;
1166
1167 if (! -d $dir) {
1168 $self->throw("No index directory [" . $dir . "]. Can't read ". CONFIG_FILE_NAME);
1169 }
1170
1171 my $configfile = $dir . CONFIG_FILE_NAME;
1172
1173 if (! -e $configfile) {
1174 $self->throw("No config file [$configfile]. Can't read namespace");
1175 }
1176
1177 open(CON,"<$configfile") || $self->throw("Can't open configfile [$configfile]");
1178
1179 # First line must be type
1180
1181 my $line = <CON>; chomp($line);
1182 my $version;
1183
1184 # This is hard coded as we only index flatfiles here
1185 if ($line =~ /index\tflat\/(\d+)/) {
1186 $version = $1;
1187 } else {
1188 $self->throw("First line not compatible with flat file index. Should be something like\n\nindex\tflat/1");
1189 }
1190
1191 $self->index_type("flat");
1192 $self->index_version($version);
1193
1194 while (<CON>) {
1195 chomp;
1196
1197 # Look for fileid lines
1198 if ($_ =~ /^fileid_(\d+)\t(\S+)\t(\d+)/) {
1199 my $fileid = $1;
1200 my $filename = $2;
1201 my $filesize = $3;
1202
1203 if (! -e $filename) {
1204 $self->throw("File [$filename] does not exist!");
1205 }
1206 if (-s $filename != $filesize) {
1207 $self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize . "]");
1208 }
1209
1210 my $fh = new FileHandle("<$filename");
1211
1212 $self->{_fileid}{$fileid} = $fh;
1213 $self->{_file} {$fileid} = $filename;
1214 $self->{_dbfile}{$filename} = $fileid;
1215 $self->{_size} {$fileid} = $filesize;
1216
1217 }
1218
1219 # Look for namespace lines
1220 if ($_ =~ /(.*)_namespace.*\t(\S+)/) {
1221 if ($1 eq "primary") {
1222 $self->primary_namespace($2);
1223 } elsif ($1 eq "secondary") {
1224 $self->secondary_namespaces($2);
1225 } else {
1226 $self->throw("Unknown namespace name in config file [$1");
1227 }
1228 }
1229
1230 # Look for format lines
1231
1232 if ($_ =~ /format\t(\S+)/) {
1233
1234 # Check the format here?
1235
1236 $self->format($1);
1237 }
1238 }
1239 close(CON);
1240
1241 # Now check we have all that we need
1242
1243 my @fileid_keys = keys (%{$self->{_fileid}});
1244
1245 if (!(@fileid_keys)) {
1246 $self->throw("No flatfile fileid files in config - check the index has been made correctly");
1247 }
1248
1249 if (!defined($self->primary_namespace)) {
1250 $self->throw("No primary namespace exists");
1251 }
1252
1253 if (! -e $self->primary_index_file) {
1254 $self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
1255 }
1256 }
1257
1258 =head2 get_fileid_by_filename
1259
1260 Title : get_fileid_by_filename
1261 Usage : $obj->get_fileid_by_filename($newval)
1262 Function:
1263 Example :
1264 Returns : value of get_fileid_by_filename
1265 Args : newvalue (optional)
1266
1267
1268 =cut
1269
1270 sub get_fileid_by_filename {
1271 my ($self,$file) = @_;
1272
1273 if (!defined($self->{_dbfile})) {
1274 $self->throw("No file to fileid mapping present. Has the fileid file been read?");
1275 }
1276
1277
1278 return $self->{_dbfile}{$file};
1279 }
1280
1281 =head2 get_filehandle_by_fileid
1282
1283 Title : get_filehandle_by_fileid
1284 Usage : $obj->get_filehandle_by_fileid($newval)
1285 Function:
1286 Example :
1287 Returns : value of get_filehandle_by_fileid
1288 Args : newvalue (optional)
1289
1290
1291 =cut
1292
1293 sub get_filehandle_by_fileid {
1294 my ($self,$fileid) = @_;
1295
1296 if (!defined($self->{_fileid}{$fileid})) {
1297 $self->throw("ERROR: undefined fileid in index [$fileid]");
1298 }
1299
1300 return $self->{_fileid}{$fileid};
1301 }
1302
1303 =head2 primary_index_file
1304
1305 Title : primary_index_file
1306 Usage : $obj->primary_index_file($newval)
1307 Function:
1308 Example :
1309 Returns : value of primary_index_file
1310 Args : newvalue (optional)
1311
1312
1313 =cut
1314
1315 sub primary_index_file {
1316 my ($self) = @_;
1317
1318 return $self->index_directory . $self->database_name . "/key_" . $self->primary_namespace . ".key";
1319 }
1320
1321 =head2 primary_index_filehandle
1322
1323 Title : primary_index_filehandle
1324 Usage : $obj->primary_index_filehandle($newval)
1325 Function:
1326 Example :
1327 Returns : value of primary_index_filehandle
1328 Args : newvalue (optional)
1329
1330
1331 =cut
1332
1333 sub primary_index_filehandle {
1334 my ($self) = @_;
1335
1336 if (!defined ($self->{_primary_index_handle})) {
1337 $self->{_primary_index_handle} = new FileHandle("<" . $self->primary_index_file);
1338 }
1339 return $self->{_primary_index_handle};
1340 }
1341
1342 =head2 database_name
1343
1344 Title : database_name
1345 Usage : $obj->database_name($newval)
1346 Function:
1347 Example :
1348 Returns : value of database_name
1349 Args : newvalue (optional)
1350
1351
1352 =cut
1353
1354
1355 sub database_name {
1356 my ($self,$arg) = @_;
1357
1358 if (defined($arg)) {
1359 $self->{_database_name} = $arg;
1360 }
1361 return $self->{_database_name};
1362
1363 }
1364
1365 =head2 format
1366
1367 Title : format
1368 Usage : $obj->format($newval)
1369 Function:
1370 Example :
1371 Returns : value of format
1372 Args : newvalue (optional)
1373
1374
1375 =cut
1376
1377 sub format{
1378 my ($obj,$value) = @_;
1379 if( defined $value) {
1380 $obj->{'format'} = $value;
1381 }
1382 return $obj->{'format'};
1383
1384 }
1385
1386 =head2 index_directory
1387
1388 Title : index_directory
1389 Usage : $obj->index_directory($newval)
1390 Function:
1391 Example :
1392 Returns : value of index_directory
1393 Args : newvalue (optional)
1394
1395
1396 =cut
1397
1398 sub index_directory {
1399 my ($self,$arg) = @_;
1400
1401 if (defined($arg)) {
1402 if ($arg !~ /\/$/) {
1403 $arg .= "/";
1404 }
1405 $self->{_index_directory} = $arg;
1406 }
1407 return $self->{_index_directory};
1408
1409 }
1410
1411 =head2 record_size
1412
1413 Title : record_size
1414 Usage : $obj->record_size($newval)
1415 Function:
1416 Example :
1417 Returns : value of record_size
1418 Args : newvalue (optional)
1419
1420
1421 =cut
1422
1423 sub record_size {
1424 my ($self,$arg) = @_;
1425
1426 if (defined($arg)) {
1427 $self->{_record_size} = $arg;
1428 }
1429 return $self->{_record_size};
1430 }
1431
1432 =head2 primary_namespace
1433
1434 Title : primary_namespace
1435 Usage : $obj->primary_namespace($newval)
1436 Function:
1437 Example :
1438 Returns : value of primary_namespace
1439 Args : newvalue (optional)
1440
1441 =cut
1442
1443 sub primary_namespace {
1444 my ($self,$arg) = @_;
1445
1446 if (defined($arg)) {
1447 $self->{_primary_namespace} = $arg;
1448 }
1449 return $self->{_primary_namespace};
1450 }
1451
1452 =head2 index_type
1453
1454 Title : index_type
1455 Usage : $obj->index_type($newval)
1456 Function:
1457 Example :
1458 Returns : value of index_type
1459 Args : newvalue (optional)
1460
1461
1462 =cut
1463
1464 sub index_type {
1465 my ($self,$arg) = @_;
1466
1467 if (defined($arg)) {
1468 $self->{_index_type} = $arg;
1469 }
1470 return $self->{_index_type};
1471 }
1472
1473 =head2 index_version
1474
1475 Title : index_version
1476 Usage : $obj->index_version($newval)
1477 Function:
1478 Example :
1479 Returns : value of index_version
1480 Args : newvalue (optional)
1481
1482
1483 =cut
1484
1485 sub index_version {
1486 my ($self,$arg) = @_;
1487
1488 if (defined($arg)) {
1489 $self->{_index_version} = $arg;
1490 }
1491 return $self->{_index_version};
1492 }
1493
1494 =head2 primary_pattern
1495
1496 Title : primary_pattern
1497 Usage : $obj->primary_pattern($newval)
1498 Function:
1499 Example :
1500 Returns : value of primary_pattern
1501 Args : newvalue (optional)
1502
1503
1504 =cut
1505
1506 sub primary_pattern{
1507 my ($obj,$value) = @_;
1508 if( defined $value) {
1509 $obj->{'primary_pattern'} = $value;
1510 }
1511
1512 return $obj->{'primary_pattern'};
1513
1514 }
1515 =head2 start_pattern
1516
1517 Title : start_pattern
1518 Usage : $obj->start_pattern($newval)
1519 Function:
1520 Example :
1521 Returns : value of start_pattern
1522 Args : newvalue (optional)
1523
1524
1525 =cut
1526
1527 sub start_pattern{
1528 my ($obj,$value) = @_;
1529 if( defined $value) {
1530 $obj->{'start_pattern'} = $value;
1531 }
1532 return $obj->{'start_pattern'};
1533
1534 }
1535
1536 =head2 secondary_patterns
1537
1538 Title : secondary_patterns
1539 Usage : $obj->secondary_patterns($newval)
1540 Function:
1541 Example :
1542 Returns : value of secondary_patterns
1543 Args : newvalue (optional)
1544
1545
1546 =cut
1547
1548 sub secondary_patterns{
1549 my ($obj,$value) = @_;
1550 if( defined $value) {
1551 $obj->{'secondary_patterns'} = $value;
1552
1553 my @names = keys %$value;
1554
1555 foreach my $name (@names) {
1556 $obj->secondary_namespaces($name);
1557 }
1558 }
1559 return $obj->{'secondary_patterns'};
1560
1561 }
1562
1563 =head2 secondary_namespaces
1564
1565 Title : secondary_namespaces
1566 Usage : $obj->secondary_namespaces($newval)
1567 Function:
1568 Example :
1569 Returns : value of secondary_namespaces
1570 Args : newvalue (optional)
1571
1572
1573 =cut
1574
1575 sub secondary_namespaces{
1576 my ($obj,$value) = @_;
1577
1578 if (!defined($obj->{secondary_namespaces})) {
1579 $obj->{secondary_namespaces} = [];
1580 }
1581 if( defined $value) {
1582 push(@{$obj->{'secondary_namespaces'}},$value);
1583 }
1584 return @{$obj->{'secondary_namespaces'}};
1585
1586 }
1587
1588
1589
1590 ## These are indexing routines to index commonly used format - fasta
1591 ## swissprot and embl
1592
1593 sub new_SWISSPROT_index {
1594 my ($self,$index_dir,$dbname,@files) = @_;
1595
1596 my %secondary_patterns;
1597
1598 my $start_pattern = "^ID (\\S+)";
1599 my $primary_pattern = "^AC (\\S+)\\;";
1600
1601 $secondary_patterns{"ID"} = $start_pattern;
1602
1603 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir,
1604 -format => 'swiss',
1605 -primary_pattern => $primary_pattern,
1606 -primary_namespace => "ACC",
1607 -start_pattern => $start_pattern,
1608 -secondary_patterns => \%secondary_patterns);
1609
1610 $index->make_index($dbname,@files);
1611 }
1612
1613 sub new_EMBL_index {
1614 my ($self,$index_dir,$dbname,@files) = @_;
1615
1616 my %secondary_patterns;
1617
1618 my $start_pattern = "^ID (\\S+)";
1619 my $primary_pattern = "^AC (\\S+)\\;";
1620 my $primary_namespace = "ACC";
1621
1622 $secondary_patterns{"ID"} = $start_pattern;
1623
1624 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir,
1625 -format => 'embl',
1626 -primary_pattern => $primary_pattern,
1627 -primary_namespace => "ACC",
1628 -start_pattern => $start_pattern,
1629 -secondary_patterns => \%secondary_patterns);
1630
1631 $index->make_index($dbname,@files);
1632
1633 return $index;
1634 }
1635
1636 sub new_FASTA_index {
1637 my ($self,$index_dir,$dbname,@files) = @_;
1638
1639 my %secondary_patterns;
1640
1641 my $start_pattern = "^>";
1642 my $primary_pattern = "^>(\\S+)";
1643 my $primary_namespace = "ACC";
1644
1645 $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1646
1647 my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_dir,
1648 -format => 'fasta',
1649 -primary_pattern => $primary_pattern,
1650 -primary_namespace => "ACC",
1651 -start_pattern => $start_pattern,
1652 -secondary_patterns => \%secondary_patterns);
1653
1654 $index->make_index($dbname,@files);
1655
1656 return $index;
1657
1658 }
1659
1660
1661
1662 1;
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692