Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/DB/Flat/OBDAIndex.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,1692 @@
+# $Id: OBDAIndex.pm,v 1.12.2.1 2003/06/28 20:47:16 jason Exp $
+#
+# BioPerl module for Bio::DB::Flat::OBDAIndex
+#
+# Cared for by Michele Clamp <michele@sanger.ac.uk>>
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::DB::Flat::OBDAIndex - Binary search indexing system for sequence files
+
+=head1 SYNOPSIS
+
+This module can be used both to index sequence files and also to retrieve 
+sequences from existing sequence files.
+
+=head2 Index creation
+
+    my $sequencefile;  # Some fasta sequence file
+
+Patterns have to be entered to define where the keys are to be
+indexed and also where the start of each record.  E.g. for fasta
+
+    my $start_pattern   = "^>";
+    my $primary_pattern = "^>(\\S+)";
+
+
+So the start of a record is a line starting with a E<gt> and the primary
+key is all characters up to the first space afterf the E<gt>
+
+A string also has to be entered to defined what the primary key
+(primary_namespace) is called.
+
+The index can now be created using 
+
+    my $index = new Bio::DB::Flat::OBDAIndex(
+	     -start_pattern   => $start_pattern,
+	     -primary_pattern => $primary_pattern,
+             -primary_namespace => "ACC",
+					     );
+
+To actually write it out to disk we need to enter a directory where the 
+indices will live, a database name and an array of sequence files to index.
+
+    my @files = ("file1","file2","file3");
+
+    $index->make_index("/Users/michele/indices","mydatabase",@files);
+
+The index is now ready to use.  For large sequence files the perl
+way of indexing takes a *long* time and a *huge* amount of memory.
+For indexing things like dbEST I recommend using the C indexer.
+
+=head2 Creating indices with secondary keys
+
+Sometimes just indexing files with one id per entry is not enough.  For
+instance you may want to retrieve sequences from swissprot using
+their accessions as well as their ids.
+
+To be able to do this when creating your index you need to pass in 
+a hash of secondary_patterns which have their namespaces as the keys
+to the hash.
+
+e.g. For Indexing something like
+
+ID   1433_CAEEL     STANDARD;      PRT;   248 AA.
+AC   P41932;
+DT   01-NOV-1995 (Rel. 32, Created)
+DT   01-NOV-1995 (Rel. 32, Last sequence update)
+DT   15-DEC-1998 (Rel. 37, Last annotation update)
+DE   14-3-3-LIKE PROTEIN 1.
+GN   FTT-1 OR M117.2.
+OS   Caenorhabditis elegans.
+OC   Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
+OC   Rhabditidae; Peloderinae; Caenorhabditis.
+OX   NCBI_TaxID=6239;
+RN   [1]
+
+where we want to index the accession (P41932) as the primary key and the
+id (1433_CAEEL) as the secondary id.  The index is created as follows
+
+    my %secondary_patterns;
+
+    my $start_pattern   = "^ID   (\\S+)";
+    my $primary_pattern = "^AC   (\\S+)\;";
+
+    $secondary_patterns{"ID"} = "^ID   (\\S+)";
+
+    my $index = new Bio::DB::Flat::OBDAIndex(
+                -start_pattern     => $start_pattern,
+                -primary_pattern   => $primary_pattern,
+                -primary_namespace  => 'ACC',
+                -secondary_patterns => \%secondary_patterns);
+
+    $index->make_index("/Users/michele/indices","mydb",($seqfile));
+
+Of course having secondary indices makes indexing slower and more 
+of a memory hog.
+
+
+=head2 Index reading
+
+To fetch sequences using an existing index first of all create your sequence 
+object 
+
+    my $index = new Bio::DB::Flat::OBDAIndex(-index_dir => $index_directory,
+                                             -dbname    => 'swissprot');
+
+Now you can happily fetch sequences either by the primary key or
+by the secondary keys.
+
+    my $entry = $index->get_entry_by_id('HBA_HUMAN');
+
+This returns just a string containing the whole entry.  This is
+useful is you just want to print the sequence to screen or write it to a file.
+
+Other ways of getting sequences are
+
+    my $fh = $index->get_stream_by_id('HBA_HUMAN');
+
+This can then be passed to a seqio object for output or converting
+into objects.
+
+    my $seq = new Bio::SeqIO(-fh     => $fh,
+			     -format => 'fasta');
+
+The last way is to retrieve a sequence directly.  This is the
+slowest way of extracting as the sequence objects need to be made.
+
+    my $seq = $index->get_Seq_by_id('HBA_HUMAN');
+
+To access the secondary indices the secondary namespace needs to be known
+(use $index-E<gt>secondary_namespaces) and the following call used
+
+    my $seq   = $index->get_Seq_by_secondary('ACC','Q21973');
+    my $fh    = $index->get_stream_by_secondary('ACC','Q21973');
+    my $entry = $index->get_entry_by_secondary('ACC','Q21973');
+
+=head1 DESCRIPTION
+
+This object allows indexing of sequence files both by a primary key
+(say accession) and multiple secondary keys (say ids).  This is
+different from the Bio::Index::Abstract (see L<Bio::Index::Abstract>)
+which uses DBM files as storage.  This module uses a binary search to
+retrieve sequences which is more efficient for large datasets.
+
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org             - General discussion
+  http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Michele Clamp
+
+Email - michele@sanger.ac.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal
+methods are usually preceded with an "_" (underscore).
+
+=cut
+
+package Bio::DB::Flat::OBDAIndex;
+
+use strict;
+use vars qw(@ISA);
+
+use Fcntl qw(SEEK_END SEEK_CUR);
+# rather than using tell which might be buffered
+sub systell{ sysseek($_[0], 0, SEEK_CUR) }
+sub syseof{ sysseek($_[0], 0, SEEK_END) }
+
+
+use Bio::DB::RandomAccessI;
+use Bio::Root::RootI;
+use Bio::SeqIO;
+use Bio::Seq;
+
+@ISA = qw(Bio::DB::RandomAccessI);
+
+use constant CONFIG_FILE_NAME => 'config.dat';
+use constant HEADER_SIZE      => 4;
+
+my @formats = ['FASTA','SWISSPROT','EMBL'];
+
+=head2 new
+
+ Title   : new
+ Usage   : For reading 
+             my $index = new Bio::DB::Flat::OBDAIndex(
+                     -index_dir => '/Users/michele/indices/',
+		     -dbname    => 'dbEST',
+                     -format    => 'fasta');
+
+           For writing 
+
+             my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
+             my $index = new Bio::DB::Flat::OBDAIndex(
+		     -index_dir          => '/Users/michele/indices',
+		     -primary_pattern    => "^>(\\S+)",
+                     -secondary_patterns => \%secondary_patterns,
+		     -primary_namespace  => "ID");
+
+             my @files = ('file1','file2','file3');
+
+             $index->make_index('mydbname',@files);    
+
+
+ Function: create a new Bio::DB::Flat::OBDAIndex object
+ Returns : new Bio::DB::Flat::OBDAIndex
+ Args    : -index_dir          Directory containing the indices
+           -primary_pattern    Regexp defining the primary id
+           -secondary_patterns A hash ref containing the secondary
+                               patterns with the namespaces as keys
+           -primary_namespace  A string defining what the primary key
+                               is
+
+ Status  : Public
+
+=cut
+
+sub new {
+    my($class, @args) = @_;
+
+    my $self = $class->SUPER::new(@args);
+
+    bless $self, $class;
+
+    my ($index_dir,$dbname,$format,$primary_pattern,$primary_namespace,
+	$start_pattern,$secondary_patterns) =  
+	    $self->_rearrange([qw(INDEX_DIR
+				  DBNAME
+				  FORMAT
+				  PRIMARY_PATTERN
+				  PRIMARY_NAMESPACE
+				  START_PATTERN
+				  SECONDARY_PATTERNS)], @args);
+    
+    $self->index_directory($index_dir);
+    $self->database_name     ($dbname);
+
+    if ($self->index_directory && $dbname) {
+
+	$self->read_config_file;
+	
+	my $fh = $self->primary_index_filehandle;
+        my $record_width = $self->read_header($fh);
+
+        $self->record_size($record_width);
+    }
+
+
+    $self->format            ($format);
+    $self->primary_pattern   ($primary_pattern);
+    $self->primary_namespace ($primary_namespace);
+    $self->start_pattern     ($start_pattern);
+    $self->secondary_patterns($secondary_patterns);
+
+    return $self;
+}
+
+sub new_from_registry {
+    my ($self,%config) =  @_;
+   
+    my $dbname   = $config{'dbname'};
+    my $location = $config{'location'};
+    
+    my $index =  new Bio::DB::Flat::OBDAIndex(-dbname    => $dbname,
+					      -index_dir => $location,
+					      );
+}
+
+=head2 get_Seq_by_id
+
+ Title   : get_Seq_by_id
+ Usage   : $obj->get_Seq_by_id($newval)
+ Function: 
+ Example : 
+ Returns : value of get_Seq_by_id
+ Args    : newvalue (optional)
+
+=cut
+
+sub get_Seq_by_id {
+    my ($self,$id) = @_;
+   
+    my ($fh,$length) = $self->get_stream_by_id($id);
+    
+    if (!defined($self->format)) {
+	$self->throw("Can't create sequence - format is not defined");
+    }
+   
+    if(!$fh){
+      return;
+    }
+    if (!defined($self->{_seqio})) {
+     
+	$self->{_seqio} = new Bio::SeqIO(-fh => $fh,
+					 -format => $self->format);
+    } else {
+      
+	$self->{_seqio}->fh($fh);
+    }
+    
+    return $self->{_seqio}->next_seq;
+
+}
+
+=head2 get_entry_by_id
+
+ Title   : get_entry_by_id
+ Usage   : $obj->get_entry_by_id($newval)
+ Function: 
+ Example : 
+ Returns : 
+ Args    : 
+
+
+=cut
+
+sub get_entry_by_id {
+    my ($self,$id) = @_;
+
+    my ($fh,$length) = $self->get_stream_by_id($id);
+
+    my $entry;
+
+    sysread($fh,$entry,$length);
+
+    return $entry;
+}
+
+
+=head2 get_stream_by_id
+
+ Title   : get_stream_by_id
+ Usage   : $obj->get_stream_by_id($newval)
+ Function: 
+ Example : 
+ Returns : value of get_stream_by_id
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub get_stream_by_id {
+    my ($self,$id) = @_;
+
+    my $indexfh = $self->primary_index_filehandle;
+
+    syseof ($indexfh);
+
+    my $filesize = systell($indexfh);
+
+    my $end = ($filesize-$self->{_start_pos})/$self->record_size;
+
+    my ($newid,$rest,$fhpos) = $self->find_entry($indexfh,0,$end,$id,$self->record_size);
+
+    
+    my ($fileid,$pos,$length) = split(/\t/,$rest);
+
+    #print STDERR "OBDAIndex Found id entry $newid $fileid $pos $length:$rest\n";
+
+    if (!$newid) {
+      return;
+    }
+
+    my $fh = $self->get_filehandle_by_fileid($fileid);
+    my $file = $self->{_file}{$fileid};
+
+    open (IN,"<$file");
+    $fh = \*IN;
+
+    my $entry;
+    
+    sysseek($fh,$pos,0);
+
+    return ($fh,$length);
+}
+
+=head2 get_Seq_by_acc
+
+ Title   : get_Seq_by_acc
+ Usage   : $obj->get_Seq_by_acc($newval)
+ Function: 
+ Example : 
+ Returns : value of get_Seq_by_acc
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub get_Seq_by_acc {
+    my ($self,$acc) = @_;
+
+    if ($self->primary_namespace eq "ACC") {
+       return $self->get_Seq_by_id($acc);
+    } else {
+      return $self->get_Seq_by_secondary("ACC",$acc);
+    }
+}
+
+=head2 get_Seq_by_secondary
+
+ Title   : get_Seq_by_secondary
+ Usage   : $obj->get_Seq_by_secondary($newval)
+ Function: 
+ Example : 
+ Returns : value of get_Seq_by_secondary
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub get_Seq_by_secondary {
+    my ($self,$name,$id) = @_;
+
+    my @names = $self->secondary_namespaces;
+
+    my $found = 0;
+    foreach my $tmpname (@names) {
+	if ($name eq $tmpname) {
+	    $found = 1;
+	}
+    }
+
+    if ($found == 0) {
+	$self->throw("Secondary index for $name doesn't exist\n");
+    }
+
+    my $fh = $self->open_secondary_index($name);
+
+    syseof ($fh);
+
+    my $filesize = systell($fh);
+
+    my $recsize = $self->{_secondary_record_size}{$name};
+#    print "Name " . $recsize . "\n";
+
+    my $end = ($filesize-$self->{_start_pos})/$recsize;
+
+#    print "End $end $filesize\n";
+
+    my ($newid,$primary_id,$pos) = $self->find_entry($fh,0,$end,$id,$recsize);
+
+    sysseek($fh,$pos,0);
+
+#    print "Found new id $newid $primary_id\n";    
+    # We now need to shuffle up the index file to find the top secondary entry
+
+    my $record = $newid;
+
+    while ($record =~ /^$newid/ && $pos >= 0) {
+
+	$record = $self->read_record($fh,$pos,$recsize);
+	$pos = $pos - $recsize;
+#	print "Up record = $record:$newid\n";
+    }
+    
+    $pos += $recsize;
+
+#    print "Top position is $pos\n";
+
+    # Now we have to shuffle back down again to read all the secondary entries
+
+    my $current_id = $newid;
+    my %primary_id;
+
+    $primary_id{$primary_id} = 1;
+
+    while ($current_id eq $newid) {
+	$record = $self->read_record($fh,$pos,$recsize);
+	print "Record is :$record:\n";
+	my ($secid,$primary_id) = split(/\t/,$record,2);
+	$current_id = $secid;
+
+	if ($current_id eq $newid) {
+	    $primary_id =~ s/ //g;
+	#    print "Primary $primary_id\n";
+	    $primary_id{$primary_id} = 1;
+	    
+	    $pos = $pos + $recsize;
+	 #   print "Down record = $record\n";
+	}
+    }
+	
+    if (!defined($newid)) {
+      return;
+    }
+
+    my $entry;
+
+    foreach my $id (keys %primary_id) {
+	$entry .= $self->get_Seq_by_id($id);
+    }
+    return $entry;
+
+}
+
+=head2 read_header
+
+ Title   : read_header
+ Usage   : $obj->read_header($newval)
+ Function: 
+ Example : 
+ Returns : value of read_header
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub read_header {
+    my ($self,$fh) = @_;
+
+    my $record_width;
+
+    sysread($fh,$record_width,HEADER_SIZE);
+
+    $self->{_start_pos} = HEADER_SIZE;
+    $record_width =~ s/ //g;
+    $record_width = $record_width * 1;
+
+    return $record_width;
+}
+
+=head2 read_record
+
+ Title   : read_record
+ Usage   : $obj->read_record($newval)
+ Function: 
+ Example : 
+ Returns : value of read_record
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub read_record {
+  my ($self,$fh,$pos,$len) = @_;
+
+  sysseek($fh,$pos,0);
+
+  my $record;
+    
+  sysread($fh,$record,$len);
+
+  return $record;
+
+}
+
+
+=head2 find_entry
+
+ Title   : find_entry
+ Usage   : $obj->find_entry($newval)
+ Function: 
+ Example : 
+ Returns : value of find_entry
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub find_entry {
+    my ($self,$fh,$start,$end,$id,$recsize) = @_;
+
+    my $mid = int(($end+1+$start)/2);
+    my $pos = ($mid-1)*$recsize + $self->{_start_pos};
+
+    my ($record) = $self->read_record($fh,$pos,$recsize);
+    my ($entryid,$rest)  = split(/\t/,$record,2);
+
+#    print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
+#    print "Entry :$id:$entryid:$rest\n";
+
+    
+    my ($first,$second) = sort { $a cmp $b} ($id,$entryid);
+
+    if ($id eq $entryid) {
+
+      return ($id,$rest,$pos-$recsize);
+
+    } elsif ($first eq $id) {
+	
+      if ($end-$start <= 1) {
+	return;
+      }
+      my $end = $mid;
+#      print "Moving up $entryid $id\n";
+      $self->find_entry($fh,$start,$end,$id,$recsize);
+
+    } elsif ($second eq $id ) {
+#	print "Moving down $entryid $id\n";
+      if ($end-$start <= 1) {
+	return;
+      }
+
+      $start = $mid;
+      
+      $self->find_entry($fh,$start,$end,$id,$recsize);
+    }
+
+ }   
+
+
+=head2 make_index
+
+ Title   : make_index
+ Usage   : $obj->make_index($newval)
+ Function: 
+ Example : 
+ Returns : value of make_index
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub make_index {
+    my ($self,$dbname,@files) = @_;;
+    
+    my $rootdir = $self->index_directory;
+
+    if (!defined($rootdir)) {
+	$self->throw("No index directory set - can't build indices");
+    }
+    
+    if (! -d $rootdir) {
+	$self->throw("Index directory [$rootdir] is not a directory. Cant' build indices");
+    }
+    if (!(@files)) {
+	$self->throw("Must enter an array of filenames to index");
+    }
+    
+    if (!defined($dbname)) {
+	$self->throw("Must enter an index name for your files");
+    }
+    
+    my $pwd = `pwd`; chomp($pwd);
+
+    foreach my $file (@files) {
+	if ($file !~ /^\//) {
+	    $file = $pwd . "/$file";
+	}
+	if (! -e $file) {
+	    $self->throw("Can't index file [$file] as it doesn't exist");
+	}
+    }
+    
+    $self->database_name($dbname);
+    $self->make_indexdir($rootdir);;
+    $self->make_config_file(\@files);
+    
+    # Finally lets index
+    foreach my $file (@files) {
+	$self->_index_file($file);
+    }
+
+    # And finally write out the indices
+    $self->write_primary_index;
+    $self->write_secondary_indices;
+}
+
+=head2 _index_file
+
+ Title   : _index_file
+ Usage   : $obj->_index_file($newval)
+ Function: 
+ Example : 
+ Returns : value of _index_file
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _index_file {
+    my ($self,$file) = @_;
+
+    open(FILE,"<$file") || $self->throw("Can't open file [$file]");
+
+    my $recstart = 0;
+    my $fileid = $self->get_fileid_by_filename($file);
+    my $found = 0;
+    my $id;
+    my $count;
+
+    my $primary       = $self->primary_pattern;
+    my $start_pattern = $self->start_pattern;
+
+    my $pos = 0;
+
+    my $new_primary_entry;
+    
+    my $length;
+    #my $pos = 0;
+    my $fh = \*FILE;
+
+    my $done = -1;
+
+    my @secondary_names = $self->secondary_namespaces;
+    my %secondary_id;
+
+    while (<$fh>) {
+	if ($_ =~ /$start_pattern/) {
+	    if ($done == 0) {
+		$id = $new_primary_entry;
+		
+		my $tmplen = tell($fh) - length($_);
+
+		$length = $tmplen  - $pos;
+		
+		if (!defined($id)) {
+		    $self->throw("No id defined for sequence");
+		}
+		if (!defined($fileid)) {
+		    $self->throw("No fileid defined for file $file");
+		}
+		if (!defined($pos)) {
+		    $self->throw("No position defined for " . $id . "\n");
+		}
+		if (!defined($length)) {
+		    $self->throw("No length defined for " . $id . "\n");
+		}
+		
+		$self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
+
+		$pos   = $tmplen;
+		
+		if ($count%1000 == 0) {
+		    print STDERR "Indexed $count ids\n";
+		}
+	    
+		$count++;
+	    } else {
+		$done = 0;
+	    }
+	}
+
+	if ($_ =~ /$primary/) {
+	    $new_primary_entry = $1;    
+	}
+
+	my $secondary_patterns = $self->secondary_patterns;
+
+	foreach my $sec (@secondary_names) {
+	    my $pattern = $secondary_patterns->{$sec};
+
+	    if ($_ =~ /$pattern/) {
+		$secondary_id{$sec} = $1;
+	    }
+	}
+	
+    }
+
+    # Remeber to add in the last one
+
+    $id = $new_primary_entry;
+		
+    my $tmplen = tell($fh) - length($_);
+
+    $length = $tmplen  - $pos;
+    
+    if (!defined($id)) {
+	$self->throw("No id defined for sequence");
+    }
+    if (!defined($fileid)) {
+	$self->throw("No fileid defined for file $file");
+    }
+    if (!defined($pos)) {
+	$self->throw("No position defined for " . $id . "\n");
+    }
+    if (!defined($length)) {
+	$self->throw("No length defined for " . $id . "\n");
+    }
+    
+    $self->_add_id_position($id,$pos,$fileid,$length,\%secondary_id);
+    
+    close(FILE);
+}
+
+=head2 write_primary_index
+
+ Title   : write_primary_index
+ Usage   : $obj->write_primary_index($newval)
+ Function: 
+ Example : 
+ Returns : value of write_primary_index
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub write_primary_index {
+    my ($self) = @_;
+
+    my @ids = keys %{$self->{_id}};
+
+    @ids = sort {$a cmp $b} @ids;
+
+    print STDERR "Number of ids = " . scalar(@ids) . "\n";
+
+    open (INDEX,">" . $self->primary_index_file) || $self->throw("Can't open primary index file [" . $self->primary_index_file . "]");
+
+    my $recordlength = $self->{_maxidlength} +
+	               $self->{_maxfileidlength} + 
+	               $self->{_maxposlength} +
+   		       $self->{_maxlengthlength} + 3;
+	
+    
+    print INDEX sprintf("%4d",$recordlength);
+
+    foreach my $id (@ids) {
+
+	if (!defined($self->{_id}{$id}{_fileid})) {
+	    $self->throw("No fileid for $id\n");
+	}
+	if (!defined($self->{_id}{$id}{_pos})) {
+	    $self->throw("No position for $id\n");
+	}
+	if (!defined($self->{_id}{$id}{_length})) {
+	    $self->throw("No length for $id");
+	}
+
+	my $record =  $id              . "\t" . 
+	    $self->{_id}{$id}{_fileid} . "\t" .
+	    $self->{_id}{$id}{_pos}    . "\t" .
+	    $self->{_id}{$id}{_length};
+
+	print INDEX sprintf("%-${recordlength}s",$record);
+
+    }
+    close(INDEX);
+}
+
+=head2 write_secondary_indices
+
+ Title   : write_secondary_indices
+ Usage   : $obj->write_secondary_indices($newval)
+ Function: 
+ Example : 
+ Returns : value of write_secondary_indices
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub write_secondary_indices {
+    my ($self) = @_;
+
+    # These are the different 
+    my @names = keys (%{$self->{_secondary_id}});
+
+    
+    foreach my $name (@names) {
+
+	my @seconds = keys %{$self->{_secondary_id}{$name}};
+
+	# First we need to loop over to get the longest record.
+	my $length = 0;
+
+	foreach my $second (@seconds) {
+	    my $tmplen = length($second) + 1;
+	    my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
+
+	    foreach my $prim (@prims) {
+		my $recordlen = $tmplen + length($prim);
+	    
+		if ($recordlen > $length) {
+		    $length = $recordlen;
+		}
+	    }
+	}
+
+	# Now we can print the index
+	
+	my $fh = $self->new_secondary_filehandle($name);	
+
+	print $fh sprintf("%4d",$length);
+	@seconds = sort @seconds;
+	
+	foreach my $second (@seconds) {
+
+	    my @prims = keys %{$self->{_secondary_id}{$name}{$second}};
+	    my $tmp = $second;
+
+	    foreach my $prim (@prims) {
+		my $record = $tmp . "\t" . $prim;
+		if (length($record) > $length) {
+		    $self->throw("Something has gone horribly wrong - length of record is more than we thought [$length]\n");
+		} else {
+		    print $fh sprintf("%-${length}s",$record);
+		    print $fh sprintf("%-${length}s",$record);
+		}
+	    }
+	}
+		
+	close($fh);
+    }
+}
+
+=head2 new_secondary_filehandle
+
+ Title   : new_secondary_filehandle
+ Usage   : $obj->new_secondary_filehandle($newval)
+ Function: 
+ Example : 
+ Returns : value of new_secondary_filehandle
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub new_secondary_filehandle {
+    my ($self,$name) = @_;
+
+    my $indexdir = $self->index_directory;
+
+    my $secindex = $indexdir . $self->database_name . "/id_$name.index";
+
+    my $fh = new FileHandle(">$secindex");
+
+    return $fh;
+}
+
+=head2 open_secondary_index
+
+ Title   : open_secondary_index
+ Usage   : $obj->open_secondary_index($newval)
+ Function: 
+ Example : 
+ Returns : value of open_secondary_index
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub open_secondary_index {
+    my ($self,$name) = @_;
+
+    if (!defined($self->{_secondary_filehandle}{$name})) {
+
+	my $indexdir = $self->index_directory;
+	my $secindex = $indexdir . $self->database_name . "/id_$name.index";
+	
+	if (! -e $secindex) {
+	    $self->throw("Index is not present for namespace [$name]\n");
+	}
+
+        my $newfh  = new FileHandle("<$secindex");
+	my $reclen = $self->read_header($newfh);
+
+	$self->{_secondary_filehandle} {$name} = $newfh;
+	$self->{_secondary_record_size}{$name} = $reclen;
+    }
+
+    return $self->{_secondary_filehandle}{$name};
+
+}
+
+=head2 _add_id_position
+
+ Title   : _add_id_position
+ Usage   : $obj->_add_id_position($newval)
+ Function: 
+ Example : 
+ Returns : value of _add_id_position
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _add_id_position {
+    my ($self,$id,$pos,$fileid,$length,$secondary_id) = @_;
+
+    if (!defined($id)) {
+	$self->throw("No id defined. Can't add id position");
+    }
+    if (!defined($pos)) {
+v	$self->throw("No position defined. Can't add id position");
+    }
+    if (!defined($fileid)) {
+	$self->throw("No fileid defined. Can't add id position");
+    }
+    if (!defined($length) || $length <= 0) {
+	$self->throw("No length defined or <= 0 [$length]. Can't add id position");
+    }
+
+    $self->{_id}{$id}{_pos}    = $pos;
+    $self->{_id}{$id}{_length} = $length;
+    $self->{_id}{$id}{_fileid} = $fileid;
+    
+    # Now the secondary ids
+
+    foreach my $sec (keys (%$secondary_id)) {
+	my $value = $secondary_id->{$sec};
+
+	$self->{_secondary_id}{$sec}{$value}{$id} = 1;
+    }
+
+    if (length($id) >= $self->{_maxidlength}) {
+	$self->{_maxidlength} = length($id);
+    }
+
+    if (length($fileid) >= $self->{_maxfileidlength}) {
+	$self->{_maxfileidlength} = length($fileid);
+    }
+
+    if (length($pos) >= $self->{_maxposlength}) {
+	$self->{_maxposlength} = length($pos);
+    }
+
+    if (length($length) >= $self->{_maxlengthlength}) {
+	$self->{_maxlengthlength} = length($length);
+    }
+}
+
+=head2 make_indexdir
+
+ Title   : make_indexdir
+ Usage   : $obj->make_indexdir($newval)
+ Function: 
+ Example : 
+ Returns : value of make_indexdir
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub make_indexdir {
+    my ($self,$rootdir) = @_;
+
+    if (!defined($rootdir)) {
+	$self->throw("Must enter an index directory name for make_indexdir");
+    }
+    if (! -e $rootdir) {
+	$self->throw("Root index directory [$rootdir] doesn't exist");
+    }
+
+    if (! -d $rootdir) {
+	$self->throw("[$rootdir] exists but is not a directory");
+    }
+
+    if ($rootdir !~ /\/$/) {
+	$rootdir .= "/";
+    }
+
+    my $indexdir = $rootdir . $self->database_name;
+
+    if (! -e $indexdir) {
+	mkdir $indexdir,0755;
+    } else {
+	$self->throw("Index directory " . $indexdir . " already exists. Exiting\n");
+    }
+
+}
+
+=head2 make_config_file
+
+ Title   : make_config_file
+ Usage   : $obj->make_config_file($newval)
+ Function: 
+ Example : 
+ Returns : value of make_config_file
+ Args    : newvalue (optional)
+
+=cut
+
+sub make_config_file {
+    my ($self,$files) = @_;
+    
+    my @files = @$files;
+
+    my $dir = $self->index_directory;
+
+    my $configfile = $dir . $self->database_name . "/" .CONFIG_FILE_NAME;
+
+    open(CON,">$configfile") || $self->throw("Can't create config file [$configfile]");
+
+    # First line must be the type of index - in this case flat
+
+    print CON "index\tflat/1\n";
+
+    # Now the fileids
+
+    my $count = 0;
+
+    foreach my $file (@files) {
+
+	my $size = -s $file;
+
+	print CON "fileid_$count\t$file\t$size\n";
+
+	my $fh = new FileHandle("<$file");
+	$self->{_fileid}{$count}   = $fh;
+	$self->{_file}  {$count}   = $file;
+	$self->{_dbfile}{$file} = $count;
+	$self->{_size}{$count}     = $size; 
+	
+	$count++;
+    }
+
+    # Now the namespaces
+
+    print CON "primary_namespace\t" .$self->primary_namespace. "\n";
+    
+    # Needs fixing for the secondary stuff
+
+    my $second_patterns = $self->secondary_patterns;
+
+    my @second = keys %$second_patterns;
+
+    if ((@second))  {
+	print CON "secondary_namespaces";
+
+	foreach my $second (@second) {
+	    print CON "\t$second";
+	}
+        print CON "\n";
+    }
+
+    # Now the config format
+
+    if (!defined($self->format)) {
+	$self->throw("Format does not exist in module - can't write config file");
+    } else {
+	print CON "format\t" . $self->format . "\n";
+    }
+
+
+    close(CON);
+}
+
+=head2 read_config_file
+
+ Title   : read_config_file
+ Usage   : $obj->read_config_file($newval)
+ Function: 
+ Example : 
+ Returns : value of read_config_file
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub read_config_file {
+    my ($self) = @_;
+
+    my $dir = $self->index_directory . $self->database_name . "/";;
+
+    if (! -d $dir) {
+	$self->throw("No index directory [" . $dir  . "]. Can't read ".  CONFIG_FILE_NAME);
+    }
+    
+    my $configfile = $dir . CONFIG_FILE_NAME;
+    
+    if (! -e $configfile) {
+	$self->throw("No config file [$configfile]. Can't read namespace");
+    }
+    
+    open(CON,"<$configfile") || $self->throw("Can't open configfile [$configfile]");
+
+    # First line must be type
+
+    my $line = <CON>; chomp($line);
+    my $version;
+
+    # This is hard coded as we only index flatfiles here
+    if ($line =~ /index\tflat\/(\d+)/) {
+	$version = $1;
+    } else {
+	$self->throw("First line not compatible with flat file index.  Should be something like\n\nindex\tflat/1");
+    }
+
+    $self->index_type("flat");
+    $self->index_version($version);
+
+    while (<CON>) {
+	chomp;
+
+	# Look for fileid lines
+	if ($_ =~ /^fileid_(\d+)\t(\S+)\t(\d+)/) {
+	    my $fileid   = $1;
+	    my $filename = $2;
+	    my $filesize = $3;
+	    
+	    if (! -e $filename) {
+		$self->throw("File [$filename] does not exist!");
+	    }
+	    if (-s $filename != $filesize) {
+		$self->throw("Flatfile size for $filename differs from what the index thinks it is. Real size [" . (-s $filename) . "] Index thinks it is [" . $filesize  . "]");
+	    }
+		
+	    my $fh = new FileHandle("<$filename");
+
+	    $self->{_fileid}{$fileid}   = $fh;
+	    $self->{_file}  {$fileid}   = $filename;
+	    $self->{_dbfile}{$filename} = $fileid;
+            $self->{_size}  {$fileid}   = $filesize; 
+
+	}
+
+	# Look for namespace lines
+	if ($_ =~ /(.*)_namespace.*\t(\S+)/) {
+	    if ($1 eq "primary") {
+		$self->primary_namespace($2);
+	    } elsif ($1 eq "secondary") {
+		$self->secondary_namespaces($2);
+	    } else {
+		$self->throw("Unknown namespace name in config file [$1");
+	    }
+	}
+	
+	# Look for format lines
+
+	if ($_ =~ /format\t(\S+)/) {
+
+	    # Check the format here?
+
+	    $self->format($1);
+	}
+    }
+    close(CON);
+
+    # Now check we have all that we need
+
+    my @fileid_keys = keys (%{$self->{_fileid}});
+    
+    if (!(@fileid_keys)) {
+	$self->throw("No flatfile fileid files in config - check the index has been made correctly");
+    }
+
+    if (!defined($self->primary_namespace)) {
+	$self->throw("No primary namespace exists");
+    }
+
+    if (! -e $self->primary_index_file) {
+	$self->throw("Primary index file [" . $self->primary_index_file . "] doesn't exist");
+    }
+}
+
+=head2 get_fileid_by_filename
+
+ Title   : get_fileid_by_filename
+ Usage   : $obj->get_fileid_by_filename($newval)
+ Function: 
+ Example : 
+ Returns : value of get_fileid_by_filename
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub get_fileid_by_filename {
+    my ($self,$file) = @_;
+    
+    if (!defined($self->{_dbfile})) {
+	$self->throw("No file to fileid mapping present.  Has the fileid file been read?");
+    }
+
+    
+    return $self->{_dbfile}{$file};
+}
+
+=head2 get_filehandle_by_fileid
+
+ Title   : get_filehandle_by_fileid
+ Usage   : $obj->get_filehandle_by_fileid($newval)
+ Function: 
+ Example : 
+ Returns : value of get_filehandle_by_fileid
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub get_filehandle_by_fileid {
+    my ($self,$fileid) = @_;
+
+    if (!defined($self->{_fileid}{$fileid})) {
+	$self->throw("ERROR: undefined fileid in index [$fileid]");
+    }
+   
+    return $self->{_fileid}{$fileid};
+}
+
+=head2 primary_index_file
+
+ Title   : primary_index_file
+ Usage   : $obj->primary_index_file($newval)
+ Function: 
+ Example : 
+ Returns : value of primary_index_file
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub primary_index_file {
+    my ($self) = @_;
+
+    return $self->index_directory . $self->database_name . "/key_" . $self->primary_namespace . ".key";
+}
+
+=head2 primary_index_filehandle
+
+ Title   : primary_index_filehandle
+ Usage   : $obj->primary_index_filehandle($newval)
+ Function: 
+ Example : 
+ Returns : value of primary_index_filehandle
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub primary_index_filehandle {
+    my ($self) = @_;
+
+    if (!defined ($self->{_primary_index_handle})) {
+	$self->{_primary_index_handle} = new FileHandle("<" . $self->primary_index_file);
+    }
+    return $self->{_primary_index_handle};
+}
+
+=head2 database_name
+
+ Title   : database_name
+ Usage   : $obj->database_name($newval)
+ Function: 
+ Example : 
+ Returns : value of database_name
+ Args    : newvalue (optional)
+
+
+=cut
+
+
+sub database_name {
+    my ($self,$arg) = @_;
+
+    if (defined($arg)) {
+	$self->{_database_name} = $arg;
+    }
+    return $self->{_database_name};
+
+}
+
+=head2 format
+
+ Title   : format
+ Usage   : $obj->format($newval)
+ Function: 
+ Example : 
+ Returns : value of format
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub format{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'format'} = $value;
+    }
+    return $obj->{'format'};
+
+}
+
+=head2 index_directory
+
+ Title   : index_directory
+ Usage   : $obj->index_directory($newval)
+ Function: 
+ Example : 
+ Returns : value of index_directory
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub index_directory {
+    my ($self,$arg) = @_;
+
+    if (defined($arg)) {
+	if ($arg !~ /\/$/) {
+	    $arg .= "/";
+	}
+	$self->{_index_directory} = $arg;
+    }
+    return $self->{_index_directory};
+
+}
+
+=head2 record_size
+
+ Title   : record_size
+ Usage   : $obj->record_size($newval)
+ Function: 
+ Example : 
+ Returns : value of record_size
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub record_size {
+    my ($self,$arg) = @_;
+
+    if (defined($arg)) {
+      $self->{_record_size} = $arg;
+    }
+    return $self->{_record_size};
+}
+
+=head2 primary_namespace
+
+ Title   : primary_namespace
+ Usage   : $obj->primary_namespace($newval)
+ Function: 
+ Example : 
+ Returns : value of primary_namespace
+ Args    : newvalue (optional)
+
+=cut
+
+sub primary_namespace {
+  my ($self,$arg) =  @_;
+
+  if (defined($arg)) {
+    $self->{_primary_namespace} =  $arg;
+  }
+  return $self->{_primary_namespace};
+}
+
+=head2 index_type
+
+ Title   : index_type
+ Usage   : $obj->index_type($newval)
+ Function: 
+ Example : 
+ Returns : value of index_type
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub index_type {
+    my ($self,$arg) = @_;
+
+    if (defined($arg)) {
+	$self->{_index_type} = $arg;
+    }
+    return $self->{_index_type};
+}
+
+=head2 index_version
+
+ Title   : index_version
+ Usage   : $obj->index_version($newval)
+ Function: 
+ Example : 
+ Returns : value of index_version
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub index_version {
+    my ($self,$arg) = @_;
+
+    if (defined($arg)) {
+	$self->{_index_version} = $arg;
+    }
+    return $self->{_index_version};
+}
+
+=head2 primary_pattern
+
+ Title   : primary_pattern
+ Usage   : $obj->primary_pattern($newval)
+ Function: 
+ Example : 
+ Returns : value of primary_pattern
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub primary_pattern{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'primary_pattern'} = $value;
+    }
+
+    return $obj->{'primary_pattern'};
+
+}
+=head2 start_pattern
+
+ Title   : start_pattern
+ Usage   : $obj->start_pattern($newval)
+ Function: 
+ Example : 
+ Returns : value of start_pattern
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub start_pattern{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'start_pattern'} = $value;
+    }
+    return $obj->{'start_pattern'};
+
+}
+
+=head2 secondary_patterns
+
+ Title   : secondary_patterns
+ Usage   : $obj->secondary_patterns($newval)
+ Function: 
+ Example : 
+ Returns : value of secondary_patterns
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub secondary_patterns{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'secondary_patterns'} = $value;
+
+      my @names = keys %$value;
+
+      foreach my $name (@names) {
+	  $obj->secondary_namespaces($name);
+      }
+    }
+    return $obj->{'secondary_patterns'};
+
+}
+
+=head2 secondary_namespaces
+
+ Title   : secondary_namespaces
+ Usage   : $obj->secondary_namespaces($newval)
+ Function: 
+ Example : 
+ Returns : value of secondary_namespaces
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub secondary_namespaces{
+   my ($obj,$value) = @_;
+
+   if (!defined($obj->{secondary_namespaces})) {
+       $obj->{secondary_namespaces} = [];
+   }
+   if( defined $value) {
+       push(@{$obj->{'secondary_namespaces'}},$value);
+    }
+   return @{$obj->{'secondary_namespaces'}};
+
+}
+
+
+
+## These are indexing routines to index commonly used format - fasta
+## swissprot and embl
+
+sub new_SWISSPROT_index {
+    my ($self,$index_dir,$dbname,@files) = @_;
+    
+    my %secondary_patterns;
+    
+    my $start_pattern = "^ID   (\\S+)";
+    my $primary_pattern = "^AC   (\\S+)\\;";
+    
+    $secondary_patterns{"ID"} = $start_pattern;
+
+    my $index =  new Bio::DB::Flat::OBDAIndex(-index_dir          => $index_dir,
+					      -format             => 'swiss',
+					      -primary_pattern    => $primary_pattern,
+					      -primary_namespace  => "ACC",
+					      -start_pattern      => $start_pattern,
+					      -secondary_patterns => \%secondary_patterns);
+    
+    $index->make_index($dbname,@files);
+}
+
+sub new_EMBL_index {
+   my ($self,$index_dir,$dbname,@files) = @_;
+   
+   my %secondary_patterns;
+
+   my $start_pattern = "^ID   (\\S+)";
+   my $primary_pattern = "^AC   (\\S+)\\;";
+   my $primary_namespace = "ACC";
+
+   $secondary_patterns{"ID"} = $start_pattern;
+
+   my $index = new Bio::DB::Flat::OBDAIndex(-index_dir          => $index_dir,
+					    -format             => 'embl',
+					    -primary_pattern    => $primary_pattern,
+					    -primary_namespace  => "ACC",
+					    -start_pattern      => $start_pattern,
+					    -secondary_patterns => \%secondary_patterns);
+   
+    $index->make_index($dbname,@files);
+
+   return $index;
+}
+
+sub new_FASTA_index {
+   my ($self,$index_dir,$dbname,@files) =  @_;
+
+   my %secondary_patterns;
+
+   my $start_pattern = "^>";
+   my $primary_pattern = "^>(\\S+)";
+   my $primary_namespace = "ACC"; 
+
+   $secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
+
+   my $index =  new Bio::DB::Flat::OBDAIndex(-index_dir          => $index_dir,
+					     -format             => 'fasta',
+					     -primary_pattern    => $primary_pattern,
+					     -primary_namespace  => "ACC",
+					     -start_pattern      => $start_pattern,
+					     -secondary_patterns => \%secondary_patterns);
+   
+   $index->make_index($dbname,@files);
+
+   return $index;
+
+}
+
+
+
+1;
+
+	
+
+    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children