diff variant_effect_predictor/Bio/DB/Flat.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/DB/Flat.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,526 @@
+#
+# $Id: Flat.pm,v 1.6 2002/12/22 22:02:13 lstein Exp $
+#
+# BioPerl module for Bio::DB::Flat
+#
+# Cared for by Lincoln Stein <lstein@cshl.org>
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::DB::Flat - Interface for indexed flat files
+
+=head1 SYNOPSIS
+
+  $db = Bio::DB::Flat->new(-directory  => '/usr/share/embl',
+                           -format     => 'embl',
+                           -write_flag => 1);
+  $db->build_index('/usr/share/embl/primate.embl','/usr/share/embl/protists.embl');
+  $seq       = $db->get_Seq_by_id('BUM');
+  @sequences = $db->get_Seq_by_acc('DIV' => 'primate');
+  $raw       = $db->fetch_raw('BUM');
+
+=head1 DESCRIPTION
+
+This object provides the basic mechanism to associate positions in
+files with primary and secondary name spaces. Unlike
+Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized
+to work with the "flat index" and BerkeleyDB indexed flat file formats
+worked out at the 2002 BioHackathon.
+
+This object is a general front end to the underlying databases.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org             - General discussion
+  http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Lincoln Stein
+
+Email - lstein@cshl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal
+methods are usually preceded with an "_" (underscore).
+
+=cut
+
+
+# Let the code begin...
+package Bio::DB::Flat;
+
+use Bio::DB::RandomAccessI;
+use Bio::Root::Root;
+use Bio::Root::IO;
+use vars '@ISA';
+
+@ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI);
+
+use constant CONFIG_FILE_NAME => 'config.dat';
+
+=head2 new
+
+ Title   : new
+ Usage   : my $db = new Bio::Flat->new(
+                     -directory  => $root_directory,
+		     -write_flag => 0,
+                     -index      => 'bdb'|'flat',
+                     -verbose    => 0,
+		     -out        => 'outputfile',
+                     -format     => 'genbank');
+ Function: create a new Bio::Index::BDB object
+ Returns : new Bio::Index::BDB object
+ Args    : -directory    Root directory containing "config.dat"
+           -write_flag   If true, allows reindexing.
+           -verbose      Verbose messages
+           -out          File to write to when write_seq invoked
+ Status  : Public
+
+The root -directory indicates where the flat file indexes will be
+stored.  The build_index() and write_seq() methods will automatically
+create a human-readable configuration file named "config.dat" in this
+file.
+
+The -write_flag enables writing new entries into the database as well
+as the creation of the indexes.  By default the indexes will be opened
+read only.
+
+-index is one of "bdb" or "flat" and indicates the type of index to
+generate.  "bdb" corresponds to Berkeley DB.  You *must* be using
+BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB extension
+installed (DB_File will *not* work).
+
+The -out argument species the output file for writing objects created
+with write_seq().  
+
+=cut
+
+sub new {
+  my $class = shift;
+  $class  = ref($class) if ref($class);
+  my $self = $class->SUPER::new(@_);
+
+  # first we initialize ourselves
+  my ($flat_directory) = @_ == 1 ? shift
+                                 : $self->_rearrange([qw(DIRECTORY)],@_);
+
+  # set values from configuration file
+  $self->directory($flat_directory);
+  $self->_read_config() if -e $flat_directory;
+
+  # but override with initialization values
+  $self->_initialize(@_);
+
+  # now we figure out what subclass to instantiate
+  my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB'
+                  :$self->indexing_scheme eq 'flat/1'       ? 'Flat'
+                  :$self->throw("unknown indexing scheme: ".$self->indexing_scheme);
+  my $format     = $self->file_format;
+  my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format";
+  eval "use $child_class";
+  $self->throw($@) if $@;
+
+  # rebless & reinitialize with the new class
+  # (this prevents subclasses from forgetting to call our own initialization)
+  bless $self,$child_class;
+  $self->_initialize(@_);
+  $self->_set_namespaces(@_);
+
+  $self;
+}
+
+sub _initialize {
+  my $self = shift;
+
+  my ($flat_write_flag,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format)
+    = $self->_rearrange([qw(WRITE_FLAG INDEX VERBOSE OUT FORMAT)],@_);
+
+  $self->write_flag($flat_write_flag) if defined $flat_write_flag;
+
+  if (defined $flat_indexing) {
+    # very permissive
+    $flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/;
+    $flat_indexing = 'flat/1'       if $flat_indexing =~ /flat/;
+    $self->indexing_scheme($flat_indexing);
+  }
+
+  $self->verbose($flat_verbose)    if defined $flat_verbose;
+  $self->out_file($flat_outfile)   if defined $flat_outfile;
+  $self->file_format($flat_format) if defined $flat_format;
+}
+
+sub _set_namespaces {
+  my $self = shift;
+
+  $self->primary_namespace($self->default_primary_namespace)
+    unless defined $self->{flat_primary_namespace};
+
+  $self->secondary_namespaces($self->default_secondary_namespaces)
+    unless defined $self->{flat_secondary_namespaces};
+
+  $self->file_format($self->default_file_format)
+    unless defined $self->{flat_format};
+}
+
+# accessors
+sub directory {
+  my $self = shift;
+  my $d = $self->{flat_directory};
+  $self->{flat_directory} = shift if @_;
+  $d;
+}
+sub write_flag {
+  my $self = shift;
+  my $d = $self->{flat_write_flag};
+  $self->{flat_write_flag} = shift if @_;
+  $d;
+}
+sub verbose {
+  my $self = shift;
+  my $d = $self->{flat_verbose};
+  $self->{flat_verbose} = shift if @_;
+  $d;
+}
+sub out_file {
+  my $self = shift;
+  my $d = $self->{flat_outfile};
+  $self->{flat_outfile} = shift if @_;
+  $d;
+}
+
+sub primary_namespace {
+  my $self = shift;
+  my $d    = $self->{flat_primary_namespace};
+  $self->{flat_primary_namespace} = shift if @_;
+  $d;
+}
+
+# get/set secondary namespace(s)
+# pass an array ref.
+# get an array ref in scalar context, list in list context.
+sub secondary_namespaces {
+  my $self = shift;
+  my $d    = $self->{flat_secondary_namespaces};
+  $self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_;
+  return unless $d;
+  $d = [$d] if $d && ref($d) ne 'ARRAY';  # just paranoia
+  return wantarray ? @$d : $d;
+}
+
+# return the file format
+sub file_format {
+  my $self = shift;
+  my $d    = $self->{flat_format};
+  $self->{flat_format} = shift if @_;
+  $d;
+}
+
+# return the indexing scheme
+sub indexing_scheme {
+  my $self = shift;
+  my $d    = $self->{flat_indexing};
+  $self->{flat_indexing} = shift if @_;
+  $d;
+}
+
+sub add_flat_file {
+  my $self = shift;
+  my ($file_path,$file_length,$nf) = @_;
+
+  # check that file_path is absolute
+  File::Spec->file_name_is_absolute($file_path)
+      or $self->throw("the flat file path $file_path must be absolute");
+
+  -r $file_path or $self->throw("flat file $file_path cannot be read: $!");
+
+  my $current_size = -s _;
+  if (defined $file_length) {
+    $current_size == $file_length
+      or $self->throw("flat file $file_path has changed size.  Was $file_length bytes; now $current_size");
+  } else {
+    $file_length = $current_size;
+  }
+
+  unless (defined $nf) {
+    $self->{flat_file_index} = 0 unless exists $self->{flat_file_index};
+    $nf = $self->{flat_file_index}++;
+  }
+  $self->{flat_flat_file_path}{$nf}      = $file_path;
+  $self->{flat_flat_file_no}{$file_path} = $nf;
+  $nf;
+}
+
+sub write_config {
+  my $self = shift;
+  $self->write_flag or $self->throw("cannot write configuration file because write_flag is not set");
+  my $path = $self->_config_path;
+
+  open (F,">$path") or $self->throw("open error on $path: $!");
+
+  my $index_type = $self->indexing_scheme;
+  print F "index\t$index_type\n";
+
+  my $format     = $self->file_format;
+  print F "format\t$format\n";
+
+  my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined");
+  for my $nf (@filenos) {
+    my $path = $self->{flat_flat_file_path}{$nf};
+    my $size = -s $path;
+    print F join("\t","fileid_$nf",$path,$size),"\n";
+  }
+
+  # write primary namespace
+  my $primary_ns = $self->primary_namespace
+    or $self->throw('cannot write config file because no primary namespace defined');
+
+  print F join("\t",'primary_namespace',$primary_ns),"\n";
+
+  # write secondary namespaces
+  my @secondary = $self->secondary_namespaces;
+  print F join("\t",'secondary_namespaces',@secondary),"\n";
+
+  close F or $self->throw("close error on $path: $!");
+}
+
+sub files {
+  my $self = shift;
+  return unless $self->{flat_flat_file_no};
+  return keys %{$self->{flat_flat_file_no}};
+}
+
+sub write_seq {
+  my $self  = shift;
+  my $seq   = shift;
+
+  $self->write_flag or $self->throw("cannot write sequences because write_flag is not set");
+
+  my $file  = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()');
+  my $seqio = $self->{flat_cached_parsers}{$file}
+    ||= Bio::SeqIO->new(-Format => $self->file_format,
+			-file   => ">$file")
+      or $self->throw("couldn't create Bio::SeqIO object");
+
+  my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object");
+  my $offset    = tell($fh);
+  $seqio->write_seq($seq);
+  my $length    = tell($fh)-$offset;
+  my $ids       = $self->seq_to_ids($seq);
+  $self->_store_index($ids,$file,$offset,$length);
+
+  $self->{flat_outfile_dirty}++;
+}
+
+sub close {
+  my $self = shift;
+  return unless $self->{flat_outfile_dirty};
+  $self->write_config;
+  delete $self->{flat_outfile_dirty};
+  delete $self->{flat_cached_parsers}{$self->out_file};
+}
+
+
+sub _filenos {
+  my $self = shift;
+  return unless $self->{flat_flat_file_path};
+  return keys %{$self->{flat_flat_file_path}};
+}
+
+# read the configuration file
+sub _read_config {
+  my $self   = shift;
+  my $config = shift;
+
+  my $path = defined $config ? Bio::Root::IO->catfile($config,CONFIG_FILE_NAME) 
+                             : $self->_config_path;
+  return unless -e $path;
+
+  open (F,$path) or $self->throw("open error on $path: $!");
+  my %config;
+  while (<F>) {
+    chomp;
+    my ($tag,@values) = split "\t";
+    $config{$tag} = \@values;
+  }
+  CORE::close F or $self->throw("close error on $path: $!");
+
+  $config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~
+    or $self->throw("invalid configuration file $path: no index line");
+
+  $self->indexing_scheme($1);
+
+  $self->file_format($config{format}[0]) if $config{format};
+
+  # set up primary namespace
+  my $primary_namespace = $config{primary_namespace}[0]
+    or $self->throw("invalid configuration file $path: no primary namespace defined");
+  $self->primary_namespace($primary_namespace);
+
+  # set up secondary namespaces (may be empty)
+  $self->secondary_namespaces($config{secondary_namespaces});
+
+  # get file paths and their normalization information
+  my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config;
+  for my $nf (@normalized_files) {
+    my ($file_path,$file_length) = @{$config{"fileid_${nf}"}};
+    $self->add_flat_file($file_path,$file_length,$nf);
+  }
+  1;
+}
+
+
+sub _config_path {
+  my $self = shift;
+  $self->_catfile($self->_config_name);
+}
+
+sub _catfile {
+  my $self = shift;
+  my $component = shift;
+  Bio::Root::IO->catfile($self->directory,$component);
+}
+
+sub _config_name { CONFIG_FILE_NAME }
+
+sub _path2fileno {
+  my $self = shift;
+  my $path = shift;
+  return $self->add_flat_file($path)
+    unless exists $self->{flat_flat_file_no}{$path};
+  $self->{flat_flat_file_no}{$path};
+}
+
+sub _fileno2path {
+  my $self = shift;
+  my $fileno = shift;
+  $self->{flat_flat_file_path}{$fileno};
+}
+
+sub _files {
+  my $self = shift;
+  my $paths = $self->{flat_flat_file_no};
+  return keys %$paths;
+}
+
+=head2 fetch
+
+  Title   : fetch
+  Usage   : $index->fetch( $id )
+  Function: Returns a Bio::Seq object from the index
+  Example : $seq = $index->fetch( 'dJ67B12' )
+  Returns : Bio::Seq object
+  Args    : ID
+
+Deprecated.  Use get_Seq_by_id instead.
+
+=cut
+
+sub fetch { shift->get_Seq_by_id(@_) }
+
+
+=head2 To Be Implemented in Subclasses
+
+The following methods MUST be implemented by subclasses.
+
+=cut
+
+# create real live Bio::Seq object
+sub get_Seq_by_id {
+  my $self = shift;
+  my $id   = shift;
+  $self->throw_not_implemented;
+}
+
+
+# fetch array of Bio::Seq objects
+sub get_Seq_by_acc {
+  my $self = shift;
+  return $self->get_Seq_by_id(shift) if @_ == 1;
+  my ($ns,$key) = @_;
+
+  $self->throw_not_implemented;
+}
+
+sub fetch_raw {
+  my ($self,$id,$namespace) = @_;
+  $self->throw_not_implemented;
+}
+
+# This is the method that must be implemented in
+# child classes.  It is passed a filehandle which should
+# point to the next record to be indexed in the file, 
+# and returns a two element list
+# consisting of a key and an adjustment value.
+# The key can be a scalar, in which case it is treated
+# as the primary ID, or a hashref containing namespace=>[id] pairs,
+# one of which MUST correspond to the primary namespace.
+# The adjustment value is normally zero, but can be a positive or
+# negative integer which will be added to the current file position
+# in order to calculate the correct end of the record.
+sub parse_one_record {
+  my $self = shift;
+  my $fh   = shift;
+  $self->throw_not_implemented;
+  # here's what you would implement
+  my (%keys,$offset);
+  return (\%keys,$offset);
+}
+
+sub default_file_format {
+  my $self = shift;
+  $self->throw_not_implemented;
+}
+
+sub _store_index {
+   my ($ids,$file,$offset,$length) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 May Be Overridden in Subclasses
+
+The following methods MAY be overridden by subclasses.
+
+=cut
+
+sub default_primary_namespace {
+  return "ACC";
+}
+
+sub default_secondary_namespaces {
+  return;
+}
+
+sub seq_to_ids {
+  my $self = shift;
+  my $seq  = shift;
+  my %ids;
+  $ids{$self->primary_namespace} = $seq->accession_number;
+  \%ids;
+}
+
+sub DESTROY {
+  my $self = shift;
+  $self->close;
+}
+
+
+1;