diff variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,253 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::Flatfile::DumpFile
+
+=head1 DESCRIPTION
+
+The main workhorse of the Flatfile dumping pipeline.
+
+This module is responsible for creating the file names of the target
+files, taking data from the database and formatting the flat file
+headers. The final files are all gzipped at normal levels of compression.
+
+Allowed parameters are:
+
+=over 8
+
+=item species - The species to dump
+
+=item base_path - The base of the dumps
+
+=item release - The current release we are emitting
+
+=item type - The type of data we are emitting. Should be embl or genbank
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile;
+
+use strict;
+use warnings;
+
+use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base);
+
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use Bio::EnsEMBL::Utils::SeqDumper;
+use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file work_with_file/;
+use File::Path qw/rmtree/;
+
+# Returns the default parameters for this module as a hash reference. The
+# only default is 'supported_types', a lookup of the flat file formats
+# ('embl' and 'genbank') that fetch_input() checks the 'type' parameter
+# against. A new hash is built on every call.
+sub param_defaults {
+  my ($self) = @_;
+  my %supported_types = map { $_ => 1 } qw/embl genbank/;
+  my %defaults = (supported_types => \%supported_types);
+  return \%defaults;
+}
+
+# Validates the job parameters. Throws when 'type' is missing or is not
+# flagged in the 'supported_types' parameter, and when any of 'species',
+# 'release' or 'base_path' is missing or false. Returns nothing.
+sub fetch_input {
+  my ($self) = @_;
+
+  my $type = $self->param('type');
+  if(! $type) {
+    throw("No type specified");
+  }
+  if(! $self->param('supported_types')->{$type}) {
+    throw("Unsupported type '$type' specified");
+  }
+
+  # Checked in a fixed order so the first missing parameter is the one reported
+  foreach my $required (qw/species release base_path/) {
+    throw("Need a $required") if ! $self->param($required);
+  }
+
+  return;
+}
+
+# Dumps every slice returned by get_Slices() using the SeqDumper method
+# named after the 'type' parameter (dump_<type>). All non-chromosomal
+# slices are written into one gzipped file; each chromosome slice is written
+# to a gzipped file named from its coordinate system and sequence region
+# name. A README is created once all slices have been dumped.
+sub run {
+  my ($self) = @_;
+
+  # An existing dump directory is removed before anything is written
+  my $dump_dir = $self->data_path();
+  if(-d $dump_dir) {
+    $self->info('Directory "%s" already exists; removing', $dump_dir);
+    rmtree($dump_dir);
+  }
+
+  # The dumper method is selected by name, e.g. dump_embl or dump_genbank
+  my $format = $self->param('type');
+  my $dump_method = 'dump_'.$format;
+  my $dumper = $self->_seq_dumper();
+
+  # Split the slices into chromosomes and everything else
+  my (@chromosomes, @others);
+  foreach my $candidate (@{$self->get_Slices()}) {
+    if($candidate->is_chromosome()) {
+      push(@chromosomes, $candidate);
+    }
+    else {
+      push(@others, $candidate);
+    }
+  }
+
+  if(! @others) {
+    $self->info('Did not find any non-chromosomal data');
+  }
+  else {
+    my $bundle_path = $self->_generate_file_name('nonchromosomal');
+    $self->info('Dumping non-chromosomal data to %s', $bundle_path);
+    my $write_bundle = sub {
+      my ($fh) = @_;
+      foreach my $region (@others) {
+        $self->fine('Dumping non-chromosomal %s', $region->name());
+        $dumper->$dump_method($region, $fh);
+      }
+      return;
+    };
+    gz_work_with_file($bundle_path, 'w', $write_bundle);
+  }
+
+  foreach my $chromosome (@chromosomes) {
+    $self->fine('Dumping chromosome %s', $chromosome->name());
+    my $chr_path = $self->_generate_file_name(
+      $chromosome->coord_system_name(),
+      $chromosome->seq_region_name()
+    );
+
+    # When the target file is already present the Append option is passed on
+    # to gz_work_with_file
+    my %gz_args;
+    if(-f $chr_path) {
+      $self->fine('Path "%s" already exists; appending', $chr_path);
+      $gz_args{Append} = 1;
+    }
+
+    my $write_chromosome = sub {
+      my ($fh) = @_;
+      $dumper->$dump_method($chromosome, $fh);
+      return;
+    };
+    gz_work_with_file($chr_path, 'w', $write_chromosome, \%gz_args);
+  }
+
+  $self->_create_README();
+
+  return;
+}
+
+# Creates a new Bio::EnsEMBL::Utils::SeqDumper with the similarity, genscan,
+# variation and repeat feature types disabled, and returns it.
+sub _seq_dumper {
+  my ($self) = @_;
+  my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
+  foreach my $feature_type (qw/similarity genscan variation repeat/) {
+    $dumper->disable_feature_type($feature_type);
+  }
+  return $dumper;
+}
+
+# Builds the full path of a dump file located in data_path().
+#
+# File name format looks like:
+# <species>.<assembly>.<release>.<section.name|section>.dat.gz
+# e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
+#      Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
+#
+# Arguments:
+#   $section - optional section of the file name (e.g. 'nonchromosomal' or a
+#              coordinate system name); left out when undefined or empty
+#   $name    - optional name within the section (e.g. a sequence region
+#              name); left out when undefined or empty
+# Returns the path as built by File::Spec->catfile().
+sub _generate_file_name {
+  my ($self, $section, $name) = @_;
+
+  # Loaded explicitly so this method does not depend on some other module
+  # having already pulled File::Spec in
+  require File::Spec;
+
+  my @name_bits = (
+    $self->web_name(),
+    $self->assembly(),
+    $self->param('release'),
+  );
+
+  # Test for defined-and-non-empty rather than plain truth so that a value
+  # of "0" (e.g. a sequence region called 0) still appears in the file name
+  foreach my $optional_bit ($section, $name) {
+    next if ! defined $optional_bit;
+    next if ! length $optional_bit;
+    push @name_bits, $optional_bit;
+  }
+
+  push @name_bits, 'dat', 'gz';
+
+  my $file_name = join('.', @name_bits);
+  return File::Spec->catfile($self->data_path(), $file_name);
+}
+
+# Writes a file called README into data_path(). The text is a fixed template
+# into which the value of scientific_name() and the upper-cased 'type'
+# parameter are interpolated. Returns nothing.
+sub _create_README {
+  my ($self) = @_;
+  my $species = $self->scientific_name();
+  my $format = uc($self->param('type'));
+
+  # Interpolating heredoc: $species and $format are substituted below
+  my $contents = <<"README";
+#### README ####
+
+IMPORTANT: Please note you can download correlation data tables, 
+supported by Ensembl, via the highly customisable BioMart and 
+EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
+http://www.ebi.ac.uk/biomart/ for more information.
+
+-----------------------
+$format FLATFILE DUMPS
+-----------------------
+This directory contains $species $format flatfile dumps.  To ease 
+downloading of the files, the $format format entries are bundled 
+into groups of chromosomes and non-chromosomal regions.  
+All files are then compacted with gzip.
+
+Ensembl provides an automatic reannotation of $species genomic data.
+These data will be dumped in a number of forms - one of them being 
+$format flat files.  As the annotation of this form comes from Ensembl, 
+and not the original sequence entry, the two annotations are 
+likely to be different.
+
+$format flat file format dumping provides all the confirmed protein coding 
+genes known by Ensembl. Considerably more information is stored in Ensembl: 
+the flat file just gives a representation which is compatible with 
+existing tools.
+
+The main body of the entry gives the same information as is in the main 
+$format flat file entry.
+
+    * ID - the $format id
+    * AC - the EMBL/GenBank/DDBJ accession number (only the primary 
+           accession number used)
+    * SV - The accession.version pair which gives the exact reference to 
+           a particular sequence
+    * CC - comment lines to help you interpret the entry 
+
+Currently the following features are dumped into the feature table of 
+the Ensembl entry:
+
+    * Transcripts as CDS entries. Each transcript has the following 
+      attributes attached
+          o Transcript id - a stable id, which Ensembl will attempt to 
+            preserve as sensibly as possible during updates of the data
+          o Gene id - indication of the gene that this transcript belongs 
+            to. gene ids are stable and preserved as sensibly as possible 
+            during updates of the data
+          o Translation - the peptide translation of the transcript. 
+    * Exons as exon entries. Each exon has the following information
+          o Exon id. The exon id is stable and preserved as sensibly 
+            as possible during sequence updates
+          o start_phase. The phase of the splice site at the 5' end 
+            of the exon. Phase 0 means between two codons, phase 1 
+            means between the first and the second base of the codon 
+            (meaning that there are 2 bases until the reading frame of 
+            the exon) and phase 2 means between the second and the third 
+            base of the codon (one base until the reading frame starts).
+          o end_phase. The phase of the splice site at the 3' end of the 
+            exon: same definition as above (though of course, being end_phase, 
+            the position relative to the exon's reading frame is different 
+            for phase 1 and 2). 
+
+We are considering other information that should be made dumpable. In 
+general we would prefer people to use database access over flat file 
+access if you want to do something serious with the data. 
+
+README
+
+  my $readme_path = File::Spec->catfile($self->data_path(), 'README');
+  my $write_readme = sub {
+    my ($fh) = @_;
+    print {$fh} $contents;
+    return;
+  };
+  work_with_file($readme_path, 'w', $write_readme);
+  return;
+}
+
+
+1;
+