Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,976 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::DumpFile
+
+=head1 DESCRIPTION
+
+The main workhorse of the FASTA dumping pipeline. This module has two
+functions
+
+=over 8 
+
+=item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats
+
+=item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included)
+
+=back
+
+The script is responsible for creating the filenames of these target
+files, taking data from the database and the formatting of the FASTA
+headers. It is also responsible for the creation of README files pertaining
+to the type of dumps produced. The final files are all Gzipped at normal
+levels of compression.
+
+B<N.B.> This code will remove any files already found in the target directory
+on its first run as it assumes all data will be dumped in the one process. It
+is selective of its directory meaning a rerun of DNA dumps will not cause
+the protein/cdna files to be removed.
+
+Allowed parameters are:
+
+=over 8
+
+=item species - The species to dump
+
+=item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed
+
+=item release - A required parameter for the version of Ensembl we are dumping for
+
+=item db_types - Array reference of the database groups to use. Defaults to core
+
+=item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). Applied before skip_logic_names
+
+=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)
+
+
+=item base_path - The base of the dumps
+
+=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and 
+                       process when formatting FASTA files. Normally do not 
+                       touch
+
+=item allow_appending - If the same file name is generated we will 
+                        append into that file rather than overwriting
+
+=item overwrite_files - If the same file name is generated we will overwrite 
+                        the into that file rather than appending
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;
+
+use strict;
+use warnings;
+
+use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);
+
+use File::Spec;
+use IO::Compress::Gzip qw/gzip $GzipError/;
+use IO::File;
+use Bio::EnsEMBL::PaddedSlice;
+use Bio::EnsEMBL::Utils::BiotypeMapper;
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
+use Bio::EnsEMBL::Utils::IO::FASTASerializer;
+use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;
+
+my $DNA_INDEXING_FLOW = 1;
+my $PEPTIDE_INDEXING_FLOW = 2;
+my $GENE_INDEXING_FLOW = 3;
+
+sub param_defaults {
+  my ($self) = @_;
+  return {
+    #user configurable
+    allow_appending => 1,
+    overwrite_files => 0,
+    
+    dna_chunk_size => 17000,
+    
+    skip_logic_names => [],
+    process_logic_names => [],
+    
+    #DON'T MESS
+    #used to track if we need to reopen a file in append mode or not
+    generated_files => {}, 
+    remove_files_from_dir => {},
+    dataflows => []
+  };
+}
+
+sub fetch_input {
+  my ($self) = @_;
+  
+  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
+  $self->param('sequence_types', \%sequence_types);
+  
+  my $dba = $self->get_DBAdaptor();
+  my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
+  $self->param('analyses', $analyses);
+  
+  my $types = $self->param('db_types');
+  $types = ['core'] unless $types;
+  $self->param('db_types', $types);
+  
+  my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
+  $self->param('skip_logic', \%skip_logic_names);
+  $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};
+  my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
+  $self->param('process_logic', \%process_logic_names);
+  $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};
+  
+  return;
+}
+
+sub run {
+  my ($self) = @_;
+  my $types = $self->param('db_types');
+  foreach my $type (@{$types}) {
+    my $dba = $self->get_DBAdaptor($type);
+    if(! $dba) {
+      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
+      next;
+    }
+    $self->run_type($type);
+  }
+  return;
+}
+
+sub write_output {
+  my ($self) = @_;
+  my $dataflows = $self->param('dataflows');
+  foreach my $flow (@{$dataflows}) {
+    $self->dataflow_output_id(@{$flow});
+  }
+  return;
+}
+
+sub run_type {
+  my ($self, $type) = @_;
+
+  my $species = $self->param('species');
+  my $sequence_types = $self->param('sequence_types');
+
+  # dump file for each type on a per slice basis
+  # types are dna,cDNA, peptide, ncRNA
+
+  #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref
+  my $dba = $self->get_DBAdaptor($type);
+  if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
+    $self->info( "Starting dna dump for " . $species );
+    $self->_dump_dna($type);
+    $self->_create_README('dna');
+  }
+  
+  if ( $sequence_types->{cdna} ) { #includes peptides whether you like it or not
+    $self->info( "Starting cdna dump for " . $species );
+    my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);
+    
+    $self->info( "Starting prediction transcript dumps for " . $species );
+    my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);
+    
+    $self->_create_README('cdna') if $transcripts || $pred_transcripts;
+    $self->_create_README('pep') if $peptide || $pred_proteins;
+  }
+  if ( $sequence_types->{ncrna} ) {
+    $self->info( "Starting ncRNA dump for " . $species );
+    my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
+    $self->_create_README('ncrna') if $ncrna_transcripts;
+  }
+
+  $self->cleanup_DBAdaptor($type);
+}
+
+# Dump entire sequence, also dump data into chromosome files as appropriate
+sub _dump_dna {
+  my ($self,$type) = @_;
+  
+  my @chromosomes;
+  my @non_chromosomes;
+  my $filter_human = 1;
+  foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
+    my $chr = $s->is_chromosome();
+    push(@chromosomes, $s) if $chr;
+    push(@non_chromosomes, $s) if ! $chr;
+  }
+  
+  ############ NON CHROMOSOME WORK
+  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
+  if(@non_chromosomes) {
+    my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
+      $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
+    my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
+      $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );
+    foreach my $s (@non_chromosomes) {
+      $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
+    }
+    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
+    $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
+    my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);
+    
+    $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
+    $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
+    $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file);
+    $self->info('Dumped non-chromosomes');
+  }
+
+  ############ CHROMOSOME WORK 
+  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
+  foreach my $s (@chromosomes) {
+    my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
+      $self->_generate_fasta_serializer( 'dna', 'chromosome',
+      $s->seq_region_name(), undef);
+    # repeat masked data too
+    my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
+      $self->_generate_fasta_serializer( 'dna_sm', 'chromosome',
+      $s->seq_region_name(), undef);
+    
+    $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);
+    
+    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
+    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
+    my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);
+    
+    $self->tidy_file_handle($chromo_fh, $chromo_file_name);
+    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
+    $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
+  }
+  $self->info("Dumped chromosomes");
+  
+  #input_id
+  push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
+  push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
+  push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
+  
+  return;
+}
+
+sub _dump_slice {
+  my ($self, $s, $serialiser, $rm_serialiser) = @_;
+  
+  my $analyses = $self->param('analyses');
+  
+  my $chr = $s->is_chromosome();
+  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
+  $self->info('    Slice is a chromosome') if $chr;
+  $self->info('    Slice is non-chromosomal') if ! $chr;
+  
+  # Make a padded slice (to automatically pad with N's outside of known regions)
+  # and make a repeat-masked slice and then pad that too.
+  my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
+  $serialiser->print_Seq($padded_slice);
+  
+  my $soft_mask = 1;
+  my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
+  my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
+  $rm_serialiser->print_Seq($padded_masked_slice);  
+  
+  return;
+}
+
+#Assumes we are working with un-compressed files
+sub _convert_softmask_to_hardmask {
+  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
+  if(! -f $soft_mask_file) {
+    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
+    return;
+  }
+  my $hard_mask_file = $soft_mask_file;
+  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
+  my $hm_fh = IO::File->new($hard_mask_file, 'w');
+  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
+  work_with_file($soft_mask_file, 'r', sub {
+    my ($sm_fh) = @_;
+    while(my $line = <$sm_fh>) {
+      if(index($line, '>') == 0) {
+        $line =~ s/dna_sm/dna_rm/;
+      }
+      else {
+        $line =~ tr/[acgtn]/N/;
+      }
+      print $hm_fh $line;
+    };
+    return;
+  });
+  return ($hm_fh, $hard_mask_file);
+}
+
+sub _dump_transcripts {
+  my ($self, $transcript_type, $type) = @_;
+  
+  my $has_transcript_data = 0;
+  my $has_protein_data = 0;
+
+  my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
+  my ( $filename, $fh, $transcript_serializer ) =
+    $self->_generate_fasta_serializer( $transcript_type, $transcript_level );
+
+  my ( $peptide_filename, $pep_fh, $peptide_serializer );
+
+  # some cDNAs are translated, make a file to receive them.
+  if ( $transcript_type eq 'cdna') {
+    ( $peptide_filename, $pep_fh, $peptide_serializer ) =
+      $self->_generate_fasta_serializer( 'pep', 'all' );
+  }
+
+  # work out what biotypes correspond to $transcript_type
+  my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
+  my $biotypes_list  = $biotype_mapper->group_members($transcript_type);
+
+  my $dba          = $self->get_DBAdaptor($type);
+  my $gene_adaptor = $dba->get_GeneAdaptor();
+
+  # get all the transcripts that are $transcript_type e.g. cdna, ncrna,
+  foreach my $biotype ( @{$biotypes_list} ) {
+    my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
+    $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
+    while ( my $gene = shift @{$gene_list} ) {
+      $self->fine( 'Gene %s', $gene->display_id );
+      my $transcript_list = $gene->get_all_Transcripts();
+      foreach my $transcript ( @{$transcript_list} ) {
+        $self->fine( 'Transcript %s', $transcript->display_id );
+        next unless $self->ok_to_process_logic_name($transcript);
+
+        # foreach transcripts of all genes with biotypes classed as cdna
+        my $transcript_seq = $transcript->seq();
+        $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
+        $transcript_serializer->print_Seq($transcript_seq);
+        if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) {
+          my $translation = $transcript->translation();
+          if ($translation) {
+            my $translation_seq = $transcript->translate();
+            $self->_create_display_id($translation, $translation_seq, $transcript_type);
+            $peptide_serializer->print_Seq($translation_seq);
+            
+            $has_protein_data = 1;
+          }
+        }
+        
+        $has_transcript_data = 1;
+      }
+    }
+  }
+
+  $self->tidy_file_handle( $fh, $filename );
+  if ( $transcript_type eq 'cdna' ) {
+    $self->tidy_file_handle( $pep_fh, $peptide_filename );
+  }
+  
+  if($has_protein_data) {
+    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
+  }
+  if($has_transcript_data) {
+    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
+  }
+  
+  return ($has_transcript_data, $has_protein_data);
+}
+
+# Dump prediction transcripts and peptides. All predicted transcripts have translations
+sub _dump_prediction_transcripts {
+  my ($self, $type) = @_;
+  my $dba  = $self->get_DBAdaptor($type);
+  
+  my $has_transcript_data = 0;
+  my $has_protein_data = 0;
+  
+  my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
+  my $transcript_list = $prediction_transcript_adaptor->fetch_all();
+  my $count = scalar(@{$transcript_list});
+  $self->info('Found %d prediction transcript(s)', $count);
+  if($count) {
+    my ( $abinitio_filename, $fh, $abinitio_serializer ) =
+      $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
+    my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
+      $self->_generate_fasta_serializer( 'pep', 'abinitio' );
+    
+    while ( my $transcript = shift @{$transcript_list} ) {
+      next unless $self->ok_to_process_logic_name($transcript);
+      
+      $has_transcript_data = 1;
+      my $transcript_seq = $transcript->seq();
+      $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
+      $abinitio_serializer->print_Seq($transcript_seq);
+    
+      my $translation_seq = $transcript->translate();
+      if ( $transcript->translation() ) {
+        $has_protein_data = 1;
+        $self->_create_display_id( $transcript, $translation_seq, 'pep' );
+        $abinitio_peptide_serializer->print_Seq($translation_seq);
+      }
+    }
+    
+    $self->tidy_file_handle( $fh,     $abinitio_filename );
+    $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );
+    
+    if($has_protein_data) {
+      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
+    }
+    if($has_transcript_data) {
+      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
+    }
+  }
+  
+
+  return ($has_transcript_data, $has_protein_data);
+}
+
+# We can optionally skip the Gzip process & just delegate to the super class
+# for it's cleanup routines which only work with an open file handle. Therefore 
+# only pass it onto the super implementation *if* the handle was open. 
+# Also only Gzip if the source file exists (it could have been unlinked from
+# an earlier call)
+
+sub tidy_file_handle {
+  my ($self, $fh, $path, $no_gzip) = @_;
+  if($fh->opened()) {
+    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
+    return 1 if $tidy;
+  }
+  
+  return if $no_gzip; #don't gzip if we were told to skip
+  return if ! -f $path; #don't gzip if we had no file
+  
+  my $target = $path.".gz";
+  $self->info('Gzipping "%s"', $path);
+  my %args;
+  if($self->param('generated_files')->{$target}) {
+    if($self->param('allow_appending')) {
+      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
+      $args{Append} = 1;
+    }
+    elsif($self->param('overwrite_files')) {
+      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
+    }
+    else {
+      $self->throw("Cannot continue. The file %s has already been created this session. Fail!");
+    }
+  }
+  gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
+  $self->info('    Removing original file from filesystem');
+  unlink $path or throw "Could not delete $path: $!";
+  $self->info('    Finished');
+  $self->param('generated_files')->{$target} = 1;
+  return 0;
+}
+
+#We assume a transcript is ok to process unless proven otherwise
+sub ok_to_process_logic_name {
+  my ($self, $transcript) = @_;
+  my $ok = 1;
+  my $logic_name = $transcript->analysis()->logic_name();
+  if($self->param('process_logic_active')) {
+    if(! $self->param('process_logic')->{$logic_name}) {
+      $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
+      $ok = 0;
+    }
+  }
+  if($self->param('skip_logic_active')) {
+    if($self->param('skip_logic')->{$logic_name}) {
+      $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
+      $ok = 0;
+    }
+  }
+  return $ok;
+}
+
+#Generates a FASTA serializer but returns the (filename, handle & instance)
+sub _generate_fasta_serializer {
+  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
+  $header_formatter ||= $self->_custom_header();
+  my $chunk = $self->param('dna_chunk_size');
+  my $filename = $self->_generate_file_name( $datatype, $level, $section );
+  my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
+  my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
+  return ( $filename, $fh, $ser );
+}
+
+#
+# _generate_file_name(data type, level, section )
+#                      dna       toplevel  undef
+#                      dna       chromosome 6
+
+sub _generate_file_name {
+  my ( $self, $data_type, $level, $section ) = @_;    #level & section is optional
+
+  # File name format looks like:
+  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
+  # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
+  #      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
+  #      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
+  my @name_bits;
+  push @name_bits, $self->web_name();
+  push @name_bits, $self->assembly();
+  push @name_bits, $self->param('release');
+  push @name_bits, lc($data_type);
+  push @name_bits, $level if $level;
+  push @name_bits, $section if $section;
+  push @name_bits, 'fa';
+
+  my $file_name = join( '.', @name_bits );
+
+  $data_type =~ s/_[rs]m$//;    # remove repeatmask or softmask designation from path component
+  my $data_type_dir = $self->fasta_path($data_type);
+  $self->_remove_files_from_dir($data_type_dir);
+  return File::Spec->catfile( $data_type_dir, $file_name );
+}
+
+# Attempts to remove any generated files previously present for the instance
+# of the Process
+sub _remove_files_from_dir {
+  my ($self, $dir) = @_;
+  if(! $self->param('remove_files_from_dir')->{$dir}) {
+    $self->unlink_all_files($dir);
+    $self->param('remove_files_from_dir')->{$dir} = 1;
+  }
+  return;
+}
+
+##Logic used to generate the expected format for a FASTA header
+sub _create_display_id {
+  my ($self, $object, $seq, $type) = @_;
+  
+  my $stable_id;
+  my $location;
+  my $decoded_type;
+  my $decoded_status;
+  my %attributes;
+  
+  if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
+    $attributes{transcript_biotype} = $object->biotype();
+    
+    #If pred transcript then no gene but type & status are different
+    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
+      $stable_id = $object->stable_id();
+      $location  = $object->feature_Slice()->name();
+      $decoded_type = $type;
+      $decoded_status = lc($object->analysis()->logic_name());
+      if($type eq 'pep') {
+        $attributes{transcript} = $stable_id;
+      }
+    }
+    #Must be a real "transcript"
+    else {
+      $stable_id = $object->stable_id();
+      $location  = $object->feature_Slice()->name();
+      my $gene = $object->get_Gene();
+      $attributes{gene} = $gene->stable_id();
+      $attributes{gene_biotype} = $gene->biotype();
+      
+      #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script)
+      if($type eq 'ncrna') {
+        $decoded_type = lc($object->analysis()->logic_name());
+        $decoded_status = $gene->biotype();
+      }
+      elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
+        $decoded_type = $type;
+        $decoded_status = 'pseudogene';
+      }
+      #Otherwise use type & object's transcript's status
+      else {
+        $decoded_type   = $type;
+        $decoded_status = lc($object->status());
+      }
+    }
+  }
+  #If it's a translation then grab the transcript and gene then set accordingly
+  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
+    my $transcript = $object->transcript();
+    my $gene = $transcript->get_Gene();
+    $stable_id = $object->stable_id();
+    $location  = $transcript->feature_Slice()->name();
+    %attributes = (
+      gene => $gene->stable_id(),
+      gene_biotype => $gene->biotype(),
+      transcript => $transcript->stable_id(),
+      transcript_biotype => $transcript->biotype()
+    );
+    $decoded_type = 'pep';
+    $decoded_status = lc($transcript->status());
+  }
+  else {
+    throw sprintf( 'Do not understand how to format a display_id for type "%s"',
+      ref($object) );
+  }
+  
+  my $attr_str = join(q{ }, 
+    map { $_.':'.$attributes{$_} } 
+    grep { exists $attributes{$_} } 
+    qw/gene transcript gene_biotype transcript_biotype/);
+
+  my $format = '%s %s:%s %s %s';
+  
+  my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
+  $seq->display_id($id);
+
+  return;
+}
+
+sub _custom_header {
+  my ($self) = @_;
+  return sub {
+    my $slice = shift;
+    if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
+      return $slice->display_id();
+    }
+
+    #RMS means masked data. soft_mask() true means it was softmasked
+    my $dna_type = 'dna';
+    if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
+      $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
+    }
+
+    my $id        = $slice->seq_region_name;
+    my $idtype    = $slice->coord_system->name;
+    my $location  = $slice->name;
+    my $ref       = $slice->assembly_exception_type();
+    my $header    = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);  
+    return $header;
+  };
+}
+
+sub _final_filename {
+  my ($self, $filename) = @_;
+  return $filename if $filename =~ /\.gz$/;
+  return $filename.'.gz';
+}
+
+sub assembly_accession {
+  my ($self) = @_;
+  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
+  return $mc->single_value_by_key('assembly.accession');
+}
+
+sub assembly_accession_type {
+  my ($self) = @_;
+  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
+  return $mc->single_value_by_key('assembly.web_accession_type');
+}
+
+sub _create_README {
+
+  #Text for readme files
+
+  my %text = (
+    dna => <<'README',
+#######################
+Fasta DNA dumps
+#######################
+
+-----------
+FILE NAMES
+------------
+The files are consistently named following this pattern:
+   <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
+
+<species>:   The systematic name of the species.
+<assembly>:  The assembly build name.
+<release>:   The release number.
+<sequence type>:
+ * 'dna' - unmasked genomic DNA sequences.
+  * 'dna_rm' - masked genomic DNA.  Interspersed repeats and low
+     complexity regions are detected with the RepeatMasker tool and masked
+     by replacing repeats with 'N's.
+  * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
+    have been replaced with lowercased versions of their nucleic base
+<id type> One of the following:
+  * 'chromosome'a    - The top-level coordinate system in most species in Ensembl
+  * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
+  * 'seqlevel'       - This is usually sequence scaffolds, chunks or clones.
+     -- 'scaffold'  - Larger sequence contigs from the assembly of shorter
+        sequencing reads (often from whole genome shotgun, WGS) which could
+        not yet be assembled into chromosomes. Often more genome sequencing
+        is needed to narrow gaps and establish a tiling path.
+     -- 'chunk' -  While contig sequences can be assembled into large entities,
+        they sometimes have to be artificially broken down into smaller entities
+        called 'chunks'. This is due to limitations in the annotation
+        pipeline and the finite record size imposed by MySQL which stores the
+        sequence and annotation information.
+     -- 'clone' - In general this is the smallest sequence entity.  It is often
+        identical to the sequence of one BAC clone, or sequence region
+        of one BAC clone which forms the tiling path.
+<id>:     The actual sequence identifier. Depending on the <id type> the <id>
+          could represent the name of a chromosome, a scaffold, a contig, a clone ..
+          Field is empty for seqlevel files
+fa: All files in these directories represent FASTA database files
+gz: All files are compacted with GNU Zip for storage efficiency.
+
+-----------
+TOPLEVEL
+----------
+These files contain the full sequence of the assembly in fasta format.
+They contain one chromosome per file.
+
+EXAMPLES
+   The genomic sequence of human chromosome 1:
+     Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz
+
+   The masked version of the genome sequence on human chromosome 1
+   (contains '_rm' or '_sm' in the name):
+     Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
+     Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz
+
+   Non-chromosomal assembly sequences:
+   e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
+     Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
+     Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
+     Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz
+
+
+--------------
+SPECIAL CASES
+--------------
+Some chromosomes have alternate haplotypes which are presented in files with 
+the haplotype sequence only:
+   Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
+   Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz
+   
+
+Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
+on the Y is annotated.  By definition the PAR region is identical on the 
+X and Y chromosome.  We provide this sequence in the following way.
+-- The Y chromosome file contains the complete sequence of the PAR:
+    Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
+-- The top level file includes only the unique portion of Y (i.e. the PAR 
+   (region is N-masked):
+      Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz
+
+README
+
+    pep => <<'README',
+####################
+Fasta Peptide dumps
+####################
+These files hold the protein translations of Ensembl gene predictions.
+
+-----------
+FILE NAMES
+------------
+The files are consistently named following this pattern:
+   <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
+
+<species>:       The systematic name of the species.
+<assembly>:      The assembly build name.
+<release>:       The release number.
+<sequence type>: pep for peptide sequences
+<status>
+  * 'pep.all' - the super-set of all translations resulting from Ensembl known
+     or novel gene predictions.
+  * 'pep.abinitio' translations resulting from 'ab initio' gene
+     prediction algorithms such as SNAP and GENSCAN. In general, all
+     'ab initio' predictions are based solely on the genomic sequence and
+     not any other experimental evidence. Therefore, not all GENSCAN
+     or SNAP predictions represent biologically real proteins.
+fa : All files in these directories represent FASTA database files
+gz : All files are compacted with GNU Zip for storage efficiency.
+
+EXAMPLES (Note: Most species do not sequences for each different <status>)
+ for Human:
+    Homo_sapiens.NCBI36.40.pep.all.fa.gz
+      contains all known and novel peptides
+    Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
+      contains all abinitio predicted peptide
+
+Difference between known and novel
+----------------------------------
+Protein models that can be mapped to species-specific entries in
+Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
+known genes.  Those that cannot be mapped are called novel
+(e.g. genes predicted on the basis of evidence from closely related species).
+
+For models annotated by HAVANA the status is set manually. Models that have 
+an HGNC name are referred to as known and the remaining models are referred to
+as novel.
+
+-------------------------------
+FASTA Sequence Header Lines
+------------------------------
+The FASTA sequence header lines are designed to be consistent across
+all types of Ensembl FASTA sequences.  This gives enough information
+for the sequence to be identified outside the context of the FASTA
+database file.
+
+General format:
+
+>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
+
+Example of Ensembl Peptide header:
+
+>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
+ ^               ^   ^     ^                                   ^                    ^                          ^                            ^ 
+ ID              |   |  LOCATION                          GENE:stable gene ID       |                       GENE: gene biotype           TRANSCRIPT: transcript biotype
+                 | STATUS                                           TRANSCRIPT: stable transcript ID
+               SEQTYPE
+
+README
+
+    cdna => <<'README',
+##################
+Fasta cDNA dumps
+#################
+
+These files hold the cDNA sequences corresponding to Ensembl gene predictions.
+
+------------
+FILE NAMES
+------------
+The files are consistently named following this pattern:
+<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
+
+<species>: The systematic name of the species.
+<assembly>: The assembly build name.
+<release>: The release number.
+<sequence type>: cdna for cDNA sequences
+<status>
+  * 'cdna.all' - the super-set of all transcripts resulting from
+     Ensembl known, novel and pseudo gene predictions (see more below).
+  * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
+     algorithms such as SNAP and GENSCAN. In general all 'ab initio'
+     predictions are solely based on the genomic sequence and do not
+     use other experimental evidence. Therefore, not all GENSCAN or SNAP
+     cDNA predictions represent biologically real cDNAs.
+     Consequently, these predictions should be used with care.
+
+EXAMPLES  (Note: Most species do not sequences for each different <status>)
+  for Human:
+    Homo_sapiens.NCBI36.40.cdna.all.fa.gz
+      cDNA sequences for all transcripts: known, novel and pseudo
+    Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
+      cDNA sequences for 'ab-initio' prediction transcripts.
+
+Difference between known and novel transcripts
+-----------------------------------------------
+Transcript or protein models that can be mapped to species-specific entries
+in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
+Those that cannot be mapped are called novel genes (e.g. genes predicted on
+the basis of evidence from closely related species).
+
+For models annotated by HAVANA the status is set manually. Models that have 
+an HGNC name are referred to as known and the remaining models are referred to
+as novel.
+
+-------------------------------
+FASTA Sequence Header Lines
+------------------------------
+The FASTA sequence header lines are designed to be consistent across
+all types of Ensembl FASTA sequences.  This gives enough information
+for the sequence to be identified outside the context of the FASTA file.
+
+General format:
+
+>ID SEQTYPE:STATUS LOCATION GENE
+
+Example of an Ensembl cDNA header:
+
+>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
+ ^               ^    ^     ^                                       ^                    ^                           ^       
+ ID              |    |  LOCATION                         GENE: gene stable ID        GENE: gene biotype        TRANSCRIPT: transcript biotype
+                 |  STATUS
+              SEQTYPE
+
+
+README
+
+    ncrna => <<'README',
+##################
+Fasta RNA dumps
+#################
+
+These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).
+
+------------
+FILE NAMES
+------------
+The files are consistently named following this pattern:
+<species>.<assembly>.<release>.<sequence type>.fa.gz
+
+<species>: The systematic name of the species.
+<assembly>: The assembly build name.
+<release>: The release number.
+<sequence type>: ncrna for non-coding RNA sequences
+
+EXAMPLES
+  for Human:
+    Homo_sapiens.NCBI36.40.ncrna.fa.gz
+      Transcript sequences for all ncRNA gene types.
+
+
+-------------------------------
+FASTA Sequence Header Lines
+------------------------------
+The FASTA sequence header lines are designed to be consistent across
+all types of Ensembl FASTA sequences.  This gives enough information
+for the sequence to be identified outside the context of the FASTA file.
+
+General format:
+
+>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
+   ^             ^     ^     ^                                          ^                    ^                           ^ 
+   ID            |     |  LOCATION                            GENE: gene stable ID       GENE: gene biotype           TRANSCRIPT: transcript biotype   
+                 |   STATUS
+              SEQTYPE
+
+
+README
+  );
+
+  my $warning = <<'README';
+#### README ####
+
+IMPORTANT: Please note you can download correlation data tables,
+supported by Ensembl, via the highly customisable BioMart and
+EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
+http://www.ebi.ac.uk/biomart/ for more information.
+
+README
+
+  my ( $self, $data_type ) = @_;
+  my $base_path = $self->fasta_path();
+  my $path      = File::Spec->catfile( $base_path, $data_type, 'README' );
+  my $accession = $self->assembly_accession();
+  my $txt       = $text{$data_type};
+  throw "Cannot find README text for type $data_type" unless $txt;
+
+  #Add accession information if it is available
+  if($data_type eq 'dna' && $accession) {
+    my $type = $self->assembly_accession_type();
+    $warning .= <<EXTRA;
+The genome assembly represented here corresponds to $type 
+$accession
+
+EXTRA
+  }
+  
+  work_with_file($path, 'w', sub {
+    my ($fh) = @_;
+    print $fh $warning;
+    print $fh $txt;
+    return;
+  });
+  return;
+}
+
+1;
+
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children