Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
=pod =head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 NAME Bio::EnsEMBL::Pipeline::FASTA::DumpFile =head1 DESCRIPTION The main workhorse of the FASTA dumping pipeline. This module has two functions =over 8 =item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats =item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included) =back The script is responsible for creating the filenames of these target files, taking data from the database and the formatting of the FASTA headers. It is also responsible for the creation of README files pertaining to the type of dumps produced. The final files are all Gzipped at normal levels of compression. B<N.B.> This code will remove any files already found in the target directory on its first run as it assumes all data will be dumped in the one process. It is selective of its directory meaning a rerun of DNA dumps will not cause the protein/cdna files to be removed. Allowed parameters are: =over 8 =item species - The species to dump =item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed =item release - A required parameter for the version of Ensembl we are dumping for =item db_types - Array reference of the database groups to use. Defaults to core =item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). 
Applied before skip_logic_names

=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)

=item base_path - The base of the dumps

=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
                       process when formatting FASTA files. Normally do not touch

=item allow_appending - If the same file name is generated we will
                        append into that file rather than overwriting

=item overwrite_files - If the same file name is generated we will overwrite
                        that file rather than appending

=back

=cut

package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);

use File::Spec;
use IO::Compress::Gzip qw/gzip $GzipError/;
use IO::File;
use Bio::EnsEMBL::PaddedSlice;
use Bio::EnsEMBL::Utils::BiotypeMapper;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use Bio::EnsEMBL::Utils::IO::FASTASerializer;
use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;

# eHive dataflow branch numbers for the downstream indexing analyses.
my $DNA_INDEXING_FLOW     = 1;
my $PEPTIDE_INDEXING_FLOW = 2;
my $GENE_INDEXING_FLOW    = 3;

# Default parameters for this Process; user-configurable ones first,
# followed by internal book-keeping state that must not be overridden.
sub param_defaults {
  my ($self) = @_;
  return {
    #user configurable
    allow_appending     => 1,
    overwrite_files     => 0,
    dna_chunk_size      => 17000,
    skip_logic_names    => [],
    process_logic_names => [],

    #DON'T MESS
    #used to track if we need to reopen a file in append mode or not
    generated_files       => {},
    remove_files_from_dir => {},
    dataflows             => []
  };
}

# Normalise the input parameters into lookup structures:
# sequence types, repeat analyses (from meta), db types and the
# process/skip logic-name filters used by ok_to_process_logic_name().
sub fetch_input {
  my ($self) = @_;

  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%sequence_types);

  my $dba = $self->get_DBAdaptor();
  my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
  $self->param('analyses', $analyses);

  my $types = $self->param('db_types');
  $types = ['core'] unless $types;
  $self->param('db_types', $types);

  my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
  $self->param('skip_logic', \%skip_logic_names);
  $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};

  my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
  $self->param('process_logic', \%process_logic_names);
  $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};

  return;
}

# Run the dumps once per requested database group, skipping groups for
# which no DBAdaptor can be found.
sub run {
  my ($self) = @_;
  my $types = $self->param('db_types');
  foreach my $type (@{$types}) {
    my $dba = $self->get_DBAdaptor($type);
    if(! $dba) {
      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
      next;
    }
    $self->run_type($type);
  }
  return;
}

# Flush the dataflow events accumulated during run() into eHive.
sub write_output {
  my ($self) = @_;
  my $dataflows = $self->param('dataflows');
  foreach my $flow (@{$dataflows}) {
    $self->dataflow_output_id(@{$flow});
  }
  return;
}

# Perform the requested dumps (dna / cdna+pep / ncrna) for one db group
# and write the matching README files.
sub run_type {
  my ($self, $type) = @_;

  my $species = $self->param('species');
  my $sequence_types = $self->param('sequence_types');

  # dump file for each type on a per slice basis
  # types are dna, cDNA, peptide, ncRNA

  #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref
  my $dba = $self->get_DBAdaptor($type);
  if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
    $self->info( "Starting dna dump for " . $species );
    $self->_dump_dna($type);
    $self->_create_README('dna');
  }

  if ( $sequence_types->{cdna} ) {    #includes peptides whether you like it or not
    $self->info( "Starting cdna dump for " . $species );
    my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);

    $self->info( "Starting prediction transcript dumps for " . $species );
    my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);

    $self->_create_README('cdna') if $transcripts || $pred_transcripts;
    $self->_create_README('pep')  if $peptide     || $pred_proteins;
  }

  if ( $sequence_types->{ncrna} ) {
    $self->info( "Starting ncRNA dump for " . $species );
    my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
    $self->_create_README('ncrna') if $ncrna_transcripts;
  }

  $self->cleanup_DBAdaptor($type);
}

# Dump entire sequence, also dump data into chromosome files as appropriate.
# Produces unmasked (dna) and soft-masked (dna_sm) files directly; the
# hard-masked (dna_rm) files are derived from the soft-masked output.
sub _dump_dna {
  my ($self, $type) = @_;

  my @chromosomes;
  my @non_chromosomes;
  my $filter_human = 1;
  foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
    my $chr = $s->is_chromosome();
    push(@chromosomes, $s) if $chr;
    push(@non_chromosomes, $s) if ! $chr;
  }

  ############ NON CHROMOSOME WORK
  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
  if(@non_chromosomes) {
    my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
    my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );

    foreach my $s (@non_chromosomes) {
      $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
    }

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
    my ($hard_mask_fh, $hard_mask_file) =
      $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);

    $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
    $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
    $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file );
    $self->info('Dumped non-chromosomes');
  }

  ############ CHROMOSOME WORK
  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
  foreach my $s (@chromosomes) {
    my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'chromosome', $s->seq_region_name(), undef );
    # repeat masked data too
    my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'chromosome', $s->seq_region_name(), undef );

    $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
    my ($chromo_hard_mask_fh, $chromo_hard_mask_file) =
      $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);

    $self->tidy_file_handle($chromo_fh, $chromo_file_name);
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
    $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
  }
  $self->info("Dumped chromosomes");

  #input_id
  push(@{$self->param('dataflows')}, [{ data_type => 'dna',    species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);

  return;
}

# Serialize one slice twice: as-is (padded with Ns outside known regions)
# and soft-masked via the configured repeat analyses.
sub _dump_slice {
  my ($self, $s, $serialiser, $rm_serialiser) = @_;

  my $analyses = $self->param('analyses');

  my $chr = $s->is_chromosome();
  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
  $self->info('    Slice is a chromosome') if $chr;
  $self->info('    Slice is non-chromosomal') if ! $chr;

  # Make a padded slice (to automatically pad with N's outside of known regions)
  # and make a repeat-masked slice and then pad that too.
  my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
  $serialiser->print_Seq($padded_slice);

  my $soft_mask = 1;
  my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
  my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
  $rm_serialiser->print_Seq($padded_masked_slice);

  return;
}

#Assumes we are working with un-compressed files
# Derive a hard-masked (dna_rm) file from a soft-masked (dna_sm) file by
# uppercasing masked (lowercase) bases into Ns. Returns the open handle and
# path of the new file, or nothing if the source file has gone.
sub _convert_softmask_to_hardmask {
  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
  if(! -f $soft_mask_file) {
    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
    return;
  }
  my $hard_mask_file = $soft_mask_file;
  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
  my $hm_fh = IO::File->new($hard_mask_file, 'w');
  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
  work_with_file($soft_mask_file, 'r', sub {
    my ($sm_fh) = @_;
    while(my $line = <$sm_fh>) {
      if(index($line, '>') == 0) {
        $line =~ s/dna_sm/dna_rm/;
      }
      else {
        # Bug fix: tr/// takes plain character lists, not [] classes; the
        # original tr/[acgtn]/N/ also translated literal '[' and ']' chars.
        $line =~ tr/acgtn/N/;
      }
      print $hm_fh $line;
    }
    return;
  });
  return ($hm_fh, $hard_mask_file);
}

# Dump all transcripts of the given type (cdna or ncrna) gene-by-gene;
# for cdna also dump the corresponding peptide translations. Returns two
# flags: (wrote any transcripts, wrote any proteins).
sub _dump_transcripts {
  my ($self, $transcript_type, $type) = @_;

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
  my ( $filename, $fh, $transcript_serializer ) =
    $self->_generate_fasta_serializer( $transcript_type, $transcript_level );

  my ( $peptide_filename, $pep_fh, $peptide_serializer );
  # some cDNAs are translated, make a file to receive them.
  if ( $transcript_type eq 'cdna' ) {
    ( $peptide_filename, $pep_fh, $peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'all' );
  }

  # work out what biotypes correspond to $transcript_type
  my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
  my $biotypes_list  = $biotype_mapper->group_members($transcript_type);

  my $dba = $self->get_DBAdaptor($type);
  my $gene_adaptor = $dba->get_GeneAdaptor();

  # get all the transcripts that are $transcript_type e.g. cdna, ncrna
  foreach my $biotype ( @{$biotypes_list} ) {
    my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
    $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
    # shift to release each Gene as we go rather than holding the whole list
    while ( my $gene = shift @{$gene_list} ) {
      $self->fine( 'Gene %s', $gene->display_id );
      my $transcript_list = $gene->get_all_Transcripts();
      foreach my $transcript ( @{$transcript_list} ) {
        $self->fine( 'Transcript %s', $transcript->display_id );
        next unless $self->ok_to_process_logic_name($transcript);

        # foreach transcripts of all genes with biotypes classed as cdna
        my $transcript_seq = $transcript->seq();
        $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
        $transcript_serializer->print_Seq($transcript_seq);

        # NOTE(review): $peptide_serializer is only created for cdna dumps;
        # this assumes no ncrna biotype is in the 'peptide_producing' group.
        if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing' )) {
          my $translation = $transcript->translation();
          if ($translation) {
            my $translation_seq = $transcript->translate();
            $self->_create_display_id($translation, $translation_seq, $transcript_type);
            $peptide_serializer->print_Seq($translation_seq);
            $has_protein_data = 1;
          }
        }
        $has_transcript_data = 1;
      }
    }
  }

  $self->tidy_file_handle( $fh, $filename );
  if ( $transcript_type eq 'cdna' ) {
    $self->tidy_file_handle( $pep_fh, $peptide_filename );
  }

  if($has_protein_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
  }
  if($has_transcript_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
  }

  return ($has_transcript_data, $has_protein_data);
}

# Dump prediction transcripts and peptides. All predicted transcripts have translations
sub _dump_prediction_transcripts {
  my ($self, $type) = @_;
  my $dba = $self->get_DBAdaptor($type);

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
  my $transcript_list = $prediction_transcript_adaptor->fetch_all();
  my $count = scalar(@{$transcript_list});
  $self->info('Found %d prediction transcript(s)', $count);
  if($count) {
    my ( $abinitio_filename, $fh, $abinitio_serializer ) =
      $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
    my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'abinitio' );
    while ( my $transcript = shift @{$transcript_list} ) {
      next unless $self->ok_to_process_logic_name($transcript);

      $has_transcript_data = 1;
      my $transcript_seq = $transcript->seq();
      $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
      $abinitio_serializer->print_Seq($transcript_seq);

      if ( $transcript->translation() ) {
        $has_protein_data = 1;
        # only translate once we know a translation exists (avoids wasted
        # work; the original translated unconditionally)
        my $translation_seq = $transcript->translate();
        $self->_create_display_id( $transcript, $translation_seq, 'pep' );
        $abinitio_peptide_serializer->print_Seq($translation_seq);
      }
    }

    $self->tidy_file_handle( $fh, $abinitio_filename );
    $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );

    if($has_protein_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
    }
    if($has_transcript_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
    }
  }

  return ($has_transcript_data, $has_protein_data);
}

# We can optionally skip the Gzip process & just delegate to the super class
# for it's cleanup routines which only work with an open file handle. Therefore
# only pass it onto the super implementation *if* the handle was open.
# Also only Gzip if the source file exists (it could have been unlinked from
# an earlier call)
sub tidy_file_handle {
  my ($self, $fh, $path, $no_gzip) = @_;
  if($fh->opened()) {
    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
    return 1 if $tidy;
  }

  return if $no_gzip;    #don't gzip if we were told to skip
  return if ! -f $path;  #don't gzip if we had no file

  my $target = $path.".gz";
  $self->info('Gzipping "%s"', $path);
  my %args;
  if($self->param('generated_files')->{$target}) {
    if($self->param('allow_appending')) {
      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
      $args{Append} = 1;
    }
    elsif($self->param('overwrite_files')) {
      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
    }
    else {
      # Bug fix: the original passed a bare '%s' format with no argument,
      # so the error never contained the offending file name.
      $self->throw(sprintf('Cannot continue. The file %s has already been created this session. Fail!', $target));
    }
  }
  gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
  $self->info('  Removing original file from filesystem');
  unlink $path or throw "Could not delete $path: $!";
  $self->info('  Finished');
  $self->param('generated_files')->{$target} = 1;
  return 0;
}

#We assume a transcript is ok to process unless proven otherwise
sub ok_to_process_logic_name {
  my ($self, $transcript) = @_;
  my $ok = 1;
  my $logic_name = $transcript->analysis()->logic_name();
  if($self->param('process_logic_active')) {
    if(! $self->param('process_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  if($self->param('skip_logic_active')) {
    if($self->param('skip_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  return $ok;
}

#Generates a FASTA serializer but returns the (filename, handle & instance)
sub _generate_fasta_serializer {
  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
  $header_formatter ||= $self->_custom_header();
  my $chunk = $self->param('dna_chunk_size');
  my $filename = $self->_generate_file_name( $datatype, $level, $section );
  my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
  my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
  return ( $filename, $fh, $ser );
}

#
# _generate_file_name(data type, level, section )
#                      dna       toplevel undef
#                      dna       chromosome 6
sub _generate_file_name {
  my ( $self, $data_type, $level, $section ) = @_;    #level & section is optional

  # File name format looks like:
  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
  # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
  #      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
  #      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
  my @name_bits;
  push @name_bits, $self->web_name();
  push @name_bits, $self->assembly();
  push @name_bits, $self->param('release');
  push @name_bits, lc($data_type);
  push @name_bits, $level if $level;
  push @name_bits, $section if $section;
  push @name_bits, 'fa';

  my $file_name = join( '.', @name_bits );

  $data_type =~ s/_[rs]m$//;    # remove repeatmask or softmask designation from path component
  my $data_type_dir = $self->fasta_path($data_type);
  $self->_remove_files_from_dir($data_type_dir);
  return File::Spec->catfile( $data_type_dir, $file_name );
}

# Attempts to remove any generated files previously present for the instance
# of the Process
sub _remove_files_from_dir {
  my ($self, $dir) = @_;
  if(! $self->param('remove_files_from_dir')->{$dir}) {
    $self->unlink_all_files($dir);
    $self->param('remove_files_from_dir')->{$dir} = 1;
  }
  return;
}

##Logic used to generate the expected format for a FASTA header
sub _create_display_id {
  my ($self, $object, $seq, $type) = @_;

  my $stable_id;
  my $location;
  my $decoded_type;
  my $decoded_status;
  my %attributes;

  if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
    $attributes{transcript_biotype} = $object->biotype();
    #If pred transcript then no gene but type & status are different
    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      $decoded_type = $type;
      $decoded_status = lc($object->analysis()->logic_name());
      if($type eq 'pep') {
        $attributes{transcript} = $stable_id;
      }
    }
    #Must be a real "transcript"
    else {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      my $gene = $object->get_Gene();
      $attributes{gene} = $gene->stable_id();
      $attributes{gene_biotype} = $gene->biotype();
      #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script)
      if($type eq 'ncrna') {
        $decoded_type = lc($object->analysis()->logic_name());
        $decoded_status = $gene->biotype();
      }
      elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
        $decoded_type = $type;
        $decoded_status = 'pseudogene';
      }
      #Otherwise use type & object's transcript's status
      else {
        $decoded_type = $type;
        $decoded_status = lc($object->status());
      }
    }
  }
  #If it's a translation then grab the transcript and gene then set accordingly
  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
    my $transcript = $object->transcript();
    my $gene = $transcript->get_Gene();
    $stable_id = $object->stable_id();
    $location = $transcript->feature_Slice()->name();
    %attributes = (
      gene => $gene->stable_id(),
      gene_biotype => $gene->biotype(),
      transcript => $transcript->stable_id(),
      transcript_biotype => $transcript->biotype()
    );
    $decoded_type = 'pep';
    $decoded_status = lc($transcript->status());
  }
  else {
    throw sprintf( 'Do not understand how to format a display_id for type "%s"', ref($object) );
  }

  my $attr_str = join(q{ },
    map { $_.':'.$attributes{$_} }
    grep { exists $attributes{$_} }
    qw/gene transcript gene_biotype transcript_biotype/);

  my $format = '%s %s:%s %s %s';
  my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
  $seq->display_id($id);

  return;
}

# Returns a closure which formats the FASTA header for genomic slices;
# non-Slice objects fall back to their own display_id().
sub _custom_header {
  my ($self) = @_;
  return sub {
    my $slice = shift;
    if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
      return $slice->display_id();
    }

    #RMS means masked data. soft_mask() true means it was softmasked
    my $dna_type = 'dna';
    if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
      $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
    }

    my $id       = $slice->seq_region_name;
    my $idtype   = $slice->coord_system->name;
    my $location = $slice->name;
    my $ref      = $slice->assembly_exception_type();
    my $header   = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);
    return $header;
  };
}

# The on-disk name of a dump once gzipped; idempotent for names already
# carrying a .gz suffix.
sub _final_filename {
  my ($self, $filename) = @_;
  return $filename if $filename =~ /\.gz$/;
  return $filename.'.gz';
}

sub assembly_accession {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.accession');
}

sub assembly_accession_type {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.web_accession_type');
}

# Write the README file for the given data type (dna/cdna/pep/ncrna) into
# that type's dump directory, prefixed by a generic BioMart notice and,
# for dna, the assembly accession when available.
sub _create_README {

  #Text for readme files
  my %text = (
    dna => <<'README',
#######################
Fasta DNA dumps
#######################

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>:
  * 'dna' - unmasked genomic DNA sequences.
  * 'dna_rm' - masked genomic DNA.  Interspersed repeats and low
     complexity regions are detected with the RepeatMasker tool and masked
     by replacing repeats with 'N's.
  * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
    have been replaced with lowercased versions of their nucleic base
<id type> One of the following:
  * 'chromosome' - The top-level coordinate system in most species in Ensembl
  * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
  * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
     -- 'scaffold' - Larger sequence contigs from the assembly of shorter
        sequencing reads (often from whole genome shotgun, WGS) which could
        not yet be assembled into chromosomes. Often more genome sequencing
        is needed to narrow gaps and establish a tiling path.
     -- 'chunk' - While contig sequences can be assembled into large entities,
        they sometimes have to be artificially broken down into smaller
        entities called 'chunks'. This is due to limitations in the annotation
        pipeline and the finite record size imposed by MySQL which stores the
        sequence and annotation information.
     -- 'clone' - In general this is the smallest sequence entity. It is often
        identical to the sequence of one BAC clone, or sequence region of one
        BAC clone which forms the tiling path.
<id>: The actual sequence identifier. Depending on the <id type> the <id>
      could represent the name of a chromosome, a scaffold, a contig, a clone ..
      Field is empty for seqlevel files
fa: All files in these directories represent FASTA database files
gz: All files are compacted with GNU Zip for storage efficiency.

-----------
TOPLEVEL
----------
These files contain the full sequence of the assembly in fasta format.
They contain one chromosome per file.

EXAMPLES
   The genomic sequence of human chromosome 1:
     Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz

   The masked version of the genome sequence on human chromosome 1
   (contains '_rm' or '_sm' in the name):
     Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
     Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz

   Non-chromosomal assembly sequences:
   e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
     Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
     Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
     Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz

--------------
SPECIAL CASES
--------------
Some chromosomes have alternate haplotypes which are presented in files with
the haplotype sequence only:
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz

Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
on the Y is annotated. By definition the PAR region is identical on the X and Y
chromosome. We provide this sequence in the following way.
    -- The Y chromosome file contains the complete sequence of the PAR:
       Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
    -- The top level file includes only the unique portion of Y (i.e. the PAR
       region is N-masked):
       Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz
README
    pep => <<'README',
####################
Fasta Peptide dumps
####################
These files hold the protein translations of Ensembl gene predictions.

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: pep for peptide sequences
<status>
  * 'pep.all' - the super-set of all translations resulting from Ensembl known
     or novel gene predictions.
  * 'pep.abinitio' translations resulting from 'ab initio' gene prediction
     algorithms such as SNAP and GENSCAN. In general, all 'ab initio'
     predictions are based solely on the genomic sequence and not any other
     experimental evidence. Therefore, not all GENSCAN or SNAP predictions
     represent biologically real proteins.
fa : All files in these directories represent FASTA database files
gz : All files are compacted with GNU Zip for storage efficiency.

EXAMPLES (Note: Most species do not sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.pep.all.fa.gz
      contains all known and novel peptides
    Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
      contains all abinitio predicted peptide

Difference between known and novel
----------------------------------
Protein models that can be mapped to species-specific entries in Swiss-Prot,
RefSeq or SPTrEMBL are referred to in Ensembl as known genes. Those that cannot
be mapped are called novel (e.g. genes predicted on the basis of evidence from
closely related species).

For models annotated by HAVANA the status is set manually. Models that have an
HGNC name are referred to as known and the remaining models are referred to as
novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA database file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT

Example of Ensembl Peptide header:

>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^   ^     ^                                   ^                  ^                            ^                            ^
 ID              |   |  LOCATION                      GENE: stable gene ID        |                   GENE: gene biotype        TRANSCRIPT: transcript biotype
                 |  STATUS                                        TRANSCRIPT: stable transcript ID
              SEQTYPE
README
    cdna => <<'README',
##################
Fasta cDNA dumps
#################

These files hold the cDNA sequences corresponding to Ensembl gene predictions.

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: cdna for cDNA sequences
<status>
  * 'cdna.all' - the super-set of all transcripts resulting from Ensembl known,
     novel and pseudo gene predictions (see more below).
  * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
     algorithms such as SNAP and GENSCAN. In general all 'ab initio'
     predictions are solely based on the genomic sequence and do not use other
     experimental evidence. Therefore, not all GENSCAN or SNAP cDNA predictions
     represent biologically real cDNAs. Consequently, these predictions should
     be used with care.

EXAMPLES (Note: Most species do not sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.cdna.all.fa.gz
      cDNA sequences for all transcripts: known, novel and pseudo
    Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
      cDNA sequences for 'ab-initio' prediction transcripts.

Difference between known and novel transcripts
-----------------------------------------------
Transcript or protein models that can be mapped to species-specific entries in
Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
Those that cannot be mapped are called novel genes (e.g. genes predicted on
the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have an
HGNC name are referred to as known and the remaining models are referred to as
novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE

Example of an Ensembl cDNA header:

>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^    ^     ^                                       ^                    ^                           ^
 ID              |    |  LOCATION                          GENE: gene stable ID  GENE: gene biotype      TRANSCRIPT: transcript biotype
                 |  STATUS
              SEQTYPE
README
    ncrna => <<'README',
##################
Fasta RNA dumps
#################

These files hold the transcript sequences corresponding to non-coding RNA
genes (ncRNA).

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: ncrna for non-coding RNA sequences

EXAMPLES
  for Human:
    Homo_sapiens.NCBI36.40.ncrna.fa.gz
      Transcript sequences for all ncRNA gene types.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA file.

General format:

>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
 ^               ^     ^     ^                                          ^                    ^                  ^
 ID              |     |  LOCATION                             GENE: gene stable ID  GENE: gene biotype  TRANSCRIPT: transcript biotype
                 |   STATUS
              SEQTYPE
README
  );

  my $warning = <<'README';
#### README ####

IMPORTANT: Please note you can download correlation data tables, supported by
Ensembl, via the highly customisable BioMart and EnsMart data mining tools.
See http://www.ensembl.org/biomart/martview or http://www.ebi.ac.uk/biomart/
for more information.

README

  my ( $self, $data_type ) = @_;
  my $base_path = $self->fasta_path();
  my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
  my $accession = $self->assembly_accession();
  my $txt = $text{$data_type};
  throw "Cannot find README text for type $data_type" unless $txt;

  #Add accession information if it is available
  if($data_type eq 'dna' && $accession) {
    my $type = $self->assembly_accession_type();
    $warning .= <<EXTRA;
The genome assembly represented here corresponds to $type
$accession
EXTRA
  }

  work_with_file($path, 'w', sub {
    my ($fh) = @_;
    print $fh $warning;
    print $fh $txt;
    return;
  });

  return;
}

1;