Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,976 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::DumpFile + +=head1 DESCRIPTION + +The main workhorse of the FASTA dumping pipeline. This module has two +functions + +=over 8 + +=item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats + +=item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included) + +=back + +The script is responsible for creating the filenames of these target +files, taking data from the database and the formatting of the FASTA +headers. It is also responsible for the creation of README files pertaining +to the type of dumps produced. The final files are all Gzipped at normal +levels of compression. + +B<N.B.> This code will remove any files already found in the target directory +on its first run as it assumes all data will be dumped in the one process. It +is selective of its directory meaning a rerun of DNA dumps will not cause +the protein/cdna files to be removed. + +Allowed parameters are: + +=over 8 + +=item species - The species to dump + +=item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed + +=item release - A required parameter for the version of Ensembl we are dumping for + +=item db_types - Array reference of the database groups to use. Defaults to core + +=item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). Applied before skip_logic_names + +=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these) + + +=item base_path - The base of the dumps + +=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and + process when formatting FASTA files. Normally do not + touch + +=item allow_appending - If the same file name is generated we will + append into that file rather than overwriting + +=item overwrite_files - If the same file name is generated we will overwrite + the into that file rather than appending + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::DumpFile; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base); + +use File::Spec; +use IO::Compress::Gzip qw/gzip $GzipError/; +use IO::File; +use Bio::EnsEMBL::PaddedSlice; +use Bio::EnsEMBL::Utils::BiotypeMapper; +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use Bio::EnsEMBL::Utils::Scalar qw/check_ref/; +use Bio::EnsEMBL::Utils::IO::FASTASerializer; +use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/; + +my $DNA_INDEXING_FLOW = 1; +my $PEPTIDE_INDEXING_FLOW = 2; +my $GENE_INDEXING_FLOW = 3; + +sub param_defaults { + my ($self) = @_; + return { + #user configurable + allow_appending => 1, + overwrite_files => 0, + + dna_chunk_size => 17000, + + skip_logic_names => [], + process_logic_names => [], + + #DON'T MESS + #used to track if we need to reopen a file in append mode or not + generated_files => {}, + remove_files_from_dir => {}, + dataflows => [] + }; +} + +sub fetch_input { + my ($self) = @_; + + my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') }; + $self->param('sequence_types', \%sequence_types); + + my $dba = $self->get_DBAdaptor(); + my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis'); + $self->param('analyses', $analyses); + + my $types = $self->param('db_types'); + $types = ['core'] unless $types; + $self->param('db_types', $types); + + my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')}; + $self->param('skip_logic', \%skip_logic_names); + $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')}; + my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')}; + $self->param('process_logic', \%process_logic_names); + $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')}; + + return; +} + +sub run { + my ($self) = @_; + my $types = $self->param('db_types'); + foreach my $type (@{$types}) { + my $dba = $self->get_DBAdaptor($type); + if(! $dba) { + $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type); + next; + } + $self->run_type($type); + } + return; +} + +sub write_output { + my ($self) = @_; + my $dataflows = $self->param('dataflows'); + foreach my $flow (@{$dataflows}) { + $self->dataflow_output_id(@{$flow}); + } + return; +} + +sub run_type { + my ($self, $type) = @_; + + my $species = $self->param('species'); + my $sequence_types = $self->param('sequence_types'); + + # dump file for each type on a per slice basis + # types are dna,cDNA, peptide, ncRNA + + #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref + my $dba = $self->get_DBAdaptor($type); + if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) { + $self->info( "Starting dna dump for " . $species ); + $self->_dump_dna($type); + $self->_create_README('dna'); + } + + if ( $sequence_types->{cdna} ) { #includes peptides whether you like it or not + $self->info( "Starting cdna dump for " . $species ); + my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type); + + $self->info( "Starting prediction transcript dumps for " . $species ); + my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type); + + $self->_create_README('cdna') if $transcripts || $pred_transcripts; + $self->_create_README('pep') if $peptide || $pred_proteins; + } + if ( $sequence_types->{ncrna} ) { + $self->info( "Starting ncRNA dump for " . $species ); + my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type); + $self->_create_README('ncrna') if $ncrna_transcripts; + } + + $self->cleanup_DBAdaptor($type); +} + +# Dump entire sequence, also dump data into chromosome files as appropriate +sub _dump_dna { + my ($self,$type) = @_; + + my @chromosomes; + my @non_chromosomes; + my $filter_human = 1; + foreach my $s (@{$self->get_Slices($type, $filter_human)}) { + my $chr = $s->is_chromosome(); + push(@chromosomes, $s) if $chr; + push(@non_chromosomes, $s) if ! $chr; + } + + ############ NON CHROMOSOME WORK + $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes)); + if(@non_chromosomes) { + my ( $non_specific_file, $non_specific_fh, $other_serializer ) = + $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' ); + my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) = + $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' ); + foreach my $s (@non_chromosomes) { + $self->_dump_slice($s, $other_serializer, $other_rm_serializer); + } + #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call + $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1); + my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh); + + $self->tidy_file_handle( $non_specific_fh, $non_specific_file ); + $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file ); + $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file); + $self->info('Dumped non-chromosomes'); + } + + ############ CHROMOSOME WORK + $self->info('Processing %d chromosome(s)', scalar(@chromosomes)); + foreach my $s (@chromosomes) { + my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) = + $self->_generate_fasta_serializer( 'dna', 'chromosome', + $s->seq_region_name(), undef); + # repeat masked data too + my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) = + $self->_generate_fasta_serializer( 'dna_sm', 'chromosome', + $s->seq_region_name(), undef); + + $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer); + + #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call + $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1); + my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh); + + $self->tidy_file_handle($chromo_fh, $chromo_file_name); + $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name); + $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file); + } + $self->info("Dumped chromosomes"); + + #input_id + push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]); + + return; +} + +sub _dump_slice { + my ($self, $s, $serialiser, $rm_serialiser) = @_; + + my $analyses = $self->param('analyses'); + + my $chr = $s->is_chromosome(); + $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end()); + $self->info(' Slice is a chromosome') if $chr; + $self->info(' Slice is non-chromosomal') if ! $chr; + + # Make a padded slice (to automatically pad with N's outside of known regions) + # and make a repeat-masked slice and then pad that too. + my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s); + $serialiser->print_Seq($padded_slice); + + my $soft_mask = 1; + my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask); + my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice); + $rm_serialiser->print_Seq($padded_masked_slice); + + return; +} + +#Assumes we are working with un-compressed files +sub _convert_softmask_to_hardmask { + my ($self, $soft_mask_file, $soft_mask_fh) = @_; + if(! -f $soft_mask_file) { + $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file); + return; + } + my $hard_mask_file = $soft_mask_file; + $hard_mask_file =~ s/\.dna_sm\./.dna_rm./; + my $hm_fh = IO::File->new($hard_mask_file, 'w'); + $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file); + work_with_file($soft_mask_file, 'r', sub { + my ($sm_fh) = @_; + while(my $line = <$sm_fh>) { + if(index($line, '>') == 0) { + $line =~ s/dna_sm/dna_rm/; + } + else { + $line =~ tr/[acgtn]/N/; + } + print $hm_fh $line; + }; + return; + }); + return ($hm_fh, $hard_mask_file); +} + +sub _dump_transcripts { + my ($self, $transcript_type, $type) = @_; + + my $has_transcript_data = 0; + my $has_protein_data = 0; + + my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef; + my ( $filename, $fh, $transcript_serializer ) = + $self->_generate_fasta_serializer( $transcript_type, $transcript_level ); + + my ( $peptide_filename, $pep_fh, $peptide_serializer ); + + # some cDNAs are translated, make a file to receive them. + if ( $transcript_type eq 'cdna') { + ( $peptide_filename, $pep_fh, $peptide_serializer ) = + $self->_generate_fasta_serializer( 'pep', 'all' ); + } + + # work out what biotypes correspond to $transcript_type + my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new(); + my $biotypes_list = $biotype_mapper->group_members($transcript_type); + + my $dba = $self->get_DBAdaptor($type); + my $gene_adaptor = $dba->get_GeneAdaptor(); + + # get all the transcripts that are $transcript_type e.g. cdna, ncrna, + foreach my $biotype ( @{$biotypes_list} ) { + my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype); + $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} )); + while ( my $gene = shift @{$gene_list} ) { + $self->fine( 'Gene %s', $gene->display_id ); + my $transcript_list = $gene->get_all_Transcripts(); + foreach my $transcript ( @{$transcript_list} ) { + $self->fine( 'Transcript %s', $transcript->display_id ); + next unless $self->ok_to_process_logic_name($transcript); + + # foreach transcripts of all genes with biotypes classed as cdna + my $transcript_seq = $transcript->seq(); + $self->_create_display_id($transcript, $transcript_seq, $transcript_type); + $transcript_serializer->print_Seq($transcript_seq); + if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) { + my $translation = $transcript->translation(); + if ($translation) { + my $translation_seq = $transcript->translate(); + $self->_create_display_id($translation, $translation_seq, $transcript_type); + $peptide_serializer->print_Seq($translation_seq); + + $has_protein_data = 1; + } + } + + $has_transcript_data = 1; + } + } + } + + $self->tidy_file_handle( $fh, $filename ); + if ( $transcript_type eq 'cdna' ) { + $self->tidy_file_handle( $pep_fh, $peptide_filename ); + } + + if($has_protein_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]); + } + if($has_transcript_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]); + } + + return ($has_transcript_data, $has_protein_data); +} + +# Dump prediction transcripts and peptides. All predicted transcripts have translations +sub _dump_prediction_transcripts { + my ($self, $type) = @_; + my $dba = $self->get_DBAdaptor($type); + + my $has_transcript_data = 0; + my $has_protein_data = 0; + + my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor(); + my $transcript_list = $prediction_transcript_adaptor->fetch_all(); + my $count = scalar(@{$transcript_list}); + $self->info('Found %d prediction transcript(s)', $count); + if($count) { + my ( $abinitio_filename, $fh, $abinitio_serializer ) = + $self->_generate_fasta_serializer( 'cdna', 'abinitio' ); + my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) = + $self->_generate_fasta_serializer( 'pep', 'abinitio' ); + + while ( my $transcript = shift @{$transcript_list} ) { + next unless $self->ok_to_process_logic_name($transcript); + + $has_transcript_data = 1; + my $transcript_seq = $transcript->seq(); + $self->_create_display_id( $transcript, $transcript_seq, 'cdna' ); + $abinitio_serializer->print_Seq($transcript_seq); + + my $translation_seq = $transcript->translate(); + if ( $transcript->translation() ) { + $has_protein_data = 1; + $self->_create_display_id( $transcript, $translation_seq, 'pep' ); + $abinitio_peptide_serializer->print_Seq($translation_seq); + } + } + + $self->tidy_file_handle( $fh, $abinitio_filename ); + $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename ); + + if($has_protein_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]); + } + if($has_transcript_data) { + push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]); + } + } + + + return ($has_transcript_data, $has_protein_data); +} + +# We can optionally skip the Gzip process & just delegate to the super class +# for it's cleanup routines which only work with an open file handle. Therefore +# only pass it onto the super implementation *if* the handle was open. +# Also only Gzip if the source file exists (it could have been unlinked from +# an earlier call) + +sub tidy_file_handle { + my ($self, $fh, $path, $no_gzip) = @_; + if($fh->opened()) { + my $tidy = $self->SUPER::tidy_file_handle($fh, $path); + return 1 if $tidy; + } + + return if $no_gzip; #don't gzip if we were told to skip + return if ! -f $path; #don't gzip if we had no file + + my $target = $path.".gz"; + $self->info('Gzipping "%s"', $path); + my %args; + if($self->param('generated_files')->{$target}) { + if($self->param('allow_appending')) { + $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target); + $args{Append} = 1; + } + elsif($self->param('overwrite_files')) { + $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target); + } + else { + $self->throw("Cannot continue. The file %s has already been created this session. Fail!"); + } + } + gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError"; + $self->info(' Removing original file from filesystem'); + unlink $path or throw "Could not delete $path: $!"; + $self->info(' Finished'); + $self->param('generated_files')->{$target} = 1; + return 0; +} + +#We assume a transcript is ok to process unless proven otherwise +sub ok_to_process_logic_name { + my ($self, $transcript) = @_; + my $ok = 1; + my $logic_name = $transcript->analysis()->logic_name(); + if($self->param('process_logic_active')) { + if(! $self->param('process_logic')->{$logic_name}) { + $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name); + $ok = 0; + } + } + if($self->param('skip_logic_active')) { + if($self->param('skip_logic')->{$logic_name}) { + $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name); + $ok = 0; + } + } + return $ok; +} + +#Generates a FASTA serializer but returns the (filename, handle & instance) +sub _generate_fasta_serializer { + my ( $self, $datatype, $level, $section, $header_formatter ) = @_; + $header_formatter ||= $self->_custom_header(); + my $chunk = $self->param('dna_chunk_size'); + my $filename = $self->_generate_file_name( $datatype, $level, $section ); + my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!"; + my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk); + return ( $filename, $fh, $ser ); +} + +# +# _generate_file_name(data type, level, section ) +# dna toplevel undef +# dna chromosome 6 + +sub _generate_file_name { + my ( $self, $data_type, $level, $section ) = @_; #level & section is optional + + # File name format looks like: + # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa + # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa + # Homo_sapiens.GRCh37.64.dna.chromosome.20.fa + # Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa + my @name_bits; + push @name_bits, $self->web_name(); + push @name_bits, $self->assembly(); + push @name_bits, $self->param('release'); + push @name_bits, lc($data_type); + push @name_bits, $level if $level; + push @name_bits, $section if $section; + push @name_bits, 'fa'; + + my $file_name = join( '.', @name_bits ); + + $data_type =~ s/_[rs]m$//; # remove repeatmask or softmask designation from path component + my $data_type_dir = $self->fasta_path($data_type); + $self->_remove_files_from_dir($data_type_dir); + return File::Spec->catfile( $data_type_dir, $file_name ); +} + +# Attempts to remove any generated files previously present for the instance +# of the Process +sub _remove_files_from_dir { + my ($self, $dir) = @_; + if(! $self->param('remove_files_from_dir')->{$dir}) { + $self->unlink_all_files($dir); + $self->param('remove_files_from_dir')->{$dir} = 1; + } + return; +} + +##Logic used to generate the expected format for a FASTA header +sub _create_display_id { + my ($self, $object, $seq, $type) = @_; + + my $stable_id; + my $location; + my $decoded_type; + my $decoded_status; + my %attributes; + + if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) { + $attributes{transcript_biotype} = $object->biotype(); + + #If pred transcript then no gene but type & status are different + if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) { + $stable_id = $object->stable_id(); + $location = $object->feature_Slice()->name(); + $decoded_type = $type; + $decoded_status = lc($object->analysis()->logic_name()); + if($type eq 'pep') { + $attributes{transcript} = $stable_id; + } + } + #Must be a real "transcript" + else { + $stable_id = $object->stable_id(); + $location = $object->feature_Slice()->name(); + my $gene = $object->get_Gene(); + $attributes{gene} = $gene->stable_id(); + $attributes{gene_biotype} = $gene->biotype(); + + #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script) + if($type eq 'ncrna') { + $decoded_type = lc($object->analysis()->logic_name()); + $decoded_status = $gene->biotype(); + } + elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) { + $decoded_type = $type; + $decoded_status = 'pseudogene'; + } + #Otherwise use type & object's transcript's status + else { + $decoded_type = $type; + $decoded_status = lc($object->status()); + } + } + } + #If it's a translation then grab the transcript and gene then set accordingly + elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) { + my $transcript = $object->transcript(); + my $gene = $transcript->get_Gene(); + $stable_id = $object->stable_id(); + $location = $transcript->feature_Slice()->name(); + %attributes = ( + gene => $gene->stable_id(), + gene_biotype => $gene->biotype(), + transcript => $transcript->stable_id(), + transcript_biotype => $transcript->biotype() + ); + $decoded_type = 'pep'; + $decoded_status = lc($transcript->status()); + } + else { + throw sprintf( 'Do not understand how to format a display_id for type "%s"', + ref($object) ); + } + + my $attr_str = join(q{ }, + map { $_.':'.$attributes{$_} } + grep { exists $attributes{$_} } + qw/gene transcript gene_biotype transcript_biotype/); + + my $format = '%s %s:%s %s %s'; + + my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str); + $seq->display_id($id); + + return; +} + +sub _custom_header { + my ($self) = @_; + return sub { + my $slice = shift; + if ( !$slice->isa('Bio::EnsEMBL::Slice') ) { + return $slice->display_id(); + } + + #RMS means masked data. soft_mask() true means it was softmasked + my $dna_type = 'dna'; + if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) { + $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm'; + } + + my $id = $slice->seq_region_name; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + my $ref = $slice->assembly_exception_type(); + my $header = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref); + return $header; + }; +} + +sub _final_filename { + my ($self, $filename) = @_; + return $filename if $filename =~ /\.gz$/; + return $filename.'.gz'; +} + +sub assembly_accession { + my ($self) = @_; + my $mc = $self->get_DBAdaptor()->get_MetaContainer(); + return $mc->single_value_by_key('assembly.accession'); +} + +sub assembly_accession_type { + my ($self) = @_; + my $mc = $self->get_DBAdaptor()->get_MetaContainer(); + return $mc->single_value_by_key('assembly.web_accession_type'); +} + +sub _create_README { + + #Text for readme files + + my %text = ( + dna => <<'README', +####################### +Fasta DNA dumps +####################### + +----------- +FILE NAMES +------------ +The files are consistently named following this pattern: + <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz + +<species>: The systematic name of the species. +<assembly>: The assembly build name. +<release>: The release number. +<sequence type>: + * 'dna' - unmasked genomic DNA sequences. + * 'dna_rm' - masked genomic DNA. Interspersed repeats and low + complexity regions are detected with the RepeatMasker tool and masked + by replacing repeats with 'N's. + * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions + have been replaced with lowercased versions of their nucleic base +<id type> One of the following: + * 'chromosome'a - The top-level coordinate system in most species in Ensembl + * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome + * 'seqlevel' - This is usually sequence scaffolds, chunks or clones. + -- 'scaffold' - Larger sequence contigs from the assembly of shorter + sequencing reads (often from whole genome shotgun, WGS) which could + not yet be assembled into chromosomes. Often more genome sequencing + is needed to narrow gaps and establish a tiling path. + -- 'chunk' - While contig sequences can be assembled into large entities, + they sometimes have to be artificially broken down into smaller entities + called 'chunks'. This is due to limitations in the annotation + pipeline and the finite record size imposed by MySQL which stores the + sequence and annotation information. + -- 'clone' - In general this is the smallest sequence entity. It is often + identical to the sequence of one BAC clone, or sequence region + of one BAC clone which forms the tiling path. +<id>: The actual sequence identifier. Depending on the <id type> the <id> + could represent the name of a chromosome, a scaffold, a contig, a clone .. + Field is empty for seqlevel files +fa: All files in these directories represent FASTA database files +gz: All files are compacted with GNU Zip for storage efficiency. + +----------- +TOPLEVEL +---------- +These files contain the full sequence of the assembly in fasta format. +They contain one chromosome per file. + +EXAMPLES + The genomic sequence of human chromosome 1: + Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz + + The masked version of the genome sequence on human chromosome 1 + (contains '_rm' or '_sm' in the name): + Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz + Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz + + Non-chromosomal assembly sequences: + e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes + Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz + Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz + Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz + + +-------------- +SPECIAL CASES +-------------- +Some chromosomes have alternate haplotypes which are presented in files with +the haplotype sequence only: + Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz + Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz + + +Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR) +on the Y is annotated. By definition the PAR region is identical on the +X and Y chromosome. We provide this sequence in the following way. +-- The Y chromosome file contains the complete sequence of the PAR: + Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz +-- The top level file includes only the unique portion of Y (i.e. the PAR + (region is N-masked): + Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz + +README + + pep => <<'README', +#################### +Fasta Peptide dumps +#################### +These files hold the protein translations of Ensembl gene predictions. + +----------- +FILE NAMES +------------ +The files are consistently named following this pattern: + <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz + +<species>: The systematic name of the species. +<assembly>: The assembly build name. +<release>: The release number. +<sequence type>: pep for peptide sequences +<status> + * 'pep.all' - the super-set of all translations resulting from Ensembl known + or novel gene predictions. + * 'pep.abinitio' translations resulting from 'ab initio' gene + prediction algorithms such as SNAP and GENSCAN. In general, all + 'ab initio' predictions are based solely on the genomic sequence and + not any other experimental evidence. Therefore, not all GENSCAN + or SNAP predictions represent biologically real proteins. +fa : All files in these directories represent FASTA database files +gz : All files are compacted with GNU Zip for storage efficiency. + +EXAMPLES (Note: Most species do not sequences for each different <status>) + for Human: + Homo_sapiens.NCBI36.40.pep.all.fa.gz + contains all known and novel peptides + Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz + contains all abinitio predicted peptide + +Difference between known and novel +---------------------------------- +Protein models that can be mapped to species-specific entries in +Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as +known genes. Those that cannot be mapped are called novel +(e.g. genes predicted on the basis of evidence from closely related species). + +For models annotated by HAVANA the status is set manually. Models that have +an HGNC name are referred to as known and the remaining models are referred to +as novel. + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA +database file. + +General format: + +>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT + +Example of Ensembl Peptide header: + +>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding + ^ ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE:stable gene ID | GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS TRANSCRIPT: stable transcript ID + SEQTYPE + +README + + cdna => <<'README', +################## +Fasta cDNA dumps +################# + +These files hold the cDNA sequences corresponding to Ensembl gene predictions. + +------------ +FILE NAMES +------------ +The files are consistently named following this pattern: +<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz + +<species>: The systematic name of the species. +<assembly>: The assembly build name. +<release>: The release number. +<sequence type>: cdna for cDNA sequences +<status> + * 'cdna.all' - the super-set of all transcripts resulting from + Ensembl known, novel and pseudo gene predictions (see more below). + * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction + algorithms such as SNAP and GENSCAN. In general all 'ab initio' + predictions are solely based on the genomic sequence and do not + use other experimental evidence. Therefore, not all GENSCAN or SNAP + cDNA predictions represent biologically real cDNAs. + Consequently, these predictions should be used with care. + +EXAMPLES (Note: Most species do not sequences for each different <status>) + for Human: + Homo_sapiens.NCBI36.40.cdna.all.fa.gz + cDNA sequences for all transcripts: known, novel and pseudo + Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz + cDNA sequences for 'ab-initio' prediction transcripts. + +Difference between known and novel transcripts +----------------------------------------------- +Transcript or protein models that can be mapped to species-specific entries +in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl. +Those that cannot be mapped are called novel genes (e.g. genes predicted on +the basis of evidence from closely related species). + +For models annotated by HAVANA the status is set manually. Models that have +an HGNC name are referred to as known and the remaining models are referred to +as novel. + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA file. + +General format: + +>ID SEQTYPE:STATUS LOCATION GENE + +Example of an Ensembl cDNA header: + +>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding + ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS + SEQTYPE + + +README + + ncrna => <<'README', +################## +Fasta RNA dumps +################# + +These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA). + +------------ +FILE NAMES +------------ +The files are consistently named following this pattern: +<species>.<assembly>.<release>.<sequence type>.fa.gz + +<species>: The systematic name of the species. +<assembly>: The assembly build name. +<release>: The release number. +<sequence type>: ncrna for non-coding RNA sequences + +EXAMPLES + for Human: + Homo_sapiens.NCBI36.40.ncrna.fa.gz + Transcript sequences for all ncRNA gene types. + + +------------------------------- +FASTA Sequence Header Lines +------------------------------ +The FASTA sequence header lines are designed to be consistent across +all types of Ensembl FASTA sequences. This gives enough information +for the sequence to be identified outside the context of the FASTA file. + +General format: + +>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA + ^ ^ ^ ^ ^ ^ ^ + ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype + | STATUS + SEQTYPE + + +README + ); + + my $warning = <<'README'; +#### README #### + +IMPORTANT: Please note you can download correlation data tables, +supported by Ensembl, via the highly customisable BioMart and +EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or +http://www.ebi.ac.uk/biomart/ for more information. + +README + + my ( $self, $data_type ) = @_; + my $base_path = $self->fasta_path(); + my $path = File::Spec->catfile( $base_path, $data_type, 'README' ); + my $accession = $self->assembly_accession(); + my $txt = $text{$data_type}; + throw "Cannot find README text for type $data_type" unless $txt; + + #Add accession information if it is available + if($data_type eq 'dna' && $accession) { + my $type = $self->assembly_accession_type(); + $warning .= <<EXTRA; +The genome assembly represented here corresponds to $type +$accession + +EXTRA + } + + work_with_file($path, 'w', sub { + my ($fh) = @_; + print $fh $warning; + print $fh $txt; + return; + }); + return; +} + +1; +