Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
=pod =head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 NAME Bio::EnsEMBL::Pipeline::FASTA::DumpFile =head1 DESCRIPTION The main workhorse of the FASTA dumping pipeline. This module has two functions =over 8 =item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats =item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included) =back The script is responsible for creating the filenames of these target files, taking data from the database and the formatting of the FASTA headers. It is also responsible for the creation of README files pertaining to the type of dumps produced. The final files are all Gzipped at normal levels of compression. B<N.B.> This code will remove any files already found in the target directory on its first run as it assumes all data will be dumped in the one process. It is selective of its directory meaning a rerun of DNA dumps will not cause the protein/cdna files to be removed. Allowed parameters are: =over 8 =item species - The species to dump =item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed =item release - A required parameter for the version of Ensembl we are dumping for =item db_types - Array reference of the database groups to use. Defaults to core =item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). 
Applied before skip_logic_names

=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)

=item base_path - The base of the dumps

=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
                       process when formatting FASTA files. Normally do not touch

=item allow_appending - If the same file name is generated we will
                        append into that file rather than overwriting

=item overwrite_files - If the same file name is generated we will overwrite
                        that file rather than appending

=back

=cut

package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);

use File::Spec;
use IO::Compress::Gzip qw/gzip $GzipError/;
use IO::File;
use Bio::EnsEMBL::PaddedSlice;
use Bio::EnsEMBL::Utils::BiotypeMapper;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use Bio::EnsEMBL::Utils::IO::FASTASerializer;
use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;

# eHive dataflow branch numbers for the downstream indexing analyses.
my $DNA_INDEXING_FLOW     = 1;
my $PEPTIDE_INDEXING_FLOW = 2;
my $GENE_INDEXING_FLOW    = 3;

# Default parameters for this Process; user-configurable ones first,
# followed by internal book-keeping state that must not be overridden.
sub param_defaults {
  my ($self) = @_;
  return {
    #user configurable
    allow_appending     => 1,
    overwrite_files     => 0,
    dna_chunk_size      => 17000,
    skip_logic_names    => [],
    process_logic_names => [],

    #DON'T MESS
    #used to track if we need to reopen a file in append mode or not
    generated_files       => {},
    remove_files_from_dir => {},
    dataflows             => []
  };
}

# Normalise the input parameters into lookup structures:
# sequence types, repeat analyses (from meta), db types and the
# process/skip logic-name filters used by ok_to_process_logic_name().
sub fetch_input {
  my ($self) = @_;

  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%sequence_types);

  my $dba = $self->get_DBAdaptor();
  my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
  $self->param('analyses', $analyses);

  my $types = $self->param('db_types');
  $types = ['core'] unless $types;
  $self->param('db_types', $types);

  my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
  $self->param('skip_logic', \%skip_logic_names);
  $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};

  my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
  $self->param('process_logic', \%process_logic_names);
  $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};

  return;
}

# Run the dumps once per requested database group, skipping groups for
# which no DBAdaptor can be found.
sub run {
  my ($self) = @_;
  my $types = $self->param('db_types');
  foreach my $type (@{$types}) {
    my $dba = $self->get_DBAdaptor($type);
    if(! $dba) {
      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
      next;
    }
    $self->run_type($type);
  }
  return;
}

# Flush the dataflow events accumulated during run() into eHive.
sub write_output {
  my ($self) = @_;
  my $dataflows = $self->param('dataflows');
  foreach my $flow (@{$dataflows}) {
    $self->dataflow_output_id(@{$flow});
  }
  return;
}

# Perform the requested dumps (dna / cdna+pep / ncrna) for one db group
# and write the matching README files.
sub run_type {
  my ($self, $type) = @_;

  my $species = $self->param('species');
  my $sequence_types = $self->param('sequence_types');

  # dump file for each type on a per slice basis
  # types are dna, cDNA, peptide, ncRNA

  #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref
  my $dba = $self->get_DBAdaptor($type);
  if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
    $self->info( "Starting dna dump for " . $species );
    $self->_dump_dna($type);
    $self->_create_README('dna');
  }

  if ( $sequence_types->{cdna} ) {    #includes peptides whether you like it or not
    $self->info( "Starting cdna dump for " . $species );
    my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);

    $self->info( "Starting prediction transcript dumps for " . $species );
    my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);

    $self->_create_README('cdna') if $transcripts || $pred_transcripts;
    $self->_create_README('pep')  if $peptide     || $pred_proteins;
  }

  if ( $sequence_types->{ncrna} ) {
    $self->info( "Starting ncRNA dump for " . $species );
    my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
    $self->_create_README('ncrna') if $ncrna_transcripts;
  }

  $self->cleanup_DBAdaptor($type);
}

# Dump entire sequence, also dump data into chromosome files as appropriate.
# Produces unmasked (dna) and soft-masked (dna_sm) files directly; the
# hard-masked (dna_rm) files are derived from the soft-masked output.
sub _dump_dna {
  my ($self, $type) = @_;

  my @chromosomes;
  my @non_chromosomes;
  my $filter_human = 1;
  foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
    my $chr = $s->is_chromosome();
    push(@chromosomes, $s) if $chr;
    push(@non_chromosomes, $s) if ! $chr;
  }

  ############ NON CHROMOSOME WORK
  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
  if(@non_chromosomes) {
    my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
    my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );

    foreach my $s (@non_chromosomes) {
      $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
    }

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
    my ($hard_mask_fh, $hard_mask_file) =
      $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);

    $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
    $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
    $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file );
    $self->info('Dumped non-chromosomes');
  }

  ############ CHROMOSOME WORK
  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
  foreach my $s (@chromosomes) {
    my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'chromosome', $s->seq_region_name(), undef );
    # repeat masked data too
    my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'chromosome', $s->seq_region_name(), undef );

    $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
    my ($chromo_hard_mask_fh, $chromo_hard_mask_file) =
      $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);

    $self->tidy_file_handle($chromo_fh, $chromo_file_name);
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
    $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
  }
  $self->info("Dumped chromosomes");

  #input_id
  push(@{$self->param('dataflows')}, [{ data_type => 'dna',    species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);

  return;
}

# Serialize one slice twice: as-is (padded with Ns outside known regions)
# and soft-masked via the configured repeat analyses.
sub _dump_slice {
  my ($self, $s, $serialiser, $rm_serialiser) = @_;

  my $analyses = $self->param('analyses');

  my $chr = $s->is_chromosome();
  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
  $self->info('    Slice is a chromosome') if $chr;
  $self->info('    Slice is non-chromosomal') if ! $chr;

  # Make a padded slice (to automatically pad with N's outside of known regions)
  # and make a repeat-masked slice and then pad that too.
  my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
  $serialiser->print_Seq($padded_slice);

  my $soft_mask = 1;
  my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
  my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
  $rm_serialiser->print_Seq($padded_masked_slice);

  return;
}

#Assumes we are working with un-compressed files
# Derive a hard-masked (dna_rm) file from a soft-masked (dna_sm) file by
# uppercasing masked (lowercase) bases into Ns. Returns the open handle and
# path of the new file, or nothing if the source file has gone.
sub _convert_softmask_to_hardmask {
  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
  if(! -f $soft_mask_file) {
    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
    return;
  }
  my $hard_mask_file = $soft_mask_file;
  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
  my $hm_fh = IO::File->new($hard_mask_file, 'w');
  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
  work_with_file($soft_mask_file, 'r', sub {
    my ($sm_fh) = @_;
    while(my $line = <$sm_fh>) {
      if(index($line, '>') == 0) {
        $line =~ s/dna_sm/dna_rm/;
      }
      else {
        # Bug fix: tr/// takes plain character lists, not [] classes; the
        # original tr/[acgtn]/N/ also translated literal '[' and ']' chars.
        $line =~ tr/acgtn/N/;
      }
      print $hm_fh $line;
    }
    return;
  });
  return ($hm_fh, $hard_mask_file);
}

# Dump all transcripts of the given type (cdna or ncrna) gene-by-gene;
# for cdna also dump the corresponding peptide translations. Returns two
# flags: (wrote any transcripts, wrote any proteins).
sub _dump_transcripts {
  my ($self, $transcript_type, $type) = @_;

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
  my ( $filename, $fh, $transcript_serializer ) =
    $self->_generate_fasta_serializer( $transcript_type, $transcript_level );

  my ( $peptide_filename, $pep_fh, $peptide_serializer );
  # some cDNAs are translated, make a file to receive them.
  if ( $transcript_type eq 'cdna' ) {
    ( $peptide_filename, $pep_fh, $peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'all' );
  }

  # work out what biotypes correspond to $transcript_type
  my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
  my $biotypes_list  = $biotype_mapper->group_members($transcript_type);

  my $dba = $self->get_DBAdaptor($type);
  my $gene_adaptor = $dba->get_GeneAdaptor();

  # get all the transcripts that are $transcript_type e.g. cdna, ncrna
  foreach my $biotype ( @{$biotypes_list} ) {
    my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
    $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
    # shift to release each Gene as we go rather than holding the whole list
    while ( my $gene = shift @{$gene_list} ) {
      $self->fine( 'Gene %s', $gene->display_id );
      my $transcript_list = $gene->get_all_Transcripts();
      foreach my $transcript ( @{$transcript_list} ) {
        $self->fine( 'Transcript %s', $transcript->display_id );
        next unless $self->ok_to_process_logic_name($transcript);

        # foreach transcripts of all genes with biotypes classed as cdna
        my $transcript_seq = $transcript->seq();
        $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
        $transcript_serializer->print_Seq($transcript_seq);

        # NOTE(review): $peptide_serializer is only created for cdna dumps;
        # this assumes no ncrna biotype is in the 'peptide_producing' group.
        if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing' )) {
          my $translation = $transcript->translation();
          if ($translation) {
            my $translation_seq = $transcript->translate();
            $self->_create_display_id($translation, $translation_seq, $transcript_type);
            $peptide_serializer->print_Seq($translation_seq);
            $has_protein_data = 1;
          }
        }
        $has_transcript_data = 1;
      }
    }
  }

  $self->tidy_file_handle( $fh, $filename );
  if ( $transcript_type eq 'cdna' ) {
    $self->tidy_file_handle( $pep_fh, $peptide_filename );
  }

  if($has_protein_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
  }
  if($has_transcript_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
  }

  return ($has_transcript_data, $has_protein_data);
}

# Dump prediction transcripts and peptides. All predicted transcripts have translations
sub _dump_prediction_transcripts {
  my ($self, $type) = @_;
  my $dba = $self->get_DBAdaptor($type);

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
  my $transcript_list = $prediction_transcript_adaptor->fetch_all();
  my $count = scalar(@{$transcript_list});
  $self->info('Found %d prediction transcript(s)', $count);
  if($count) {
    my ( $abinitio_filename, $fh, $abinitio_serializer ) =
      $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
    my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'abinitio' );
    while ( my $transcript = shift @{$transcript_list} ) {
      next unless $self->ok_to_process_logic_name($transcript);

      $has_transcript_data = 1;
      my $transcript_seq = $transcript->seq();
      $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
      $abinitio_serializer->print_Seq($transcript_seq);

      if ( $transcript->translation() ) {
        $has_protein_data = 1;
        # only translate once we know a translation exists (avoids wasted
        # work; the original translated unconditionally)
        my $translation_seq = $transcript->translate();
        $self->_create_display_id( $transcript, $translation_seq, 'pep' );
        $abinitio_peptide_serializer->print_Seq($translation_seq);
      }
    }

    $self->tidy_file_handle( $fh, $abinitio_filename );
    $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );

    if($has_protein_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
    }
    if($has_transcript_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
    }
  }

  return ($has_transcript_data, $has_protein_data);
}

# We can optionally skip the Gzip process & just delegate to the super class
# for it's cleanup routines which only work with an open file handle. Therefore
# only pass it onto the super implementation *if* the handle was open.
# Also only Gzip if the source file exists (it could have been unlinked from
# an earlier call)
sub tidy_file_handle {
  my ($self, $fh, $path, $no_gzip) = @_;
  if($fh->opened()) {
    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
    return 1 if $tidy;
  }

  return if $no_gzip;    #don't gzip if we were told to skip
  return if ! -f $path;  #don't gzip if we had no file

  my $target = $path.".gz";
  $self->info('Gzipping "%s"', $path);
  my %args;
  if($self->param('generated_files')->{$target}) {
    if($self->param('allow_appending')) {
      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
      $args{Append} = 1;
    }
    elsif($self->param('overwrite_files')) {
      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
    }
    else {
      # Bug fix: the original passed a bare '%s' format with no argument,
      # so the error never contained the offending file name.
      $self->throw(sprintf('Cannot continue. The file %s has already been created this session. Fail!', $target));
    }
  }
  gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
  $self->info('  Removing original file from filesystem');
  unlink $path or throw "Could not delete $path: $!";
  $self->info('  Finished');
  $self->param('generated_files')->{$target} = 1;
  return 0;
}

#We assume a transcript is ok to process unless proven otherwise
sub ok_to_process_logic_name {
  my ($self, $transcript) = @_;
  my $ok = 1;
  my $logic_name = $transcript->analysis()->logic_name();
  if($self->param('process_logic_active')) {
    if(! $self->param('process_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  if($self->param('skip_logic_active')) {
    if($self->param('skip_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  return $ok;
}

#Generates a FASTA serializer but returns the (filename, handle & instance)
sub _generate_fasta_serializer {
  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
  $header_formatter ||= $self->_custom_header();
  my $chunk = $self->param('dna_chunk_size');
  my $filename = $self->_generate_file_name( $datatype, $level, $section );
  my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
  my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
  return ( $filename, $fh, $ser );
}

#
# _generate_file_name(data type, level, section )
#                      dna       toplevel undef
#                      dna       chromosome 6
sub _generate_file_name {
  my ( $self, $data_type, $level, $section ) = @_;    #level & section is optional

  # File name format looks like:
  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
  # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
  #      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
  #      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
  my @name_bits;
  push @name_bits, $self->web_name();
  push @name_bits, $self->assembly();
  push @name_bits, $self->param('release');
  push @name_bits, lc($data_type);
  push @name_bits, $level if $level;
  push @name_bits, $section if $section;
  push @name_bits, 'fa';

  my $file_name = join( '.', @name_bits );

  $data_type =~ s/_[rs]m$//;    # remove repeatmask or softmask designation from path component
  my $data_type_dir = $self->fasta_path($data_type);
  $self->_remove_files_from_dir($data_type_dir);
  return File::Spec->catfile( $data_type_dir, $file_name );
}

# Attempts to remove any generated files previously present for the instance
# of the Process
sub _remove_files_from_dir {
  my ($self, $dir) = @_;
  if(! $self->param('remove_files_from_dir')->{$dir}) {
    $self->unlink_all_files($dir);
    $self->param('remove_files_from_dir')->{$dir} = 1;
  }
  return;
}

##Logic used to generate the expected format for a FASTA header
sub _create_display_id {
  my ($self, $object, $seq, $type) = @_;

  my $stable_id;
  my $location;
  my $decoded_type;
  my $decoded_status;
  my %attributes;

  if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
    $attributes{transcript_biotype} = $object->biotype();
    #If pred transcript then no gene but type & status are different
    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      $decoded_type = $type;
      $decoded_status = lc($object->analysis()->logic_name());
      if($type eq 'pep') {
        $attributes{transcript} = $stable_id;
      }
    }
    #Must be a real "transcript"
    else {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      my $gene = $object->get_Gene();
      $attributes{gene} = $gene->stable_id();
      $attributes{gene_biotype} = $gene->biotype();
      #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script)
      if($type eq 'ncrna') {
        $decoded_type = lc($object->analysis()->logic_name());
        $decoded_status = $gene->biotype();
      }
      elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
        $decoded_type = $type;
        $decoded_status = 'pseudogene';
      }
      #Otherwise use type & object's transcript's status
      else {
        $decoded_type = $type;
        $decoded_status = lc($object->status());
      }
    }
  }
  #If it's a translation then grab the transcript and gene then set accordingly
  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
    my $transcript = $object->transcript();
    my $gene = $transcript->get_Gene();
    $stable_id = $object->stable_id();
    $location = $transcript->feature_Slice()->name();
    %attributes = (
      gene => $gene->stable_id(),
      gene_biotype => $gene->biotype(),
      transcript => $transcript->stable_id(),
      transcript_biotype => $transcript->biotype()
    );
    $decoded_type = 'pep';
    $decoded_status = lc($transcript->status());
  }
  else {
    throw sprintf( 'Do not understand how to format a display_id for type "%s"', ref($object) );
  }

  my $attr_str = join(q{ },
    map { $_.':'.$attributes{$_} }
    grep { exists $attributes{$_} }
    qw/gene transcript gene_biotype transcript_biotype/);

  my $format = '%s %s:%s %s %s';
  my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
  $seq->display_id($id);

  return;
}

# Returns a closure which formats the FASTA header for genomic slices;
# non-Slice objects fall back to their own display_id().
sub _custom_header {
  my ($self) = @_;
  return sub {
    my $slice = shift;
    if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
      return $slice->display_id();
    }

    #RMS means masked data. soft_mask() true means it was softmasked
    my $dna_type = 'dna';
    if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
      $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
    }

    my $id       = $slice->seq_region_name;
    my $idtype   = $slice->coord_system->name;
    my $location = $slice->name;
    my $ref      = $slice->assembly_exception_type();
    my $header   = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);
    return $header;
  };
}

# The on-disk name of a dump once gzipped; idempotent for names already
# carrying a .gz suffix.
sub _final_filename {
  my ($self, $filename) = @_;
  return $filename if $filename =~ /\.gz$/;
  return $filename.'.gz';
}

sub assembly_accession {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.accession');
}

sub assembly_accession_type {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.web_accession_type');
}

# Write the README file for the given data type (dna/cdna/pep/ncrna) into
# that type's dump directory, prefixed by a generic BioMart notice and,
# for dna, the assembly accession when available.
sub _create_README {

  #Text for readme files
  my %text = (
    dna => <<'README',
#######################
Fasta DNA dumps
#######################

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>:
  * 'dna' - unmasked genomic DNA sequences.
  * 'dna_rm' - masked genomic DNA.  Interspersed repeats and low
     complexity regions are detected with the RepeatMasker tool and masked
     by replacing repeats with 'N's.
  * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
    have been replaced with lowercased versions of their nucleic base
<id type> One of the following:
  * 'chromosome' - The top-level coordinate system in most species in Ensembl
  * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
  * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
     -- 'scaffold' - Larger sequence contigs from the assembly of shorter
        sequencing reads (often from whole genome shotgun, WGS) which could
        not yet be assembled into chromosomes. Often more genome sequencing
        is needed to narrow gaps and establish a tiling path.
     -- 'chunk' - While contig sequences can be assembled into large entities,
        they sometimes have to be artificially broken down into smaller
        entities called 'chunks'. This is due to limitations in the annotation
        pipeline and the finite record size imposed by MySQL which stores the
        sequence and annotation information.
     -- 'clone' - In general this is the smallest sequence entity. It is often
        identical to the sequence of one BAC clone, or sequence region of one
        BAC clone which forms the tiling path.
<id>: The actual sequence identifier. Depending on the <id type> the <id>
      could represent the name of a chromosome, a scaffold, a contig, a clone ..
      Field is empty for seqlevel files
fa: All files in these directories represent FASTA database files
gz: All files are compacted with GNU Zip for storage efficiency.

-----------
TOPLEVEL
----------
These files contain the full sequence of the assembly in fasta format.
They contain one chromosome per file.

EXAMPLES
   The genomic sequence of human chromosome 1:
     Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz

   The masked version of the genome sequence on human chromosome 1
   (contains '_rm' or '_sm' in the name):
     Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
     Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz

   Non-chromosomal assembly sequences:
   e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
     Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
     Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
     Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz

--------------
SPECIAL CASES
--------------
Some chromosomes have alternate haplotypes which are presented in files with
the haplotype sequence only:
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz

Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
on the Y is annotated. By definition the PAR region is identical on the X and Y
chromosome. We provide this sequence in the following way.
    -- The Y chromosome file contains the complete sequence of the PAR:
       Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
    -- The top level file includes only the unique portion of Y (i.e. the PAR
       region is N-masked):
       Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz
README
    pep => <<'README',
####################
Fasta Peptide dumps
####################
These files hold the protein translations of Ensembl gene predictions.

-----------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: pep for peptide sequences
<status>
  * 'pep.all' - the super-set of all translations resulting from Ensembl known
     or novel gene predictions.
  * 'pep.abinitio' translations resulting from 'ab initio' gene prediction
     algorithms such as SNAP and GENSCAN. In general, all 'ab initio'
     predictions are based solely on the genomic sequence and not any other
     experimental evidence. Therefore, not all GENSCAN or SNAP predictions
     represent biologically real proteins.
fa : All files in these directories represent FASTA database files
gz : All files are compacted with GNU Zip for storage efficiency.

EXAMPLES (Note: Most species do not sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.pep.all.fa.gz
      contains all known and novel peptides
    Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
      contains all abinitio predicted peptide

Difference between known and novel
----------------------------------
Protein models that can be mapped to species-specific entries in Swiss-Prot,
RefSeq or SPTrEMBL are referred to in Ensembl as known genes. Those that cannot
be mapped are called novel (e.g. genes predicted on the basis of evidence from
closely related species).

For models annotated by HAVANA the status is set manually. Models that have an
HGNC name are referred to as known and the remaining models are referred to as
novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA database file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT

Example of Ensembl Peptide header:

>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^   ^     ^                                   ^                  ^                            ^                            ^
 ID              |   |  LOCATION                      GENE: stable gene ID        |                   GENE: gene biotype        TRANSCRIPT: transcript biotype
                 |  STATUS                                        TRANSCRIPT: stable transcript ID
              SEQTYPE
README
    cdna => <<'README',
##################
Fasta cDNA dumps
#################

These files hold the cDNA sequences corresponding to Ensembl gene predictions.

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: cdna for cDNA sequences
<status>
  * 'cdna.all' - the super-set of all transcripts resulting from Ensembl known,
     novel and pseudo gene predictions (see more below).
  * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
     algorithms such as SNAP and GENSCAN. In general all 'ab initio'
     predictions are solely based on the genomic sequence and do not use other
     experimental evidence. Therefore, not all GENSCAN or SNAP cDNA predictions
     represent biologically real cDNAs. Consequently, these predictions should
     be used with care.

EXAMPLES (Note: Most species do not sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.cdna.all.fa.gz
      cDNA sequences for all transcripts: known, novel and pseudo
    Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
      cDNA sequences for 'ab-initio' prediction transcripts.

Difference between known and novel transcripts
-----------------------------------------------
Transcript or protein models that can be mapped to species-specific entries in
Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
Those that cannot be mapped are called novel genes (e.g. genes predicted on
the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have an
HGNC name are referred to as known and the remaining models are referred to as
novel.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE

Example of an Ensembl cDNA header:

>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^    ^     ^                                       ^                    ^                           ^
 ID              |    |  LOCATION                          GENE: gene stable ID  GENE: gene biotype      TRANSCRIPT: transcript biotype
                 |  STATUS
              SEQTYPE
README
    ncrna => <<'README',
##################
Fasta RNA dumps
#################

These files hold the transcript sequences corresponding to non-coding RNA
genes (ncRNA).

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.fa.gz

<species>: The systematic name of the species.
<assembly>: The assembly build name.
<release>: The release number.
<sequence type>: ncrna for non-coding RNA sequences

EXAMPLES
  for Human:
    Homo_sapiens.NCBI36.40.ncrna.fa.gz
      Transcript sequences for all ncRNA gene types.

-------------------------------
FASTA Sequence Header Lines
------------------------------
The FASTA sequence header lines are designed to be consistent across all types
of Ensembl FASTA sequences. This gives enough information for the sequence to
be identified outside the context of the FASTA file.

General format:

>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
 ^               ^     ^     ^                                          ^                    ^                  ^
 ID              |     |  LOCATION                             GENE: gene stable ID  GENE: gene biotype  TRANSCRIPT: transcript biotype
                 |   STATUS
              SEQTYPE
README
  );

  my $warning = <<'README';
#### README ####

IMPORTANT: Please note you can download correlation data tables, supported by
Ensembl, via the highly customisable BioMart and EnsMart data mining tools.
See http://www.ensembl.org/biomart/martview or http://www.ebi.ac.uk/biomart/
for more information.

README

  my ( $self, $data_type ) = @_;
  my $base_path = $self->fasta_path();
  my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
  my $accession = $self->assembly_accession();
  my $txt = $text{$data_type};
  throw "Cannot find README text for type $data_type" unless $txt;

  #Add accession information if it is available
  if($data_type eq 'dna' && $accession) {
    my $type = $self->assembly_accession_type();
    $warning .= <<EXTRA;
The genome assembly represented here corresponds to $type
$accession
EXTRA
  }

  work_with_file($path, 'w', sub {
    my ($fh) = @_;
    print $fh $warning;
    print $fh $txt;
    return;
  });

  return;
}

1;