comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Pipeline::FASTA::DumpFile
24
25 =head1 DESCRIPTION
26
27 The main workhorse of the FASTA dumping pipeline. This module has two
28 functions
29
30 =over 8
31
32 =item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats
33
34 =item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included)
35
36 =back
37
38 The script is responsible for creating the filenames of these target
39 files, taking data from the database and the formatting of the FASTA
40 headers. It is also responsible for the creation of README files pertaining
41 to the type of dumps produced. The final files are all Gzipped at normal
42 levels of compression.
43
44 B<N.B.> This code will remove any files already found in the target directory
45 on its first run as it assumes all data will be dumped in the one process. It
46 is selective of its directory meaning a rerun of DNA dumps will not cause
47 the protein/cdna files to be removed.
48
49 Allowed parameters are:
50
51 =over 8
52
53 =item species - The species to dump
54
55 =item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed
56
57 =item release - A required parameter for the version of Ensembl we are dumping for
58
59 =item db_types - Array reference of the database groups to use. Defaults to core
60
61 =item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). Applied before skip_logic_names
62
63 =item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)
64
65
66 =item base_path - The base of the dumps
67
68 =item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
69 process when formatting FASTA files. Normally do not
70 touch
71
72 =item allow_appending - If the same file name is generated we will
73 append into that file rather than overwriting
74
75 =item overwrite_files - If the same file name is generated we will overwrite
76 the into that file rather than appending
77
78 =back
79
80 =cut
81
82 package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;
83
84 use strict;
85 use warnings;
86
87 use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);
88
89 use File::Spec;
90 use IO::Compress::Gzip qw/gzip $GzipError/;
91 use IO::File;
92 use Bio::EnsEMBL::PaddedSlice;
93 use Bio::EnsEMBL::Utils::BiotypeMapper;
94 use Bio::EnsEMBL::Utils::Exception qw/throw/;
95 use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
96 use Bio::EnsEMBL::Utils::IO::FASTASerializer;
97 use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;
98
99 my $DNA_INDEXING_FLOW = 1;
100 my $PEPTIDE_INDEXING_FLOW = 2;
101 my $GENE_INDEXING_FLOW = 3;
102
103 sub param_defaults {
104 my ($self) = @_;
105 return {
106 #user configurable
107 allow_appending => 1,
108 overwrite_files => 0,
109
110 dna_chunk_size => 17000,
111
112 skip_logic_names => [],
113 process_logic_names => [],
114
115 #DON'T MESS
116 #used to track if we need to reopen a file in append mode or not
117 generated_files => {},
118 remove_files_from_dir => {},
119 dataflows => []
120 };
121 }
122
123 sub fetch_input {
124 my ($self) = @_;
125
126 my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
127 $self->param('sequence_types', \%sequence_types);
128
129 my $dba = $self->get_DBAdaptor();
130 my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
131 $self->param('analyses', $analyses);
132
133 my $types = $self->param('db_types');
134 $types = ['core'] unless $types;
135 $self->param('db_types', $types);
136
137 my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
138 $self->param('skip_logic', \%skip_logic_names);
139 $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};
140 my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
141 $self->param('process_logic', \%process_logic_names);
142 $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};
143
144 return;
145 }
146
147 sub run {
148 my ($self) = @_;
149 my $types = $self->param('db_types');
150 foreach my $type (@{$types}) {
151 my $dba = $self->get_DBAdaptor($type);
152 if(! $dba) {
153 $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
154 next;
155 }
156 $self->run_type($type);
157 }
158 return;
159 }
160
161 sub write_output {
162 my ($self) = @_;
163 my $dataflows = $self->param('dataflows');
164 foreach my $flow (@{$dataflows}) {
165 $self->dataflow_output_id(@{$flow});
166 }
167 return;
168 }
169
170 sub run_type {
171 my ($self, $type) = @_;
172
173 my $species = $self->param('species');
174 my $sequence_types = $self->param('sequence_types');
175
176 # dump file for each type on a per slice basis
177 # types are dna,cDNA, peptide, ncRNA
178
179 #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the Stringified ref
180 my $dba = $self->get_DBAdaptor($type);
181 if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
182 $self->info( "Starting dna dump for " . $species );
183 $self->_dump_dna($type);
184 $self->_create_README('dna');
185 }
186
187 if ( $sequence_types->{cdna} ) { #includes peptides whether you like it or not
188 $self->info( "Starting cdna dump for " . $species );
189 my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);
190
191 $self->info( "Starting prediction transcript dumps for " . $species );
192 my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);
193
194 $self->_create_README('cdna') if $transcripts || $pred_transcripts;
195 $self->_create_README('pep') if $peptide || $pred_proteins;
196 }
197 if ( $sequence_types->{ncrna} ) {
198 $self->info( "Starting ncRNA dump for " . $species );
199 my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
200 $self->_create_README('ncrna') if $ncrna_transcripts;
201 }
202
203 $self->cleanup_DBAdaptor($type);
204 }
205
206 # Dump entire sequence, also dump data into chromosome files as appropriate
207 sub _dump_dna {
208 my ($self,$type) = @_;
209
210 my @chromosomes;
211 my @non_chromosomes;
212 my $filter_human = 1;
213 foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
214 my $chr = $s->is_chromosome();
215 push(@chromosomes, $s) if $chr;
216 push(@non_chromosomes, $s) if ! $chr;
217 }
218
219 ############ NON CHROMOSOME WORK
220 $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
221 if(@non_chromosomes) {
222 my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
223 $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
224 my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
225 $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );
226 foreach my $s (@non_chromosomes) {
227 $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
228 }
229 #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
230 $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
231 my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);
232
233 $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
234 $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
235 $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file);
236 $self->info('Dumped non-chromosomes');
237 }
238
239 ############ CHROMOSOME WORK
240 $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
241 foreach my $s (@chromosomes) {
242 my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
243 $self->_generate_fasta_serializer( 'dna', 'chromosome',
244 $s->seq_region_name(), undef);
245 # repeat masked data too
246 my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
247 $self->_generate_fasta_serializer( 'dna_sm', 'chromosome',
248 $s->seq_region_name(), undef);
249
250 $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);
251
252 #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
253 $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
254 my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);
255
256 $self->tidy_file_handle($chromo_fh, $chromo_file_name);
257 $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
258 $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
259 }
260 $self->info("Dumped chromosomes");
261
262 #input_id
263 push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
264 push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
265 push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
266
267 return;
268 }
269
270 sub _dump_slice {
271 my ($self, $s, $serialiser, $rm_serialiser) = @_;
272
273 my $analyses = $self->param('analyses');
274
275 my $chr = $s->is_chromosome();
276 $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
277 $self->info(' Slice is a chromosome') if $chr;
278 $self->info(' Slice is non-chromosomal') if ! $chr;
279
280 # Make a padded slice (to automatically pad with N's outside of known regions)
281 # and make a repeat-masked slice and then pad that too.
282 my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
283 $serialiser->print_Seq($padded_slice);
284
285 my $soft_mask = 1;
286 my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
287 my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
288 $rm_serialiser->print_Seq($padded_masked_slice);
289
290 return;
291 }
292
293 #Assumes we are working with un-compressed files
294 sub _convert_softmask_to_hardmask {
295 my ($self, $soft_mask_file, $soft_mask_fh) = @_;
296 if(! -f $soft_mask_file) {
297 $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
298 return;
299 }
300 my $hard_mask_file = $soft_mask_file;
301 $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
302 my $hm_fh = IO::File->new($hard_mask_file, 'w');
303 $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
304 work_with_file($soft_mask_file, 'r', sub {
305 my ($sm_fh) = @_;
306 while(my $line = <$sm_fh>) {
307 if(index($line, '>') == 0) {
308 $line =~ s/dna_sm/dna_rm/;
309 }
310 else {
311 $line =~ tr/[acgtn]/N/;
312 }
313 print $hm_fh $line;
314 };
315 return;
316 });
317 return ($hm_fh, $hard_mask_file);
318 }
319
320 sub _dump_transcripts {
321 my ($self, $transcript_type, $type) = @_;
322
323 my $has_transcript_data = 0;
324 my $has_protein_data = 0;
325
326 my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
327 my ( $filename, $fh, $transcript_serializer ) =
328 $self->_generate_fasta_serializer( $transcript_type, $transcript_level );
329
330 my ( $peptide_filename, $pep_fh, $peptide_serializer );
331
332 # some cDNAs are translated, make a file to receive them.
333 if ( $transcript_type eq 'cdna') {
334 ( $peptide_filename, $pep_fh, $peptide_serializer ) =
335 $self->_generate_fasta_serializer( 'pep', 'all' );
336 }
337
338 # work out what biotypes correspond to $transcript_type
339 my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
340 my $biotypes_list = $biotype_mapper->group_members($transcript_type);
341
342 my $dba = $self->get_DBAdaptor($type);
343 my $gene_adaptor = $dba->get_GeneAdaptor();
344
345 # get all the transcripts that are $transcript_type e.g. cdna, ncrna,
346 foreach my $biotype ( @{$biotypes_list} ) {
347 my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
348 $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
349 while ( my $gene = shift @{$gene_list} ) {
350 $self->fine( 'Gene %s', $gene->display_id );
351 my $transcript_list = $gene->get_all_Transcripts();
352 foreach my $transcript ( @{$transcript_list} ) {
353 $self->fine( 'Transcript %s', $transcript->display_id );
354 next unless $self->ok_to_process_logic_name($transcript);
355
356 # foreach transcripts of all genes with biotypes classed as cdna
357 my $transcript_seq = $transcript->seq();
358 $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
359 $transcript_serializer->print_Seq($transcript_seq);
360 if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) {
361 my $translation = $transcript->translation();
362 if ($translation) {
363 my $translation_seq = $transcript->translate();
364 $self->_create_display_id($translation, $translation_seq, $transcript_type);
365 $peptide_serializer->print_Seq($translation_seq);
366
367 $has_protein_data = 1;
368 }
369 }
370
371 $has_transcript_data = 1;
372 }
373 }
374 }
375
376 $self->tidy_file_handle( $fh, $filename );
377 if ( $transcript_type eq 'cdna' ) {
378 $self->tidy_file_handle( $pep_fh, $peptide_filename );
379 }
380
381 if($has_protein_data) {
382 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
383 }
384 if($has_transcript_data) {
385 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
386 }
387
388 return ($has_transcript_data, $has_protein_data);
389 }
390
391 # Dump prediction transcripts and peptides. All predicted transcripts have translations
392 sub _dump_prediction_transcripts {
393 my ($self, $type) = @_;
394 my $dba = $self->get_DBAdaptor($type);
395
396 my $has_transcript_data = 0;
397 my $has_protein_data = 0;
398
399 my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
400 my $transcript_list = $prediction_transcript_adaptor->fetch_all();
401 my $count = scalar(@{$transcript_list});
402 $self->info('Found %d prediction transcript(s)', $count);
403 if($count) {
404 my ( $abinitio_filename, $fh, $abinitio_serializer ) =
405 $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
406 my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
407 $self->_generate_fasta_serializer( 'pep', 'abinitio' );
408
409 while ( my $transcript = shift @{$transcript_list} ) {
410 next unless $self->ok_to_process_logic_name($transcript);
411
412 $has_transcript_data = 1;
413 my $transcript_seq = $transcript->seq();
414 $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
415 $abinitio_serializer->print_Seq($transcript_seq);
416
417 my $translation_seq = $transcript->translate();
418 if ( $transcript->translation() ) {
419 $has_protein_data = 1;
420 $self->_create_display_id( $transcript, $translation_seq, 'pep' );
421 $abinitio_peptide_serializer->print_Seq($translation_seq);
422 }
423 }
424
425 $self->tidy_file_handle( $fh, $abinitio_filename );
426 $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );
427
428 if($has_protein_data) {
429 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
430 }
431 if($has_transcript_data) {
432 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
433 }
434 }
435
436
437 return ($has_transcript_data, $has_protein_data);
438 }
439
440 # We can optionally skip the Gzip process & just delegate to the super class
441 # for it's cleanup routines which only work with an open file handle. Therefore
442 # only pass it onto the super implementation *if* the handle was open.
443 # Also only Gzip if the source file exists (it could have been unlinked from
444 # an earlier call)
445
446 sub tidy_file_handle {
447 my ($self, $fh, $path, $no_gzip) = @_;
448 if($fh->opened()) {
449 my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
450 return 1 if $tidy;
451 }
452
453 return if $no_gzip; #don't gzip if we were told to skip
454 return if ! -f $path; #don't gzip if we had no file
455
456 my $target = $path.".gz";
457 $self->info('Gzipping "%s"', $path);
458 my %args;
459 if($self->param('generated_files')->{$target}) {
460 if($self->param('allow_appending')) {
461 $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
462 $args{Append} = 1;
463 }
464 elsif($self->param('overwrite_files')) {
465 $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
466 }
467 else {
468 $self->throw("Cannot continue. The file %s has already been created this session. Fail!");
469 }
470 }
471 gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
472 $self->info(' Removing original file from filesystem');
473 unlink $path or throw "Could not delete $path: $!";
474 $self->info(' Finished');
475 $self->param('generated_files')->{$target} = 1;
476 return 0;
477 }
478
479 #We assume a transcript is ok to process unless proven otherwise
480 sub ok_to_process_logic_name {
481 my ($self, $transcript) = @_;
482 my $ok = 1;
483 my $logic_name = $transcript->analysis()->logic_name();
484 if($self->param('process_logic_active')) {
485 if(! $self->param('process_logic')->{$logic_name}) {
486 $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
487 $ok = 0;
488 }
489 }
490 if($self->param('skip_logic_active')) {
491 if($self->param('skip_logic')->{$logic_name}) {
492 $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
493 $ok = 0;
494 }
495 }
496 return $ok;
497 }
498
499 #Generates a FASTA serializer but returns the (filename, handle & instance)
500 sub _generate_fasta_serializer {
501 my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
502 $header_formatter ||= $self->_custom_header();
503 my $chunk = $self->param('dna_chunk_size');
504 my $filename = $self->_generate_file_name( $datatype, $level, $section );
505 my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
506 my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
507 return ( $filename, $fh, $ser );
508 }
509
510 #
511 # _generate_file_name(data type, level, section )
512 # dna toplevel undef
513 # dna chromosome 6
514
515 sub _generate_file_name {
516 my ( $self, $data_type, $level, $section ) = @_; #level & section is optional
517
518 # File name format looks like:
519 # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
520 # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
521 # Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
522 # Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
523 my @name_bits;
524 push @name_bits, $self->web_name();
525 push @name_bits, $self->assembly();
526 push @name_bits, $self->param('release');
527 push @name_bits, lc($data_type);
528 push @name_bits, $level if $level;
529 push @name_bits, $section if $section;
530 push @name_bits, 'fa';
531
532 my $file_name = join( '.', @name_bits );
533
534 $data_type =~ s/_[rs]m$//; # remove repeatmask or softmask designation from path component
535 my $data_type_dir = $self->fasta_path($data_type);
536 $self->_remove_files_from_dir($data_type_dir);
537 return File::Spec->catfile( $data_type_dir, $file_name );
538 }
539
540 # Attempts to remove any generated files previously present for the instance
541 # of the Process
542 sub _remove_files_from_dir {
543 my ($self, $dir) = @_;
544 if(! $self->param('remove_files_from_dir')->{$dir}) {
545 $self->unlink_all_files($dir);
546 $self->param('remove_files_from_dir')->{$dir} = 1;
547 }
548 return;
549 }
550
551 ##Logic used to generate the expected format for a FASTA header
552 sub _create_display_id {
553 my ($self, $object, $seq, $type) = @_;
554
555 my $stable_id;
556 my $location;
557 my $decoded_type;
558 my $decoded_status;
559 my %attributes;
560
561 if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
562 $attributes{transcript_biotype} = $object->biotype();
563
564 #If pred transcript then no gene but type & status are different
565 if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
566 $stable_id = $object->stable_id();
567 $location = $object->feature_Slice()->name();
568 $decoded_type = $type;
569 $decoded_status = lc($object->analysis()->logic_name());
570 if($type eq 'pep') {
571 $attributes{transcript} = $stable_id;
572 }
573 }
574 #Must be a real "transcript"
575 else {
576 $stable_id = $object->stable_id();
577 $location = $object->feature_Slice()->name();
578 my $gene = $object->get_Gene();
579 $attributes{gene} = $gene->stable_id();
580 $attributes{gene_biotype} = $gene->biotype();
581
582 #If ncRNA then we set type to the logic name and status to gene's biotype (taken from original script)
583 if($type eq 'ncrna') {
584 $decoded_type = lc($object->analysis()->logic_name());
585 $decoded_status = $gene->biotype();
586 }
587 elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
588 $decoded_type = $type;
589 $decoded_status = 'pseudogene';
590 }
591 #Otherwise use type & object's transcript's status
592 else {
593 $decoded_type = $type;
594 $decoded_status = lc($object->status());
595 }
596 }
597 }
598 #If it's a translation then grab the transcript and gene then set accordingly
599 elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
600 my $transcript = $object->transcript();
601 my $gene = $transcript->get_Gene();
602 $stable_id = $object->stable_id();
603 $location = $transcript->feature_Slice()->name();
604 %attributes = (
605 gene => $gene->stable_id(),
606 gene_biotype => $gene->biotype(),
607 transcript => $transcript->stable_id(),
608 transcript_biotype => $transcript->biotype()
609 );
610 $decoded_type = 'pep';
611 $decoded_status = lc($transcript->status());
612 }
613 else {
614 throw sprintf( 'Do not understand how to format a display_id for type "%s"',
615 ref($object) );
616 }
617
618 my $attr_str = join(q{ },
619 map { $_.':'.$attributes{$_} }
620 grep { exists $attributes{$_} }
621 qw/gene transcript gene_biotype transcript_biotype/);
622
623 my $format = '%s %s:%s %s %s';
624
625 my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
626 $seq->display_id($id);
627
628 return;
629 }
630
631 sub _custom_header {
632 my ($self) = @_;
633 return sub {
634 my $slice = shift;
635 if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
636 return $slice->display_id();
637 }
638
639 #RMS means masked data. soft_mask() true means it was softmasked
640 my $dna_type = 'dna';
641 if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
642 $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
643 }
644
645 my $id = $slice->seq_region_name;
646 my $idtype = $slice->coord_system->name;
647 my $location = $slice->name;
648 my $ref = $slice->assembly_exception_type();
649 my $header = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);
650 return $header;
651 };
652 }
653
654 sub _final_filename {
655 my ($self, $filename) = @_;
656 return $filename if $filename =~ /\.gz$/;
657 return $filename.'.gz';
658 }
659
660 sub assembly_accession {
661 my ($self) = @_;
662 my $mc = $self->get_DBAdaptor()->get_MetaContainer();
663 return $mc->single_value_by_key('assembly.accession');
664 }
665
666 sub assembly_accession_type {
667 my ($self) = @_;
668 my $mc = $self->get_DBAdaptor()->get_MetaContainer();
669 return $mc->single_value_by_key('assembly.web_accession_type');
670 }
671
672 sub _create_README {
673
674 #Text for readme files
675
676 my %text = (
677 dna => <<'README',
678 #######################
679 Fasta DNA dumps
680 #######################
681
682 -----------
683 FILE NAMES
684 ------------
685 The files are consistently named following this pattern:
686 <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
687
688 <species>: The systematic name of the species.
689 <assembly>: The assembly build name.
690 <release>: The release number.
691 <sequence type>:
692 * 'dna' - unmasked genomic DNA sequences.
693 * 'dna_rm' - masked genomic DNA. Interspersed repeats and low
694 complexity regions are detected with the RepeatMasker tool and masked
695 by replacing repeats with 'N's.
696 * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
697 have been replaced with lowercased versions of their nucleic base
698 <id type> One of the following:
699 * 'chromosome'a - The top-level coordinate system in most species in Ensembl
700 * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
701 * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
702 -- 'scaffold' - Larger sequence contigs from the assembly of shorter
703 sequencing reads (often from whole genome shotgun, WGS) which could
704 not yet be assembled into chromosomes. Often more genome sequencing
705 is needed to narrow gaps and establish a tiling path.
706 -- 'chunk' - While contig sequences can be assembled into large entities,
707 they sometimes have to be artificially broken down into smaller entities
708 called 'chunks'. This is due to limitations in the annotation
709 pipeline and the finite record size imposed by MySQL which stores the
710 sequence and annotation information.
711 -- 'clone' - In general this is the smallest sequence entity. It is often
712 identical to the sequence of one BAC clone, or sequence region
713 of one BAC clone which forms the tiling path.
714 <id>: The actual sequence identifier. Depending on the <id type> the <id>
715 could represent the name of a chromosome, a scaffold, a contig, a clone ..
716 Field is empty for seqlevel files
717 fa: All files in these directories represent FASTA database files
718 gz: All files are compacted with GNU Zip for storage efficiency.
719
720 -----------
721 TOPLEVEL
722 ----------
723 These files contain the full sequence of the assembly in fasta format.
724 They contain one chromosome per file.
725
726 EXAMPLES
727 The genomic sequence of human chromosome 1:
728 Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz
729
730 The masked version of the genome sequence on human chromosome 1
731 (contains '_rm' or '_sm' in the name):
732 Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
733 Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz
734
735 Non-chromosomal assembly sequences:
736 e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
737 Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
738 Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
739 Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz
740
741
742 --------------
743 SPECIAL CASES
744 --------------
745 Some chromosomes have alternate haplotypes which are presented in files with
746 the haplotype sequence only:
747 Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
748 Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz
749
750
751 Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
752 on the Y is annotated. By definition the PAR region is identical on the
753 X and Y chromosome. We provide this sequence in the following way.
754 -- The Y chromosome file contains the complete sequence of the PAR:
755 Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
756 -- The top level file includes only the unique portion of Y (i.e. the PAR
757 (region is N-masked):
758 Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz
759
760 README
761
762 pep => <<'README',
763 ####################
764 Fasta Peptide dumps
765 ####################
766 These files hold the protein translations of Ensembl gene predictions.
767
768 -----------
769 FILE NAMES
770 ------------
771 The files are consistently named following this pattern:
772 <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
773
774 <species>: The systematic name of the species.
775 <assembly>: The assembly build name.
776 <release>: The release number.
777 <sequence type>: pep for peptide sequences
778 <status>
779 * 'pep.all' - the super-set of all translations resulting from Ensembl known
780 or novel gene predictions.
781 * 'pep.abinitio' translations resulting from 'ab initio' gene
782 prediction algorithms such as SNAP and GENSCAN. In general, all
783 'ab initio' predictions are based solely on the genomic sequence and
784 not any other experimental evidence. Therefore, not all GENSCAN
785 or SNAP predictions represent biologically real proteins.
786 fa : All files in these directories represent FASTA database files
787 gz : All files are compacted with GNU Zip for storage efficiency.
788
789 EXAMPLES (Note: Most species do not sequences for each different <status>)
790 for Human:
791 Homo_sapiens.NCBI36.40.pep.all.fa.gz
792 contains all known and novel peptides
793 Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
794 contains all abinitio predicted peptide
795
796 Difference between known and novel
797 ----------------------------------
798 Protein models that can be mapped to species-specific entries in
799 Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
800 known genes. Those that cannot be mapped are called novel
801 (e.g. genes predicted on the basis of evidence from closely related species).
802
803 For models annotated by HAVANA the status is set manually. Models that have
804 an HGNC name are referred to as known and the remaining models are referred to
805 as novel.
806
807 -------------------------------
808 FASTA Sequence Header Lines
809 ------------------------------
810 The FASTA sequence header lines are designed to be consistent across
811 all types of Ensembl FASTA sequences. This gives enough information
812 for the sequence to be identified outside the context of the FASTA
813 database file.
814
815 General format:
816
817 >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
818
819 Example of Ensembl Peptide header:
820
821 >ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
822 ^ ^ ^ ^ ^ ^ ^ ^
823 ID | | LOCATION GENE:stable gene ID | GENE: gene biotype TRANSCRIPT: transcript biotype
824 | STATUS TRANSCRIPT: stable transcript ID
825 SEQTYPE
826
827 README
828
829 cdna => <<'README',
830 ##################
831 Fasta cDNA dumps
832 #################
833
834 These files hold the cDNA sequences corresponding to Ensembl gene predictions.
835
836 ------------
837 FILE NAMES
838 ------------
839 The files are consistently named following this pattern:
840 <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
841
842 <species>: The systematic name of the species.
843 <assembly>: The assembly build name.
844 <release>: The release number.
845 <sequence type>: cdna for cDNA sequences
846 <status>
847 * 'cdna.all' - the super-set of all transcripts resulting from
848 Ensembl known, novel and pseudo gene predictions (see more below).
849 * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
850 algorithms such as SNAP and GENSCAN. In general all 'ab initio'
851 predictions are solely based on the genomic sequence and do not
852 use other experimental evidence. Therefore, not all GENSCAN or SNAP
853 cDNA predictions represent biologically real cDNAs.
854 Consequently, these predictions should be used with care.
855
856 EXAMPLES (Note: Most species do not sequences for each different <status>)
857 for Human:
858 Homo_sapiens.NCBI36.40.cdna.all.fa.gz
859 cDNA sequences for all transcripts: known, novel and pseudo
860 Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
861 cDNA sequences for 'ab-initio' prediction transcripts.
862
863 Difference between known and novel transcripts
864 -----------------------------------------------
865 Transcript or protein models that can be mapped to species-specific entries
866 in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
867 Those that cannot be mapped are called novel genes (e.g. genes predicted on
868 the basis of evidence from closely related species).
869
870 For models annotated by HAVANA the status is set manually. Models that have
871 an HGNC name are referred to as known and the remaining models are referred to
872 as novel.
873
874 -------------------------------
875 FASTA Sequence Header Lines
876 ------------------------------
877 The FASTA sequence header lines are designed to be consistent across
878 all types of Ensembl FASTA sequences. This gives enough information
879 for the sequence to be identified outside the context of the FASTA file.
880
881 General format:
882
883 >ID SEQTYPE:STATUS LOCATION GENE
884
885 Example of an Ensembl cDNA header:
886
887 >ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
888 ^ ^ ^ ^ ^ ^ ^
889 ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype
890 | STATUS
891 SEQTYPE
892
893
894 README
895
896 ncrna => <<'README',
897 ##################
898 Fasta RNA dumps
899 #################
900
901 These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).
902
903 ------------
904 FILE NAMES
905 ------------
906 The files are consistently named following this pattern:
907 <species>.<assembly>.<release>.<sequence type>.fa.gz
908
909 <species>: The systematic name of the species.
910 <assembly>: The assembly build name.
911 <release>: The release number.
912 <sequence type>: ncrna for non-coding RNA sequences
913
914 EXAMPLES
915 for Human:
916 Homo_sapiens.NCBI36.40.ncrna.fa.gz
917 Transcript sequences for all ncRNA gene types.
918
919
920 -------------------------------
921 FASTA Sequence Header Lines
922 ------------------------------
923 The FASTA sequence header lines are designed to be consistent across
924 all types of Ensembl FASTA sequences. This gives enough information
925 for the sequence to be identified outside the context of the FASTA file.
926
927 General format:
928
929 >ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
930 ^ ^ ^ ^ ^ ^ ^
931 ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype
932 | STATUS
933 SEQTYPE
934
935
936 README
937 );
938
939 my $warning = <<'README';
940 #### README ####
941
942 IMPORTANT: Please note you can download correlation data tables,
943 supported by Ensembl, via the highly customisable BioMart and
944 EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
945 http://www.ebi.ac.uk/biomart/ for more information.
946
947 README
948
949 my ( $self, $data_type ) = @_;
950 my $base_path = $self->fasta_path();
951 my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
952 my $accession = $self->assembly_accession();
953 my $txt = $text{$data_type};
954 throw "Cannot find README text for type $data_type" unless $txt;
955
956 #Add accession information if it is available
957 if($data_type eq 'dna' && $accession) {
958 my $type = $self->assembly_accession_type();
959 $warning .= <<EXTRA;
960 The genome assembly represented here corresponds to $type
961 $accession
962
963 EXTRA
964 }
965
966 work_with_file($path, 'w', sub {
967 my ($fh) = @_;
968 print $fh $warning;
969 print $fh $txt;
970 return;
971 });
972 return;
973 }
974
975 1;
976