variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm (changeset 0:1f6dce3d34e0, uploaded by mahtabm, Thu, 11 Apr 2013)
=pod

=head1 LICENSE

Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Pipeline::FASTA::DumpFile

=head1 DESCRIPTION

The main workhorse of the FASTA dumping pipeline. This module has two
functions:

=over 8

=item 1 - Dumping genomic DNA sequences in a memory-efficient manner in unmasked, soft-masked & hard-masked formats

=item 2 - Dumping genes as cDNA, protein and ncRNA transcripts (ab initio predictions included)

=back

The module is responsible for creating the filenames of these target
files, retrieving the data from the database and formatting the FASTA
headers. It is also responsible for creating the README files describing
the type of dumps produced. The final files are all gzipped at normal
levels of compression.

B<N.B.> This code will remove any files already found in the target directory
on its first run, as it assumes all data will be dumped in a single process. It
is selective about its directory, meaning a rerun of the DNA dumps will not
cause the protein/cDNA files to be removed.

Allowed parameters are:

=over 8

=item species - The species to dump

=item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed

=item release - A required parameter giving the Ensembl release we are dumping for

=item db_types - Array reference of the database groups to use. Defaults to core

=item process_logic_names - Array reference of transcript logic names to process exclusively (only produce dumps for these). Applied before skip_logic_names

=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)

=item base_path - The base directory of the dumps

=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
process when formatting FASTA files. Normally do not
touch

=item allow_appending - If the same file name is generated we will
append into that file rather than overwriting

=item overwrite_files - If the same file name is generated we will overwrite
that file rather than appending

=back

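As an illustration only (the values below are placeholders, not taken from any
real configuration), the parameters given to this module might look like:

  {
    species            => 'homo_sapiens',          # placeholder species
    sequence_type_list => ['dna', 'cdna', 'ncrna'],
    release            => 67,                      # placeholder release
    db_types           => ['core'],
    base_path          => '/path/to/fasta/dumps',  # placeholder path
  }

In practice these values come from the pipeline configuration that seeds this
module.
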
=cut

package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);

use File::Spec;
use IO::Compress::Gzip qw/gzip $GzipError/;
use IO::File;
use Bio::EnsEMBL::PaddedSlice;
use Bio::EnsEMBL::Utils::BiotypeMapper;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use Bio::EnsEMBL::Utils::IO::FASTASerializer;
use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;

my $DNA_INDEXING_FLOW = 1;
my $PEPTIDE_INDEXING_FLOW = 2;
my $GENE_INDEXING_FLOW = 3;

sub param_defaults {
  my ($self) = @_;
  return {
    #user configurable
    allow_appending => 1,
    overwrite_files => 0,

    dna_chunk_size => 17000,

    skip_logic_names => [],
    process_logic_names => [],

    #DON'T MESS
    #used to track if we need to reopen a file in append mode or not
    generated_files => {},
    remove_files_from_dir => {},
    dataflows => []
  };
}

sub fetch_input {
  my ($self) = @_;

  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%sequence_types);

  my $dba = $self->get_DBAdaptor();
  my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
  $self->param('analyses', $analyses);

  my $types = $self->param('db_types');
  $types = ['core'] unless $types;
  $self->param('db_types', $types);

  my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
  $self->param('skip_logic', \%skip_logic_names);
  $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};
  my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
  $self->param('process_logic', \%process_logic_names);
  $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};

  return;
}

sub run {
  my ($self) = @_;
  my $types = $self->param('db_types');
  foreach my $type (@{$types}) {
    my $dba = $self->get_DBAdaptor($type);
    if(! $dba) {
      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
      next;
    }
    $self->run_type($type);
  }
  return;
}

sub write_output {
  my ($self) = @_;
  my $dataflows = $self->param('dataflows');
  foreach my $flow (@{$dataflows}) {
    $self->dataflow_output_id(@{$flow});
  }
  return;
}

sub run_type {
  my ($self, $type) = @_;

  my $species = $self->param('species');
  my $sequence_types = $self->param('sequence_types');

  # dump file for each type on a per slice basis
  # types are dna, cDNA, peptide, ncRNA

  #Only run if we are told to & the current DBA is the same as the attached DNADB by checking the stringified ref
  my $dba = $self->get_DBAdaptor($type);
  if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
    $self->info( "Starting dna dump for " . $species );
    $self->_dump_dna($type);
    $self->_create_README('dna');
  }

  if ( $sequence_types->{cdna} ) {    #includes peptides whether you like it or not
    $self->info( "Starting cdna dump for " . $species );
    my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);

    $self->info( "Starting prediction transcript dumps for " . $species );
    my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);

    $self->_create_README('cdna') if $transcripts || $pred_transcripts;
    $self->_create_README('pep') if $peptide || $pred_proteins;
  }
  if ( $sequence_types->{ncrna} ) {
    $self->info( "Starting ncRNA dump for " . $species );
    my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
    $self->_create_README('ncrna') if $ncrna_transcripts;
  }

  $self->cleanup_DBAdaptor($type);
}

# Dump entire sequence, also dump data into chromosome files as appropriate
sub _dump_dna {
  my ($self, $type) = @_;

  my @chromosomes;
  my @non_chromosomes;
  my $filter_human = 1;
  foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
    my $chr = $s->is_chromosome();
    push(@chromosomes, $s) if $chr;
    push(@non_chromosomes, $s) if ! $chr;
  }

  ############ NON CHROMOSOME WORK
  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
  if(@non_chromosomes) {
    my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
    my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );
    foreach my $s (@non_chromosomes) {
      $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
    }
    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
    my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);

    $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
    $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
    $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file );
    $self->info('Dumped non-chromosomes');
  }

  ############ CHROMOSOME WORK
  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
  foreach my $s (@chromosomes) {
    my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'chromosome',
        $s->seq_region_name(), undef );
    # repeat masked data too
    my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'chromosome',
        $s->seq_region_name(), undef );

    $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
    my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);

    $self->tidy_file_handle($chromo_fh, $chromo_file_name);
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
    $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
  }
  $self->info("Dumped chromosomes");

  #input_id
  push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);

  return;
}
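# For reference, each element pushed onto 'dataflows' above is an
# [ $output_id_hashref, $flow_number ] pair that write_output() later hands to
# dataflow_output_id(). With a hypothetical species name the DNA entries would
# look roughly like:
#   [ { data_type => 'dna',    species => 'homo_sapiens' }, 1 ]   # $DNA_INDEXING_FLOW
#   [ { data_type => 'dna_sm', species => 'homo_sapiens' }, 1 ]
#   [ { data_type => 'dna_rm', species => 'homo_sapiens' }, 1 ]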

sub _dump_slice {
  my ($self, $s, $serialiser, $rm_serialiser) = @_;

  my $analyses = $self->param('analyses');

  my $chr = $s->is_chromosome();
  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
  $self->info('  Slice is a chromosome') if $chr;
  $self->info('  Slice is non-chromosomal') if ! $chr;

  # Make a padded slice (to automatically pad with N's outside of known regions)
  # and make a repeat-masked slice and then pad that too.
  my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
  $serialiser->print_Seq($padded_slice);

  my $soft_mask = 1;
  my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
  my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
  $rm_serialiser->print_Seq($padded_masked_slice);

  return;
}

#Assumes we are working with un-compressed files
sub _convert_softmask_to_hardmask {
  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
  if(! -f $soft_mask_file) {
    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
    return;
  }
  my $hard_mask_file = $soft_mask_file;
  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
  my $hm_fh = IO::File->new($hard_mask_file, 'w') or throw "Cannot open $hard_mask_file for writing: $!";
  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
  work_with_file($soft_mask_file, 'r', sub {
    my ($sm_fh) = @_;
    while(my $line = <$sm_fh>) {
      if(index($line, '>') == 0) {
        $line =~ s/dna_sm/dna_rm/;
      }
      else {
        #hard-mask: soft-masked (lower-case) bases become N
        $line =~ tr/acgtn/N/;
      }
      print $hm_fh $line;
    }
    return;
  });
  return ($hm_fh, $hard_mask_file);
}
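# Illustrative example of the conversion above (the sequence is made up): a
# soft-masked line such as
#   ACGTacgtNNNNacgt
# is written to the hard-masked file as
#   ACGTNNNNNNNNNNNN
# i.e. lower-case (soft-masked) bases become 'N', upper-case bases are left
# untouched, and '>' header lines have dna_sm swapped for dna_rm.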

sub _dump_transcripts {
  my ($self, $transcript_type, $type) = @_;

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
  my ( $filename, $fh, $transcript_serializer ) =
    $self->_generate_fasta_serializer( $transcript_type, $transcript_level );

  my ( $peptide_filename, $pep_fh, $peptide_serializer );

  # some cDNAs are translated, make a file to receive them.
  if ( $transcript_type eq 'cdna') {
    ( $peptide_filename, $pep_fh, $peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'all' );
  }

  # work out what biotypes correspond to $transcript_type
  my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
  my $biotypes_list = $biotype_mapper->group_members($transcript_type);

  my $dba = $self->get_DBAdaptor($type);
  my $gene_adaptor = $dba->get_GeneAdaptor();

  # get all the transcripts that are $transcript_type e.g. cdna, ncrna
  foreach my $biotype ( @{$biotypes_list} ) {
    my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
    $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
    while ( my $gene = shift @{$gene_list} ) {
      $self->fine( 'Gene %s', $gene->display_id );
      my $transcript_list = $gene->get_all_Transcripts();
      foreach my $transcript ( @{$transcript_list} ) {
        $self->fine( 'Transcript %s', $transcript->display_id );
        next unless $self->ok_to_process_logic_name($transcript);

        # for each transcript of every gene with a biotype classed as e.g. cdna
        my $transcript_seq = $transcript->seq();
        $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
        $transcript_serializer->print_Seq($transcript_seq);
        if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) {
          my $translation = $transcript->translation();
          if ($translation) {
            my $translation_seq = $transcript->translate();
            $self->_create_display_id($translation, $translation_seq, $transcript_type);
            $peptide_serializer->print_Seq($translation_seq);

            $has_protein_data = 1;
          }
        }

        $has_transcript_data = 1;
      }
    }
  }

  $self->tidy_file_handle( $fh, $filename );
  if ( $transcript_type eq 'cdna' ) {
    $self->tidy_file_handle( $pep_fh, $peptide_filename );
  }

  if($has_protein_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
  }
  if($has_transcript_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
  }

  return ($has_transcript_data, $has_protein_data);
}

# Dump prediction transcripts and peptides. All predicted transcripts have translations
sub _dump_prediction_transcripts {
  my ($self, $type) = @_;
  my $dba = $self->get_DBAdaptor($type);

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
  my $transcript_list = $prediction_transcript_adaptor->fetch_all();
  my $count = scalar(@{$transcript_list});
  $self->info('Found %d prediction transcript(s)', $count);
  if($count) {
    my ( $abinitio_filename, $fh, $abinitio_serializer ) =
      $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
    my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'abinitio' );

    while ( my $transcript = shift @{$transcript_list} ) {
      next unless $self->ok_to_process_logic_name($transcript);

      $has_transcript_data = 1;
      my $transcript_seq = $transcript->seq();
      $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
      $abinitio_serializer->print_Seq($transcript_seq);

      my $translation_seq = $transcript->translate();
      if ( $transcript->translation() ) {
        $has_protein_data = 1;
        $self->_create_display_id( $transcript, $translation_seq, 'pep' );
        $abinitio_peptide_serializer->print_Seq($translation_seq);
      }
    }

    $self->tidy_file_handle( $fh, $abinitio_filename );
    $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );

    if($has_protein_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
    }
    if($has_transcript_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
    }
  }

  return ($has_transcript_data, $has_protein_data);
}

# We can optionally skip the Gzip process & just delegate to the super class
# for its cleanup routines, which only work with an open file handle. Therefore
# only pass it onto the super implementation *if* the handle was open.
# Also only Gzip if the source file exists (it could have been unlinked from
# an earlier call)

sub tidy_file_handle {
  my ($self, $fh, $path, $no_gzip) = @_;
  if($fh->opened()) {
    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
    return 1 if $tidy;
  }

  return if $no_gzip;   #don't gzip if we were told to skip
  return if ! -f $path; #don't gzip if we had no file

  my $target = $path.".gz";
  $self->info('Gzipping "%s"', $path);
  my %args;
  if($self->param('generated_files')->{$target}) {
    if($self->param('allow_appending')) {
      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
      $args{Append} = 1;
    }
    elsif($self->param('overwrite_files')) {
      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
    }
    else {
      $self->throw(sprintf('Cannot continue. The file %s has already been created this session. Fail!', $target));
    }
  }
  gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
  $self->info('  Removing original file from filesystem');
  unlink $path or throw "Could not delete $path: $!";
  $self->info('  Finished');
  $self->param('generated_files')->{$target} = 1;
  return 0;
}

#We assume a transcript is ok to process unless proven otherwise
sub ok_to_process_logic_name {
  my ($self, $transcript) = @_;
  my $ok = 1;
  my $logic_name = $transcript->analysis()->logic_name();
  if($self->param('process_logic_active')) {
    if(! $self->param('process_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  if($self->param('skip_logic_active')) {
    if($self->param('skip_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  return $ok;
}
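# Illustrative example (the logic names are placeholders, not values from any
# real configuration):
#   process_logic_names => ['ensembl']   # dump only transcripts whose analysis
#                                        # logic_name is 'ensembl'
#   skip_logic_names    => ['estgene']   # dump everything except transcripts
#                                        # from the 'estgene' analysis
# As documented in the POD, process_logic_names is applied before
# skip_logic_names.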

#Generates a FASTA serializer and returns the filename, file handle & serializer instance
sub _generate_fasta_serializer {
  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
  $header_formatter ||= $self->_custom_header();
  my $chunk = $self->param('dna_chunk_size');
  my $filename = $self->_generate_file_name( $datatype, $level, $section );
  my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
  my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
  return ( $filename, $fh, $ser );
}

#
# _generate_file_name(data type, level,      section)
#                     dna        toplevel    undef
#                     dna        chromosome  6

sub _generate_file_name {
  my ( $self, $data_type, $level, $section ) = @_;    # level & section are optional

  # File name format looks like:
  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
  # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
  #      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
  #      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
  my @name_bits;
  push @name_bits, $self->web_name();
  push @name_bits, $self->assembly();
  push @name_bits, $self->param('release');
  push @name_bits, lc($data_type);
  push @name_bits, $level if $level;
  push @name_bits, $section if $section;
  push @name_bits, 'fa';

  my $file_name = join( '.', @name_bits );

  $data_type =~ s/_[rs]m$//; # remove repeatmask or softmask designation from path component
  my $data_type_dir = $self->fasta_path($data_type);
  $self->_remove_files_from_dir($data_type_dir);
  return File::Spec->catfile( $data_type_dir, $file_name );
}

# Attempts to remove any generated files previously present for the instance
# of the Process
sub _remove_files_from_dir {
  my ($self, $dir) = @_;
  if(! $self->param('remove_files_from_dir')->{$dir}) {
    $self->unlink_all_files($dir);
    $self->param('remove_files_from_dir')->{$dir} = 1;
  }
  return;
}

##Logic used to generate the expected format for a FASTA header
sub _create_display_id {
  my ($self, $object, $seq, $type) = @_;

  my $stable_id;
  my $location;
  my $decoded_type;
  my $decoded_status;
  my %attributes;

  if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
    $attributes{transcript_biotype} = $object->biotype();

    #If pred transcript then no gene but type & status are different
    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      $decoded_type = $type;
      $decoded_status = lc($object->analysis()->logic_name());
      if($type eq 'pep') {
        $attributes{transcript} = $stable_id;
      }
    }
    #Must be a real "transcript"
    else {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      my $gene = $object->get_Gene();
      $attributes{gene} = $gene->stable_id();
      $attributes{gene_biotype} = $gene->biotype();

      #If ncRNA then we set type to the logic name and status to the gene's biotype (taken from original script)
      if($type eq 'ncrna') {
        $decoded_type = lc($object->analysis()->logic_name());
        $decoded_status = $gene->biotype();
      }
      elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
        $decoded_type = $type;
        $decoded_status = 'pseudogene';
      }
      #Otherwise use type & the transcript's status
      else {
        $decoded_type = $type;
        $decoded_status = lc($object->status());
      }
    }
  }
  #If it's a translation then grab the transcript and gene then set accordingly
  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
    my $transcript = $object->transcript();
    my $gene = $transcript->get_Gene();
    $stable_id = $object->stable_id();
    $location = $transcript->feature_Slice()->name();
    %attributes = (
      gene => $gene->stable_id(),
      gene_biotype => $gene->biotype(),
      transcript => $transcript->stable_id(),
      transcript_biotype => $transcript->biotype()
    );
    $decoded_type = 'pep';
    $decoded_status = lc($transcript->status());
  }
  else {
    throw sprintf( 'Do not understand how to format a display_id for type "%s"',
      ref($object) );
  }

  my $attr_str = join(q{ },
    map { $_.':'.$attributes{$_} }
    grep { exists $attributes{$_} }
    qw/gene transcript gene_biotype transcript_biotype/);

  my $format = '%s %s:%s %s %s';

  my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
  $seq->display_id($id);

  return;
}
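# For a protein-coding transcript the header built above would resemble the
# following (identifiers and coordinates are illustrative only):
#   ENST00000000001 cdna:known chromosome:GRCh37:1:1000:2000:1 gene:ENSG00000000001 gene_biotype:protein_coding transcript_biotype:protein_coding
# i.e. the '%s %s:%s %s %s' format above filled with stable_id, type:status,
# location and the gene/transcript attribute string.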

sub _custom_header {
  my ($self) = @_;
  return sub {
    my $slice = shift;
    if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
      return $slice->display_id();
    }

    #A RepeatMaskedSlice means masked data; soft_mask() true means it was softmasked
    my $dna_type = 'dna';
    if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
      $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
    }

    my $id = $slice->seq_region_name;
    my $idtype = $slice->coord_system->name;
    my $location = $slice->name;
    my $ref = $slice->assembly_exception_type();
    my $header = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);
    return $header;
  };
}
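# A genomic header produced by this formatter would resemble the following
# (assembly name and length are illustrative):
#   1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 REF
# where the final field comes from assembly_exception_type() (e.g. REF for
# reference regions).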

sub _final_filename {
  my ($self, $filename) = @_;
  return $filename if $filename =~ /\.gz$/;
  return $filename.'.gz';
}

sub assembly_accession {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.accession');
}

sub assembly_accession_type {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.web_accession_type');
}

sub _create_README {

  #Text for readme files

  my %text = (
    dna => <<'README',
#######################
Fasta DNA dumps
#######################

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz

<species>:   The systematic name of the species.
<assembly>:  The assembly build name.
<release>:   The release number.
<sequence type>:
 * 'dna' - unmasked genomic DNA sequences.
 * 'dna_rm' - masked genomic DNA. Interspersed repeats and low
   complexity regions are detected with the RepeatMasker tool and masked
   by replacing repeats with 'N's.
 * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
   have been replaced with lowercased versions of their nucleic base
<id type>: One of the following:
 * 'chromosome' - The top-level coordinate system in most species in Ensembl
 * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
 * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
    -- 'scaffold' - Larger sequence contigs from the assembly of shorter
       sequencing reads (often from whole genome shotgun, WGS) which could
       not yet be assembled into chromosomes. Often more genome sequencing
       is needed to narrow gaps and establish a tiling path.
    -- 'chunk' - While contig sequences can be assembled into large entities,
       they sometimes have to be artificially broken down into smaller entities
       called 'chunks'. This is due to limitations in the annotation
       pipeline and the finite record size imposed by MySQL which stores the
       sequence and annotation information.
    -- 'clone' - In general this is the smallest sequence entity. It is often
       identical to the sequence of one BAC clone, or sequence region
       of one BAC clone which forms the tiling path.
<id>:     The actual sequence identifier. Depending on the <id type> the <id>
          could represent the name of a chromosome, a scaffold, a contig, a clone ..
          Field is empty for seqlevel files
fa: All files in these directories represent FASTA database files
gz: All files are compressed with GNU Zip for storage efficiency.

-----------
TOPLEVEL
-----------
These files contain the full sequence of the assembly in FASTA format.
They contain one chromosome per file.

EXAMPLES
  The genomic sequence of human chromosome 1:
    Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz

  The masked version of the genome sequence on human chromosome 1
  (contains '_rm' or '_sm' in the name):
    Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
    Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz

  Non-chromosomal assembly sequences:
  e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
    Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
    Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
    Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz


--------------
SPECIAL CASES
--------------
Some chromosomes have alternate haplotypes which are presented in files with
the haplotype sequence only:
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
    Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz


Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
on the Y is annotated. By definition the PAR region is identical on the
X and Y chromosome. We provide this sequence in the following way.
    -- The Y chromosome file contains the complete sequence of the PAR:
       Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
    -- The top-level file includes only the unique portion of Y (i.e. the PAR
       region is N-masked):
       Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz

README

    pep => <<'README',
####################
Fasta Peptide dumps
####################
These files hold the protein translations of Ensembl gene predictions.

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: pep for peptide sequences
<status>
 * 'pep.all' - the super-set of all translations resulting from Ensembl known
    or novel gene predictions.
 * 'pep.abinitio' - translations resulting from 'ab initio' gene
    prediction algorithms such as SNAP and GENSCAN. In general, all
    'ab initio' predictions are based solely on the genomic sequence and
    not any other experimental evidence. Therefore, not all GENSCAN
    or SNAP predictions represent biologically real proteins.
fa: All files in these directories represent FASTA database files
gz: All files are compressed with GNU Zip for storage efficiency.

EXAMPLES (Note: Most species do not have sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.pep.all.fa.gz
      contains all known and novel peptides
    Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
      contains all ab initio predicted peptides

Difference between known and novel
----------------------------------
Protein models that can be mapped to species-specific entries in
Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
known genes. Those that cannot be mapped are called novel
(e.g. genes predicted on the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA
database file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT

Example of Ensembl Peptide header:

>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^   ^     ^                                   ^                    ^                          ^                           ^
 ID              |   |     LOCATION                            GENE: stable gene ID |                          GENE: gene biotype          TRANSCRIPT: transcript biotype
                 |   STATUS                                                         TRANSCRIPT: stable transcript ID
                 SEQTYPE

README

    cdna => <<'README',
##################
Fasta cDNA dumps
##################

These files hold the cDNA sequences corresponding to Ensembl gene predictions.

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: cdna for cDNA sequences
<status>
 * 'cdna.all' - the super-set of all transcripts resulting from
    Ensembl known, novel and pseudo gene predictions (see more below).
 * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
    algorithms such as SNAP and GENSCAN. In general all 'ab initio'
    predictions are based solely on the genomic sequence and do not
    use other experimental evidence. Therefore, not all GENSCAN or SNAP
    cDNA predictions represent biologically real cDNAs.
    Consequently, these predictions should be used with care.

EXAMPLES (Note: Most species do not have sequences for each different <status>)
  for Human:
    Homo_sapiens.NCBI36.40.cdna.all.fa.gz
      cDNA sequences for all transcripts: known, novel and pseudo
    Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
      cDNA sequences for 'ab initio' prediction transcripts.

Difference between known and novel transcripts
-----------------------------------------------
Transcript or protein models that can be mapped to species-specific entries
in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
Those that cannot be mapped are called novel genes (e.g. genes predicted on
the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE

Example of an Ensembl cDNA header:

>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
 ^               ^    ^     ^                                       ^                    ^                           ^
 ID              |    |     LOCATION                                GENE: gene stable ID GENE: gene biotype          TRANSCRIPT: transcript biotype
                 |    STATUS
                 SEQTYPE


README

    ncrna => <<'README',
##################
Fasta RNA dumps
##################

These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).

------------
FILE NAMES
------------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: ncrna for non-coding RNA sequences

EXAMPLES
  for Human:
    Homo_sapiens.NCBI36.40.ncrna.fa.gz
      Transcript sequences for all ncRNA gene types.


-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
 ^               ^     ^     ^                                          ^                    ^                  ^
 ID              |     |     LOCATION                                   GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype
                 |     STATUS
                 SEQTYPE


README
  );

  my $warning = <<'README';
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

README

  my ( $self, $data_type ) = @_;
  my $base_path = $self->fasta_path();
  my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
  my $accession = $self->assembly_accession();
  my $txt = $text{$data_type};
  throw "Cannot find README text for type $data_type" unless $txt;

  #Add accession information if it is available
  if($data_type eq 'dna' && $accession) {
    my $type = $self->assembly_accession_type();
    $warning .= <<EXTRA;
The genome assembly represented here corresponds to $type
$accession

EXTRA
  }

  work_with_file($path, 'w', sub {
    my ($fh) = @_;
    print $fh $warning;
    print $fh $txt;
    return;
  });
  return;
}

1;