=pod

=head1 LICENSE

Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Pipeline::FASTA::DumpFile

=head1 DESCRIPTION

The main workhorse of the FASTA dumping pipeline. This module has two
functions:

=over 8

=item 1 - Dumping genomic DNA sequences in a memory-efficient manner in unmasked, soft-masked & hard-masked formats

=item 2 - Dumping genes as cDNA, protein and ncRNA transcripts (ab initio predictions included)

=back

The module is responsible for creating the filenames of these target
files, retrieving the data from the database and formatting the FASTA
headers. It is also responsible for creating the README files describing
each type of dump produced. The final files are all gzipped at normal
levels of compression.

B<N.B.> This code will remove any files already found in the target directory
on its first run, as it assumes all data will be dumped in the one process. It
is selective about its directory, meaning a rerun of the DNA dumps will not
cause the protein/cDNA files to be removed.

Allowed parameters (see the example after this list) are:

=over 8

=item species - The species to dump

=item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed

=item release - A required parameter giving the Ensembl release we are dumping for

=item db_types - Array reference of the database groups to use. Defaults to core

=item process_logic_names - Array reference of transcript logic names to restrict processing to (only produce dumps for these). Applied before skip_logic_names

=item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)

=item base_path - The base of the dumps

=item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
                       process when formatting FASTA files. Normally you
                       should not need to change this

=item allow_appending - If the same file name is generated we will
                        append into that file rather than overwriting

=item overwrite_files - If the same file name is generated we will overwrite
                        that file rather than appending to it

=back
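
As an illustration only (the values shown here are hypothetical and not taken
from any real pipeline configuration), a parameter hash given to this module
might look like:

  my $parameters = {
    species            => 'homo_sapiens',
    sequence_type_list => [qw/dna cdna ncrna/],
    release            => 67,
    db_types           => ['core'],
    base_path          => '/my/dump/directory',
    allow_appending    => 1,
    overwrite_files    => 0,
  };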

=cut

package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;

use strict;
use warnings;

use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);

use File::Spec;
use IO::Compress::Gzip qw/gzip $GzipError/;
use IO::File;
use Bio::EnsEMBL::PaddedSlice;
use Bio::EnsEMBL::Utils::BiotypeMapper;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use Bio::EnsEMBL::Utils::IO::FASTASerializer;
use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;

my $DNA_INDEXING_FLOW = 1;
my $PEPTIDE_INDEXING_FLOW = 2;
my $GENE_INDEXING_FLOW = 3;

sub param_defaults {
  my ($self) = @_;
  return {
    #user configurable
    allow_appending => 1,
    overwrite_files => 0,

    dna_chunk_size => 17000,

    skip_logic_names => [],
    process_logic_names => [],

    #DON'T MESS
    #used to track if we need to reopen a file in append mode or not
    generated_files => {},
    remove_files_from_dir => {},
    dataflows => []
  };
}

sub fetch_input {
  my ($self) = @_;

  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%sequence_types);

  my $dba = $self->get_DBAdaptor();
  my $analyses = $dba->get_MetaContainer()->list_value_by_key('repeat.analysis');
  $self->param('analyses', $analyses);

  my $types = $self->param('db_types');
  $types = ['core'] unless $types;
  $self->param('db_types', $types);

  my %skip_logic_names = map { $_ => 1 } @{$self->param('skip_logic_names')};
  $self->param('skip_logic', \%skip_logic_names);
  $self->param('skip_logic_active', 1) if @{$self->param('skip_logic_names')};
  my %process_logic_names = map { $_ => 1 } @{$self->param('process_logic_names')};
  $self->param('process_logic', \%process_logic_names);
  $self->param('process_logic_active', 1) if @{$self->param('process_logic_names')};

  return;
}

sub run {
  my ($self) = @_;
  my $types = $self->param('db_types');
  foreach my $type (@{$types}) {
    my $dba = $self->get_DBAdaptor($type);
    if(! $dba) {
      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $type);
      next;
    }
    $self->run_type($type);
  }
  return;
}

sub write_output {
  my ($self) = @_;
  my $dataflows = $self->param('dataflows');
  foreach my $flow (@{$dataflows}) {
    $self->dataflow_output_id(@{$flow});
  }
  return;
}

sub run_type {
  my ($self, $type) = @_;

  my $species = $self->param('species');
  my $sequence_types = $self->param('sequence_types');

  # dump file for each type on a per slice basis
  # types are dna, cdna, peptide, ncRNA

  #Only run if we are told to & the current DBA is the same as the attached DNADB (checked by comparing the stringified references)
  my $dba = $self->get_DBAdaptor($type);
  if ( $sequence_types->{dna} && $dba eq $dba->dnadb() ) {
    $self->info( "Starting dna dump for " . $species );
    $self->_dump_dna($type);
    $self->_create_README('dna');
  }

  if ( $sequence_types->{cdna} ) { #includes peptides whether you like it or not
    $self->info( "Starting cdna dump for " . $species );
    my ($transcripts, $peptide) = $self->_dump_transcripts('cdna', $type);

    $self->info( "Starting prediction transcript dumps for " . $species );
    my ($pred_transcripts, $pred_proteins) = $self->_dump_prediction_transcripts($type);

    $self->_create_README('cdna') if $transcripts || $pred_transcripts;
    $self->_create_README('pep') if $peptide || $pred_proteins;
  }

  if ( $sequence_types->{ncrna} ) {
    $self->info( "Starting ncRNA dump for " . $species );
    my ($ncrna_transcripts) = $self->_dump_transcripts('ncrna', $type);
    $self->_create_README('ncrna') if $ncrna_transcripts;
  }

  $self->cleanup_DBAdaptor($type);
}

# Dump entire sequence, also dump data into chromosome files as appropriate
sub _dump_dna {
  my ($self, $type) = @_;

  my @chromosomes;
  my @non_chromosomes;
  my $filter_human = 1;
  foreach my $s (@{$self->get_Slices($type, $filter_human)}) {
    my $chr = $s->is_chromosome();
    push(@chromosomes, $s) if $chr;
    push(@non_chromosomes, $s) if ! $chr;
  }

  ############ NON CHROMOSOME WORK
  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
  if(@non_chromosomes) {
    my ( $non_specific_file, $non_specific_fh, $other_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
    my ( $rm_non_specific_file, $rm_non_specific_fh, $other_rm_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );
    foreach my $s (@non_chromosomes) {
      $self->_dump_slice($s, $other_serializer, $other_rm_serializer);
    }
    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_non_specific_fh, $rm_non_specific_file, 1);
    my ($hard_mask_fh, $hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_non_specific_file, $rm_non_specific_fh);

    $self->tidy_file_handle( $non_specific_fh, $non_specific_file );
    $self->tidy_file_handle( $rm_non_specific_fh, $rm_non_specific_file );
    $self->tidy_file_handle( $hard_mask_fh, $hard_mask_file );
    $self->info('Dumped non-chromosomes');
  }

  ############ CHROMOSOME WORK
  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
  foreach my $s (@chromosomes) {
    my ( $chromo_file_name, $chromo_fh, $chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna', 'chromosome',
                                         $s->seq_region_name(), undef );
    # repeat masked data too
    my ( $rm_chromo_file_name, $rm_chromo_fh, $rm_chromo_serializer ) =
      $self->_generate_fasta_serializer( 'dna_sm', 'chromosome',
                                         $s->seq_region_name(), undef );

    $self->_dump_slice($s, $chromo_serializer, $rm_chromo_serializer);

    #Quick close of the SM FH to flush all data out to disk; skip gzipping & leave that to the next call
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name, 1);
    my ($chromo_hard_mask_fh, $chromo_hard_mask_file) = $self->_convert_softmask_to_hardmask($rm_chromo_file_name, $rm_chromo_fh);

    $self->tidy_file_handle($chromo_fh, $chromo_file_name);
    $self->tidy_file_handle($rm_chromo_fh, $rm_chromo_file_name);
    $self->tidy_file_handle($chromo_hard_mask_fh, $chromo_hard_mask_file);
  }
  $self->info("Dumped chromosomes");

  #input_id
  push(@{$self->param('dataflows')}, [{ data_type => 'dna', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_sm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);
  push(@{$self->param('dataflows')}, [{ data_type => 'dna_rm', species => $self->param('species') }, $DNA_INDEXING_FLOW]);

  return;
}

sub _dump_slice {
  my ($self, $s, $serialiser, $rm_serialiser) = @_;

  my $analyses = $self->param('analyses');

  my $chr = $s->is_chromosome();
  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
  $self->info(' Slice is a chromosome') if $chr;
  $self->info(' Slice is non-chromosomal') if ! $chr;

  # Make a padded slice (to automatically pad with N's outside of known regions)
  # and make a repeat-masked slice and then pad that too.
  my $padded_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s);
  $serialiser->print_Seq($padded_slice);

  my $soft_mask = 1;
  my $masked_slice = $s->get_repeatmasked_seq($analyses, $soft_mask);
  my $padded_masked_slice = Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked_slice);
  $rm_serialiser->print_Seq($padded_masked_slice);

  return;
}

#Assumes we are working with un-compressed files
sub _convert_softmask_to_hardmask {
  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
  if(! -f $soft_mask_file) {
    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
    return;
  }
  my $hard_mask_file = $soft_mask_file;
  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
  my $hm_fh = IO::File->new($hard_mask_file, 'w') or throw "Cannot open $hard_mask_file for writing: $!";
  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
  work_with_file($soft_mask_file, 'r', sub {
    my ($sm_fh) = @_;
    while(my $line = <$sm_fh>) {
      if(index($line, '>') == 0) {
        $line =~ s/dna_sm/dna_rm/;
      }
      else {
        #hard-mask: replace the lowercased (soft-masked) bases with N
        $line =~ tr/acgtn/N/;
      }
      print $hm_fh $line;
    }
    return;
  });
  return ($hm_fh, $hard_mask_file);
}

sub _dump_transcripts {
  my ($self, $transcript_type, $type) = @_;

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
  my ( $filename, $fh, $transcript_serializer ) =
    $self->_generate_fasta_serializer( $transcript_type, $transcript_level );

  my ( $peptide_filename, $pep_fh, $peptide_serializer );

  # some cDNAs are translated, make a file to receive them.
  if ( $transcript_type eq 'cdna' ) {
    ( $peptide_filename, $pep_fh, $peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'all' );
  }

  # work out what biotypes correspond to $transcript_type
  my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
  my $biotypes_list = $biotype_mapper->group_members($transcript_type);

  my $dba = $self->get_DBAdaptor($type);
  my $gene_adaptor = $dba->get_GeneAdaptor();

  # get all the transcripts that are $transcript_type e.g. cdna, ncrna
  foreach my $biotype ( @{$biotypes_list} ) {
    my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
    $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
    while ( my $gene = shift @{$gene_list} ) {
      $self->fine( 'Gene %s', $gene->display_id );
      my $transcript_list = $gene->get_all_Transcripts();
      foreach my $transcript ( @{$transcript_list} ) {
        $self->fine( 'Transcript %s', $transcript->display_id );
        next unless $self->ok_to_process_logic_name($transcript);

        # foreach transcript of all genes with biotypes classed as cdna
        my $transcript_seq = $transcript->seq();
        $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
        $transcript_serializer->print_Seq($transcript_seq);
        if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) {
          my $translation = $transcript->translation();
          if ($translation) {
            my $translation_seq = $transcript->translate();
            $self->_create_display_id($translation, $translation_seq, $transcript_type);
            $peptide_serializer->print_Seq($translation_seq);

            $has_protein_data = 1;
          }
        }

        $has_transcript_data = 1;
      }
    }
  }

  $self->tidy_file_handle( $fh, $filename );
  if ( $transcript_type eq 'cdna' ) {
    $self->tidy_file_handle( $pep_fh, $peptide_filename );
  }

  if($has_protein_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
  }
  if($has_transcript_data) {
    push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
  }

  return ($has_transcript_data, $has_protein_data);
}

# Dump prediction transcripts and peptides. All predicted transcripts have translations
sub _dump_prediction_transcripts {
  my ($self, $type) = @_;
  my $dba = $self->get_DBAdaptor($type);

  my $has_transcript_data = 0;
  my $has_protein_data = 0;

  my $prediction_transcript_adaptor = $dba->get_PredictionTranscriptAdaptor();
  my $transcript_list = $prediction_transcript_adaptor->fetch_all();
  my $count = scalar(@{$transcript_list});
  $self->info('Found %d prediction transcript(s)', $count);
  if($count) {
    my ( $abinitio_filename, $fh, $abinitio_serializer ) =
      $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
    my ( $abinitio_peptide_filename, $pep_fh, $abinitio_peptide_serializer ) =
      $self->_generate_fasta_serializer( 'pep', 'abinitio' );

    while ( my $transcript = shift @{$transcript_list} ) {
      next unless $self->ok_to_process_logic_name($transcript);

      $has_transcript_data = 1;
      my $transcript_seq = $transcript->seq();
      $self->_create_display_id( $transcript, $transcript_seq, 'cdna' );
      $abinitio_serializer->print_Seq($transcript_seq);

      my $translation_seq = $transcript->translate();
      if ( $transcript->translation() ) {
        $has_protein_data = 1;
        $self->_create_display_id( $transcript, $translation_seq, 'pep' );
        $abinitio_peptide_serializer->print_Seq($translation_seq);
      }
    }

    $self->tidy_file_handle( $fh, $abinitio_filename );
    $self->tidy_file_handle( $pep_fh, $abinitio_peptide_filename );

    if($has_protein_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
    }
    if($has_transcript_data) {
      push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($abinitio_filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
    }
  }

  return ($has_transcript_data, $has_protein_data);
}

# We can optionally skip the Gzip process & just delegate to the super class
# for its cleanup routines, which only work with an open file handle. Therefore
# only pass it onto the super implementation *if* the handle was open.
# Also only Gzip if the source file exists (it could have been unlinked by
# an earlier call)
sub tidy_file_handle {
  my ($self, $fh, $path, $no_gzip) = @_;
  if($fh->opened()) {
    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
    return 1 if $tidy;
  }

  return if $no_gzip;   #don't gzip if we were told to skip
  return if ! -f $path; #don't gzip if we had no file

  my $target = $path.".gz";
  $self->info('Gzipping "%s"', $path);
  my %args;
  if($self->param('generated_files')->{$target}) {
    if($self->param('allow_appending')) {
      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
      $args{Append} = 1;
    }
    elsif($self->param('overwrite_files')) {
      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
    }
    else {
      $self->throw("Cannot continue. The file $target has already been created during this session");
    }
  }
  gzip $path => $target, %args or throw "GZip error compressing $path to $target: $GzipError";
  $self->info(' Removing original file from filesystem');
  unlink $path or throw "Could not delete $path: $!";
  $self->info(' Finished');
  $self->param('generated_files')->{$target} = 1;
  return 0;
}

#We assume a transcript is ok to process unless proven otherwise
sub ok_to_process_logic_name {
  my ($self, $transcript) = @_;
  my $ok = 1;
  my $logic_name = $transcript->analysis()->logic_name();
  if($self->param('process_logic_active')) {
    if(! $self->param('process_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  if($self->param('skip_logic_active')) {
    if($self->param('skip_logic')->{$logic_name}) {
      $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $logic_name);
      $ok = 0;
    }
  }
  return $ok;
}

#Generates a FASTA serializer and returns the filename, file handle & serializer instance
sub _generate_fasta_serializer {
  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;
  $header_formatter ||= $self->_custom_header();
  my $chunk = $self->param('dna_chunk_size');
  my $filename = $self->_generate_file_name( $datatype, $level, $section );
  my $fh = IO::File->new($filename, '>') or throw "Cannot open $filename for writing: $!";
  my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh, $header_formatter, $chunk);
  return ( $filename, $fh, $ser );
}

#
# _generate_file_name( data type, level,      section )
#                      dna        toplevel    undef
#                      dna        chromosome  6
#
sub _generate_file_name {
  my ( $self, $data_type, $level, $section ) = @_; #level & section are optional

  # File name format looks like:
  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
  # e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
  #      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
  #      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
  my @name_bits;
  push @name_bits, $self->web_name();
  push @name_bits, $self->assembly();
  push @name_bits, $self->param('release');
  push @name_bits, lc($data_type);
  push @name_bits, $level if $level;
  push @name_bits, $section if $section;
  push @name_bits, 'fa';

  my $file_name = join( '.', @name_bits );

  $data_type =~ s/_[rs]m$//; # remove repeatmask or softmask designation from path component
  my $data_type_dir = $self->fasta_path($data_type);
  $self->_remove_files_from_dir($data_type_dir);
  return File::Spec->catfile( $data_type_dir, $file_name );
}

# Removes any files already present in the given directory. Done at most once
# per directory for this instance of the Process
sub _remove_files_from_dir {
  my ($self, $dir) = @_;
  if(! $self->param('remove_files_from_dir')->{$dir}) {
    $self->unlink_all_files($dir);
    $self->param('remove_files_from_dir')->{$dir} = 1;
  }
  return;
}

##Logic used to generate the expected format for a FASTA header
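#For illustration only (format inferred from the README examples later in this
#file), a translation ends up with a header along the lines of:
#  >ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815 transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding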
sub _create_display_id {
  my ($self, $object, $seq, $type) = @_;

  my $stable_id;
  my $location;
  my $decoded_type;
  my $decoded_status;
  my %attributes;

  if(check_ref( $object, 'Bio::EnsEMBL::Transcript')) {
    $attributes{transcript_biotype} = $object->biotype();

    #If pred transcript then no gene but type & status are different
    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      $decoded_type = $type;
      $decoded_status = lc($object->analysis()->logic_name());
      if($type eq 'pep') {
        $attributes{transcript} = $stable_id;
      }
    }
    #Must be a real "transcript"
    else {
      $stable_id = $object->stable_id();
      $location = $object->feature_Slice()->name();
      my $gene = $object->get_Gene();
      $attributes{gene} = $gene->stable_id();
      $attributes{gene_biotype} = $gene->biotype();

      #If ncRNA then we set type to the logic name and status to the gene's biotype (taken from the original script)
      if($type eq 'ncrna') {
        $decoded_type = lc($object->analysis()->logic_name());
        $decoded_status = $gene->biotype();
      }
      elsif($object->biotype() =~ /pseudogene/i && ! $object->translation()) {
        $decoded_type = $type;
        $decoded_status = 'pseudogene';
      }
      #Otherwise use the given type & the transcript's status
      else {
        $decoded_type = $type;
        $decoded_status = lc($object->status());
      }
    }
  }
  #If it's a translation then grab the transcript and gene then set accordingly
  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
    my $transcript = $object->transcript();
    my $gene = $transcript->get_Gene();
    $stable_id = $object->stable_id();
    $location = $transcript->feature_Slice()->name();
    %attributes = (
      gene => $gene->stable_id(),
      gene_biotype => $gene->biotype(),
      transcript => $transcript->stable_id(),
      transcript_biotype => $transcript->biotype()
    );
    $decoded_type = 'pep';
    $decoded_status = lc($transcript->status());
  }
  else {
    throw sprintf( 'Do not understand how to format a display_id for type "%s"',
                   ref($object) );
  }

  my $attr_str = join(q{ },
    map { $_.':'.$attributes{$_} }
    grep { exists $attributes{$_} }
    qw/gene transcript gene_biotype transcript_biotype/);

  my $format = '%s %s:%s %s %s';

  my $id = sprintf( $format, $stable_id, $decoded_type, $decoded_status, $location, $attr_str);
  $seq->display_id($id);

  return;
}

sub _custom_header {
  my ($self) = @_;
  return sub {
    my $slice = shift;
    if ( !$slice->isa('Bio::EnsEMBL::Slice') ) {
      return $slice->display_id();
    }

    #A RepeatMaskedSlice means masked data; soft_mask() returns true if it was soft-masked
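    #Illustrative only (example values assumed, not taken from this code base):
    #a reference human chromosome 1 slice would produce a header roughly like
    #  >1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 REF
    #i.e. seq region name, DNA type:coord system name, slice name, assembly exception type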
    my $dna_type = 'dna';
    if($slice->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
      $dna_type .= ($slice->soft_mask()) ? '_sm' : '_rm';
    }

    my $id       = $slice->seq_region_name;
    my $idtype   = $slice->coord_system->name;
    my $location = $slice->name;
    my $ref      = $slice->assembly_exception_type();
    my $header   = sprintf('%s %s:%s %s %s', $id, $dna_type, $idtype, $location, $ref);
    return $header;
  };
}

sub _final_filename {
  my ($self, $filename) = @_;
  return $filename if $filename =~ /\.gz$/;
  return $filename.'.gz';
}

sub assembly_accession {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.accession');
}

sub assembly_accession_type {
  my ($self) = @_;
  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  return $mc->single_value_by_key('assembly.web_accession_type');
}

sub _create_README {

  #Text for readme files

  my %text = (
    dna => <<'README',
#######################
Fasta DNA dumps
#######################

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>:
 * 'dna' - unmasked genomic DNA sequences.
 * 'dna_rm' - masked genomic DNA. Interspersed repeats and low
   complexity regions are detected with the RepeatMasker tool and masked
   by replacing repeats with 'N's.
 * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
   have been replaced with lowercased versions of their nucleic base.
<id type>: One of the following:
 * 'chromosome' - The top-level coordinate system in most species in Ensembl
 * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
 * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
   -- 'scaffold' - Larger sequence contigs from the assembly of shorter
      sequencing reads (often from whole genome shotgun, WGS) which could
      not yet be assembled into chromosomes. Often more genome sequencing
      is needed to narrow gaps and establish a tiling path.
   -- 'chunk' - While contig sequences can be assembled into large entities,
      they sometimes have to be artificially broken down into smaller entities
      called 'chunks'. This is due to limitations in the annotation
      pipeline and the finite record size imposed by MySQL which stores the
      sequence and annotation information.
   -- 'clone' - In general this is the smallest sequence entity. It is often
      identical to the sequence of one BAC clone, or the sequence region
      of one BAC clone which forms the tiling path.
<id>: The actual sequence identifier. Depending on the <id type> the <id>
      could represent the name of a chromosome, a scaffold, a contig, a clone ...
      The field is empty for seqlevel files.
fa: All files in these directories represent FASTA database files.
gz: All files are compacted with GNU Zip for storage efficiency.

-----------
TOPLEVEL
-----------
These files contain the full sequence of the assembly in FASTA format.
They contain one chromosome per file.

EXAMPLES
The genomic sequence of human chromosome 1:
  Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz

The masked version of the genome sequence on human chromosome 1
(contains '_rm' or '_sm' in the name):
  Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
  Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz

Non-chromosomal assembly sequences:
e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
  Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
  Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
  Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz


--------------
SPECIAL CASES
--------------
Some chromosomes have alternate haplotypes which are presented in files with
the haplotype sequence only:
  Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
  Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz

Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
on the Y is annotated. By definition the PAR region is identical on the
X and Y chromosome. We provide this sequence in the following way.
  -- The Y chromosome file contains the complete sequence of the PAR:
     Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
  -- The top-level file includes only the unique portion of Y (i.e. the PAR
     region is N-masked):
     Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz

README

    pep => <<'README',
####################
Fasta Peptide dumps
####################
These files hold the protein translations of Ensembl gene predictions.

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: pep for peptide sequences
<status>
 * 'pep.all' - the super-set of all translations resulting from Ensembl known
   or novel gene predictions.
 * 'pep.abinitio' - translations resulting from 'ab initio' gene
   prediction algorithms such as SNAP and GENSCAN. In general, all
   'ab initio' predictions are based solely on the genomic sequence and
   not on any other experimental evidence. Therefore, not all GENSCAN
   or SNAP predictions represent biologically real proteins.
fa: All files in these directories represent FASTA database files.
gz: All files are compacted with GNU Zip for storage efficiency.

EXAMPLES (Note: Most species do not have sequences for every different <status>)
for Human:
  Homo_sapiens.NCBI36.40.pep.all.fa.gz
    contains all known and novel peptides
  Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
    contains all ab initio predicted peptides

Difference between known and novel
----------------------------------
Protein models that can be mapped to species-specific entries in
Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
known genes. Those that cannot be mapped are called novel
(e.g. genes predicted on the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA
database file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT

Example of an Ensembl Peptide header:

>ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815 transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding

Where:
  ENSP00000328693                      is the peptide stable ID (ID)
  pep                                  is the sequence type (SEQTYPE)
  novel                                is the status (STATUS)
  chromosome:NCBI35:1:904515:910768:1  is the location (LOCATION)
  gene:ENSG00000158815                 is the stable gene ID (GENE)
  transcript:ENST00000328693           is the stable transcript ID (TRANSCRIPT)
  gene_biotype:protein_coding          is the gene biotype
  transcript_biotype:protein_coding    is the transcript biotype

README

    cdna => <<'README',
##################
Fasta cDNA dumps
##################

These files hold the cDNA sequences corresponding to Ensembl gene predictions.

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.<status>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: cdna for cDNA sequences
<status>
 * 'cdna.all' - the super-set of all transcripts resulting from
   Ensembl known, novel and pseudo gene predictions (see more below).
 * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
   algorithms such as SNAP and GENSCAN. In general all 'ab initio'
   predictions are based solely on the genomic sequence and do not
   use other experimental evidence. Therefore, not all GENSCAN or SNAP
   cDNA predictions represent biologically real cDNAs.
   Consequently, these predictions should be used with care.

EXAMPLES (Note: Most species do not have sequences for every different <status>)
for Human:
  Homo_sapiens.NCBI36.40.cdna.all.fa.gz
    cDNA sequences for all transcripts: known, novel and pseudo
  Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
    cDNA sequences for 'ab initio' prediction transcripts.

Difference between known and novel transcripts
----------------------------------------------
Transcript or protein models that can be mapped to species-specific entries
in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
Those that cannot be mapped are called novel genes (e.g. genes predicted on
the basis of evidence from closely related species).

For models annotated by HAVANA the status is set manually. Models that have
an HGNC name are referred to as known and the remaining models are referred to
as novel.

-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ID SEQTYPE:STATUS LOCATION GENE

Example of an Ensembl cDNA header:

>ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding

Where:
  ENST00000289823                          is the transcript stable ID (ID)
  cdna                                     is the sequence type (SEQTYPE)
  known                                    is the status (STATUS)
  chromosome:NCBI35:8:21922367:21927699:1  is the location (LOCATION)
  gene:ENSG00000158815                     is the stable gene ID (GENE)
  gene_biotype:protein_coding              is the gene biotype
  transcript_biotype:protein_coding        is the transcript biotype

README

    ncrna => <<'README',
##################
Fasta RNA dumps
##################

These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).

-----------
FILE NAMES
-----------
The files are consistently named following this pattern:
<species>.<assembly>.<release>.<sequence type>.fa.gz

<species>:       The systematic name of the species.
<assembly>:      The assembly build name.
<release>:       The release number.
<sequence type>: ncrna for non-coding RNA sequences

EXAMPLES
for Human:
  Homo_sapiens.NCBI36.40.ncrna.fa.gz
    Transcript sequences for all ncRNA gene types.


-------------------------------
FASTA Sequence Header Lines
-------------------------------
The FASTA sequence header lines are designed to be consistent across
all types of Ensembl FASTA sequences. This gives enough information
for the sequence to be identified outside the context of the FASTA file.

General format:

>ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA

Where:
  ENST00000347977                             is the transcript stable ID (ID)
  ncrna                                       is the sequence type (SEQTYPE)
  miRNA                                       is the status (STATUS)
  chromosome:NCBI35:1:217347790:217347874:-1  is the location (LOCATION)
  gene:ENSG00000195671                        is the stable gene ID (GENE)
  gene_biotype:ncRNA                          is the gene biotype
  transcript_biotype:ncRNA                    is the transcript biotype


README
  );

  my $warning = <<'README';
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

README

  my ( $self, $data_type ) = @_;
  my $base_path = $self->fasta_path();
  my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
  my $accession = $self->assembly_accession();
  my $txt = $text{$data_type};
  throw "Cannot find README text for type $data_type" unless $txt;

  #Add accession information if it is available
  if($data_type eq 'dna' && $accession) {
    my $type = $self->assembly_accession_type();
    $warning .= <<EXTRA;
The genome assembly represented here corresponds to $type
$accession

EXTRA
  }

  work_with_file($path, 'w', sub {
    my ($fh) = @_;
    print $fh $warning;
    print $fh $txt;
    return;
  });
  return;
}

1;