0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Pipeline::Flatfile::DumpFile
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 The main workhorse of the Flatfile dumping pipeline.
|
|
28
|
|
The module is responsible for creating the filenames of the target
files, taking data from the database and formatting the flat file
headers. The final files are all gzipped at normal levels of compression.
|
|
32
|
|
33 Allowed parameters are:
|
|
34
|
|
35 =over 8
|
|
36
|
|
37 =item species - The species to dump
|
|
38
|
|
39 =item base_path - The base of the dumps
|
|
40
|
|
41 =item release - The current release we are emitting
|
|
42
|
|
43 =item type - The type of data we are emitting. Should be embl or genbank
|
|
44
|
|
45 =back
|
|
46
|
|
47 =cut
|
|
48
|
|
49 package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile;
|
|
50
|
|
use strict;
use warnings;

use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base);

use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::SeqDumper;
use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file work_with_file/;
use File::Path qw/rmtree/;
use File::Spec;
|
|
60
|
|
# Default parameters for this module.
#
# Returns a hash reference with a single key, 'supported_types', whose
# value is a lookup of the flat file formats accepted for the 'type'
# parameter (see fetch_input).
sub param_defaults {
  my ($self) = @_;
  my %supported_types = map { $_ => 1 } qw/embl genbank/;
  return { supported_types => \%supported_types };
}
|
|
67
|
|
# Validates the parameters before run() is called.
#
# Throws if 'type' is missing or is not a key of the 'supported_types'
# parameter, or if any of 'species', 'release' or 'base_path' is not set
# to a true value. Returns nothing.
sub fetch_input {
  my ($self) = @_;

  my $type = $self->param('type');
  if (! $type) {
    throw("No type specified");
  }
  if (! $self->param('supported_types')->{$type}) {
    throw("Unsupported type '$type' specified");
  }

  # Checked in this order so the first missing parameter is the one reported
  foreach my $required (qw/species release base_path/) {
    if (! $self->param($required)) {
      throw("Need a $required");
    }
  }

  return;
}
|
|
81
|
|
# Dumps every slice returned by get_Slices() in the requested format.
#
# Steps:
#   1. Remove the output directory (data_path()) if it already exists.
#   2. Split the slices into chromosomes and everything else using
#      is_chromosome().
#   3. Write all non-chromosomal slices into one 'nonchromosomal' file.
#   4. Write each chromosome slice to a file named after its coordinate
#      system and sequence region, appending if that file already exists.
#   5. Write the README.
#
# The dump method called on the SeqDumper is "dump_<type>", where <type>
# is the 'type' parameter. Returns nothing.
sub run {
  my ($self) = @_;

  my $root = $self->data_path();
  if (-d $root) {
    $self->info('Directory "%s" already exists; removing', $root);
    rmtree($root);
  }

  my $dump_method = 'dump_' . $self->param('type');
  my $dumper = $self->_seq_dumper();

  my (@chromosomes, @non_chromosomes);
  foreach my $slice (@{$self->get_Slices()}) {
    if ($slice->is_chromosome()) {
      push(@chromosomes, $slice);
    }
    else {
      push(@non_chromosomes, $slice);
    }
  }

  if (! @non_chromosomes) {
    $self->info('Did not find any non-chromosomal data');
  }
  else {
    my $path = $self->_generate_file_name('nonchromosomal');
    $self->info('Dumping non-chromosomal data to %s', $path);
    # All non-chromosomal slices share a single output file
    my $write_all = sub {
      my ($fh) = @_;
      foreach my $region (@non_chromosomes) {
        $self->fine('Dumping non-chromosomal %s', $region->name());
        $dumper->$dump_method($region, $fh);
      }
      return;
    };
    gz_work_with_file($path, 'w', $write_all);
  }

  foreach my $chromosome (@chromosomes) {
    $self->fine('Dumping chromosome %s', $chromosome->name());
    my $path = $self->_generate_file_name(
      $chromosome->coord_system_name(),
      $chromosome->seq_region_name()
    );

    # A file can only already exist here if an earlier slice in this loop
    # produced the same name; in that case ask for the output to be appended.
    my $options = {};
    if (-f $path) {
      $self->fine('Path "%s" already exists; appending', $path);
      $options->{Append} = 1;
    }

    my $write_one = sub {
      my ($fh) = @_;
      $dumper->$dump_method($chromosome, $fh);
      return;
    };
    gz_work_with_file($path, 'w', $write_one, $options);
  }

  $self->_create_README();

  return;
}
|
|
138
|
|
# Creates the Bio::EnsEMBL::Utils::SeqDumper used for all dumps, with the
# similarity, genscan, variation and repeat feature types disabled.
#
# Returns the configured SeqDumper instance.
sub _seq_dumper {
  my ($self) = @_;
  my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
  foreach my $feature_type (qw/similarity genscan variation repeat/) {
    $dumper->disable_feature_type($feature_type);
  }
  return $dumper;
}
|
|
148
|
|
# Builds the full path of a dump file inside data_path().
#
# File name format looks like:
#   <species>.<assembly>.<release>.<section.name|section>.dat.gz
# e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
#      Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
#
# Arguments:
#   $section - optional section label (run() passes 'nonchromosomal' or a
#              coordinate system name)
#   $name    - optional name within the section (run() passes the
#              sequence region name)
#
# Returns the file path as a string.
sub _generate_file_name {
  my ($self, $section, $name) = @_;

  my @name_bits = (
    $self->web_name(),
    $self->assembly(),
    $self->param('release'),
  );

  # Test for "defined and non-empty" rather than plain truth so that a
  # sequence region literally named "0" still appears in the file name
  # instead of being dropped.
  foreach my $optional_bit ($section, $name) {
    next unless defined $optional_bit && length $optional_bit;
    push @name_bits, $optional_bit;
  }
  push @name_bits, 'dat', 'gz';

  my $file_name = join('.', @name_bits);
  return File::Spec->catfile($self->data_path(), $file_name);
}
|
|
168
|
|
# Writes a README file into data_path() describing the dumps.
#
# The text is a fixed template with the species' scientific name and the
# upper-cased 'type' parameter interpolated into it. Returns nothing.
sub _create_README {
  my ($self) = @_;
  my $species = $self->scientific_name();
  my $format = uc($self->param('type'));

  # Interpolating heredoc: $species and $format are substituted below
  my $readme = <<"README";
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

-----------------------
$format FLATFILE DUMPS
-----------------------
This directory contains $species $format flatfile dumps. To ease
downloading of the files, the $format format entries are bundled
into groups of chromosomes and non-chromosomal regions.
All files are then compacted with gzip.

Ensembl provides an automatic reannotation of $species genomic data.
These data will be dumped in a number of forms - one of them being
$format flat files. As the annotation of this form comes from Ensembl,
and not the original sequence entry, the two annotations are
likely to be different.

$format flat file format dumping provides all the confirmed protein coding
genes known by Ensembl. Considerably more information is stored in Ensembl:
the flat file just gives a representation which is compatible with
existing tools.

The main body of the entry gives the same information as is in the main
$format flat file entry.

* ID - the $format id
* AC - the EMBL/GenBank/DDBJ accession number (only the primary
accession number used)
* SV - The accession.version pair which gives the exact reference to
a particular sequence
* CC - comment lines to help you interpret the entry

Currently the following features are dumped into the feature table of
the Ensembl entry:

* Transcripts as CDS entries. Each transcript has the following
attributes attached
o Transcript id - a stable id, which Ensembl will attempt to
preserve as sensibly as possible during updates of the data
o Gene id - indication of the gene that this transcript belongs
to. gene ids are stable and preserved as sensibly as possible
during updates of the data
o Translation - the peptide translation of the transcript.
* Exons as exon entries. Each exon has the following information
o Exon id. The exon id is stable and preserved as sensibly
as possible during sequence updates
o start_phase. The phase of the splice site at the 5' end
of the exon. Phase 0 means between two codons, phase 1
means between the first and the second base of the codon
(meaning that there are 2 bases until the reading frame of
the exon) and phase 2 means between the second and the third
base of the codon (one base until the reading frame starts).
o end_phase. The phase of the splice site at the 3' end of the
exon: same definition as above (though of course, being end_phase,
the position relative to the exon's reading frame is different
for phase 1 and 2).

We are considering other information that should be made dumpable. In
general we would prefer people to use database access over flat file
access if you want to do something serious with the data.

README

  my $readme_path = File::Spec->catfile($self->data_path(), 'README');
  my $write_readme = sub {
    my ($fh) = @_;
    print {$fh} $readme;
    return;
  };
  work_with_file($readme_path, 'w', $write_readme);
  return;
}
|
|
250
|
|
251
|
|
252 1;
|
|
253
|