comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Pipeline::Flatfile::DumpFile
24
25 =head1 DESCRIPTION
26
27 The main workhorse of the Flatfile dumping pipeline.
28
29 This module is responsible for creating the file names of the target
30 files, taking data from the database and formatting the flat file
31 headers. The final files are all gzipped at normal levels of compression.
32
33 Allowed parameters are:
34
35 =over 8
36
37 =item species - The species to dump
38
39 =item base_path - The base of the dumps
40
41 =item release - The current release we are emitting
42
43 =item type - The type of data we are emitting. Should be embl or genbank
44
45 =back
46
47 =cut
48
49 package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile;
50
51 use strict;
52 use warnings;
53
54 use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base);
55
56 use Bio::EnsEMBL::Utils::Exception qw/throw/;
57 use Bio::EnsEMBL::Utils::SeqDumper;
58 use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file work_with_file/;
59 use File::Path qw/rmtree/;
60
# Default parameters for this module.
#
# Returns a hashref with a single key, 'supported_types', whose value is a
# lookup hash (format name => 1) of the flat file formats that
# fetch_input() will accept.
sub param_defaults {
  my ($self) = @_;
  my %supported_types = map { $_ => 1 } qw(embl genbank);
  return { supported_types => \%supported_types };
}
67
# Validates the parameters this module requires.
#
# Throws if 'type' is not set, if 'type' is not a key of the
# 'supported_types' parameter, or if any of 'species', 'release' or
# 'base_path' does not hold a true value.
sub fetch_input {
  my ($self) = @_;

  my $type = $self->param('type');
  if (! $type) {
    throw("No type specified");
  }
  if (! $self->param('supported_types')->{$type}) {
    throw("Unsupported type '$type' specified");
  }

  # Checked in this order so the first missing parameter is the one reported
  foreach my $required (qw(species release base_path)) {
    throw("Need a $required") unless $self->param($required);
  }

  return;
}
81
# Dumps every slice returned by get_Slices() in the configured flat file
# format and then writes the README.
#
# Any existing data directory is removed first. All non-chromosomal slices
# are written into a single gzipped 'nonchromosomal' file. Each chromosome
# slice is written to a gzipped file named after its coordinate system and
# sequence region name; if that file already exists at that point the
# Append option is passed through to gz_work_with_file().
sub run {
  my ($self) = @_;

  my $data_dir = $self->data_path();
  if (-d $data_dir) {
    $self->info('Directory "%s" already exists; removing', $data_dir);
    rmtree($data_dir);
  }

  # The dumper method to call is derived from the type, e.g. dump_embl
  my $dump_method = 'dump_' . $self->param('type');
  my $dumper      = $self->_seq_dumper();

  # Partition the slices; is_chromosome() is consulted once per slice
  my (@chromosomes, @non_chromosomes);
  foreach my $candidate (@{ $self->get_Slices() }) {
    if ($candidate->is_chromosome()) {
      push @chromosomes, $candidate;
    }
    else {
      push @non_chromosomes, $candidate;
    }
  }

  if (! @non_chromosomes) {
    $self->info('Did not find any non-chromosomal data');
  }
  else {
    my $non_chr_path = $self->_generate_file_name('nonchromosomal');
    $self->info('Dumping non-chromosomal data to %s', $non_chr_path);
    my $write_all = sub {
      my ($fh) = @_;
      for my $region (@non_chromosomes) {
        $self->fine('Dumping non-chromosomal %s', $region->name());
        $dumper->$dump_method($region, $fh);
      }
      return;
    };
    gz_work_with_file($non_chr_path, 'w', $write_all);
  }

  for my $chromosome (@chromosomes) {
    $self->fine('Dumping chromosome %s', $chromosome->name());
    my $chr_path = $self->_generate_file_name(
      $chromosome->coord_system_name(),
      $chromosome->seq_region_name(),
    );

    # A fresh options hash per file; Append is only set when the target
    # has already been written to
    my %gz_options;
    if (-f $chr_path) {
      $self->fine('Path "%s" already exists; appending', $chr_path);
      $gz_options{Append} = 1;
    }

    my $write_one = sub {
      my ($fh) = @_;
      $dumper->$dump_method($chromosome, $fh);
      return;
    };
    gz_work_with_file($chr_path, 'w', $write_one, \%gz_options);
  }

  $self->_create_README();

  return;
}
138
# Builds the Bio::EnsEMBL::Utils::SeqDumper used for the dumps, with the
# similarity, genscan, variation and repeat feature types disabled.
#
# Returns the configured dumper instance.
sub _seq_dumper {
  my ($self) = @_;
  my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
  foreach my $feature_type (qw(similarity genscan variation repeat)) {
    $dumper->disable_feature_type($feature_type);
  }
  return $dumper;
}
148
# Builds the full path of a dump file inside data_path().
#
# File name format looks like:
#   <species>.<assembly>.<release>.<section.name|section>.dat.gz
#   e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
#        Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
#
# Arguments:
#   $section - optional; e.g. 'nonchromosomal' or a coordinate system name
#   $name    - optional; e.g. a sequence region name
#
# Returns the path as built by File::Spec->catfile().
sub _generate_file_name {
  my ($self, $section, $name) = @_;

  # None of this module's own 'use' lines load File::Spec, so load it here
  # rather than depending on another module having done so
  require File::Spec;

  my @name_bits = (
    $self->web_name(),
    $self->assembly(),
    $self->param('release'),
  );

  # Test for content rather than truth: a sequence region called '0' is a
  # valid name and must not be dropped from the file name
  push @name_bits, grep { defined $_ && length $_ } ($section, $name);
  push @name_bits, 'dat', 'gz';

  my $file_name = join('.', @name_bits);
  return File::Spec->catfile($self->data_path(), $file_name);
}
168
# Writes a README file into data_path() describing the dumps.
#
# The text interpolates the scientific name of the species and the
# upper-cased 'type' parameter (the flat file format).
sub _create_README {
  my ($self) = @_;
  my $species = $self->scientific_name();
  my $format  = uc($self->param('type'));

  # Interpolating heredoc; $species and $format are substituted below
  my $readme = <<"README";
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

-----------------------
$format FLATFILE DUMPS
-----------------------
This directory contains $species $format flatfile dumps. To ease
downloading of the files, the $format format entries are bundled
into groups of chromosomes and non-chromosomal regions.
All files are then compacted with gzip.

Ensembl provides an automatic reannotation of $species genomic data.
These data will be dumped in a number of forms - one of them being
$format flat files. As the annotation of this form comes from Ensembl,
and not the original sequence entry, the two annotations are
likely to be different.

$format flat file format dumping provides all the confirmed protein coding
genes known by Ensembl. Considerably more information is stored in Ensembl:
the flat file just gives a representation which is compatible with
existing tools.

The main body of the entry gives the same information as is in the main
$format flat file entry.

* ID - the $format id
* AC - the EMBL/GenBank/DDBJ accession number (only the primary
accession number used)
* SV - The accession.version pair which gives the exact reference to
a particular sequence
* CC - comment lines to help you interpret the entry

Currently the following features are dumped into the feature table of
the Ensembl entry:

* Transcripts as CDS entries. Each transcript has the following
attributes attached
o Transcript id - a stable id, which Ensembl will attempt to
preserve as sensibly as possible during updates of the data
o Gene id - indication of the gene that this transcript belongs
to. gene ids are stable and preserved as sensibly as possible
during updates of the data
o Translation - the peptide translation of the transcript.
* Exons as exon entries. Each exon has the following information
o Exon id. The exon id is stable and preserved as sensibly
as possible during sequence updates
o start_phase. The phase of the splice site at the 5' end
of the exon. Phase 0 means between two codons, phase 1
means between the first and the second base of the codon
(meaning that there are 2 bases until the reading frame of
the exon) and phase 2 means between the second and the third
base of the codon (one base until the reading frame starts).
o end_phase. The phase of the splice site at the 3' end of the
exon: same definition as above (though of course, being end_phase,
the position relative to the exon's reading frame is different
for phase 1 and 2).

We are considering other information that should be made dumpable. In
general we would prefer people to use database access over flat file
access if you want to do something serious with the data.

README

  my $readme_path = File::Spec->catfile($self->data_path(), 'README');
  my $writer = sub {
    my ($fh) = @_;
    print {$fh} $readme;
    return;
  };
  work_with_file($readme_path, 'w', $writer);
  return;
}
250
251
252 1;
253