Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =pod | |
| 2 | |
| 3 =head1 LICENSE | |
| 4 | |
| 5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 6 Genome Research Limited. All rights reserved. | |
| 7 | |
| 8 This software is distributed under a modified Apache license. | |
| 9 For license details, please see | |
| 10 | |
| 11 http://www.ensembl.org/info/about/code_licence.html | |
| 12 | |
| 13 =head1 CONTACT | |
| 14 | |
| 15 Please email comments or questions to the public Ensembl | |
| 16 developers list at <dev@ensembl.org>. | |
| 17 | |
| 18 Questions may also be sent to the Ensembl help desk at | |
| 19 <helpdesk@ensembl.org>. | |
| 20 | |
| 21 =head1 NAME | |
| 22 | |
| 23 Bio::EnsEMBL::Pipeline::Flatfile::DumpFile | |
| 24 | |
| 25 =head1 DESCRIPTION | |
| 26 | |
| 27 The main workhorse of the Flatfile dumping pipeline. | |
| 28 | |
| 29 This module is responsible for creating the filenames of the target | |
| 30 files, fetching data from the database and formatting the flat file | |
| 31 headers. The final files are all Gzipped at normal levels of compression. | |
| 32 | |
| 33 Allowed parameters are: | |
| 34 | |
| 35 =over 8 | |
| 36 | |
| 37 =item species - The species to dump | |
| 38 | |
| 39 =item base_path - The base of the dumps | |
| 40 | |
| 41 =item release - The current release we are emitting | |
| 42 | |
| 43 =item type - The type of data we are emitting. Should be embl or genbank | |
| 44 | |
| 45 =back | |
| 46 | |
| 47 =cut | |
| 48 | |
| 49 package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile; | |
| 50 | |
| 51 use strict; | |
| 52 use warnings; | |
| 53 | |
| 54 use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base); | |
| 55 | |
| 56 use Bio::EnsEMBL::Utils::Exception qw/throw/; | |
| 57 use Bio::EnsEMBL::Utils::SeqDumper; | |
| 58 use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file work_with_file/; | |
| 59 use File::Path qw/rmtree/; | |
| 60 | |
# Default parameters for this runnable.
#
# Returns a hashref whose 'supported_types' entry maps every accepted value
# of the 'type' parameter (embl, genbank) to a true value; fetch_input()
# consults it to reject unknown types.
sub param_defaults {
  my ($self) = @_;
  my %supported_types = map { $_ => 1 } qw(embl genbank);
  return { supported_types => \%supported_types };
}
| 67 | |
# Validates the job parameters before run() is called.
#
# Throws if 'type' is missing or is not a key of the 'supported_types'
# parameter, or if any of 'species', 'release' or 'base_path' is not set
# to a true value. Returns nothing.
sub fetch_input {
  my ($self) = @_;

  my $type = $self->param('type');
  if (!$type) {
    throw("No type specified");
  }
  if (!$self->param('supported_types')->{$type}) {
    throw("Unsupported type '$type' specified");
  }

  # Checked in this order so the first missing parameter is the one reported.
  for my $required (qw(species release base_path)) {
    throw("Need a $required") if !$self->param($required);
  }

  return;
}
| 81 | |
# Dumps every slice returned by get_Slices() into gzipped flat files and
# then writes the README.
#
# Any existing data_path() directory is removed first. All non-chromosomal
# slices go into a single 'nonchromosomal' file; each chromosome slice goes
# into a file named after its coordinate system and sequence region name.
# Returns nothing.
sub run {
  my ($self) = @_;

  my $dump_dir = $self->data_path();
  if (-d $dump_dir) {
    $self->info('Directory "%s" already exists; removing', $dump_dir);
    rmtree($dump_dir);
  }

  # The SeqDumper method to invoke is derived from the 'type' parameter,
  # giving dump_embl or dump_genbank.
  my $format      = $self->param('type');
  my $dump_method = "dump_${format}";
  my $dumper      = $self->_seq_dumper();

  # Partition the slices in one pass, asking each slice only once.
  my (@chromosomes, @non_chromosomes);
  for my $candidate (@{ $self->get_Slices() }) {
    my $bucket = $candidate->is_chromosome() ? \@chromosomes : \@non_chromosomes;
    push @{$bucket}, $candidate;
  }

  if (!@non_chromosomes) {
    $self->info('Did not find any non-chromosomal data');
  }
  else {
    my $bundle_path = $self->_generate_file_name('nonchromosomal');
    $self->info('Dumping non-chromosomal data to %s', $bundle_path);

    my $write_bundle = sub {
      my ($fh) = @_;
      for my $region (@non_chromosomes) {
        $self->fine('Dumping non-chromosomal %s', $region->name());
        $dumper->$dump_method($region, $fh);
      }
      return;
    };
    gz_work_with_file($bundle_path, 'w', $write_bundle);
  }

  for my $chromosome (@chromosomes) {
    $self->fine('Dumping chromosome %s', $chromosome->name());
    my $chr_path = $self->_generate_file_name(
      $chromosome->coord_system_name(),
      $chromosome->seq_region_name(),
    );

    # A file that already exists at this point was written earlier in this
    # loop (the directory was cleared above), so ask for it to be appended to.
    my %gz_options;
    if (-f $chr_path) {
      $self->fine('Path "%s" already exists; appending', $chr_path);
      $gz_options{Append} = 1;
    }

    gz_work_with_file($chr_path, 'w', sub {
      my ($fh) = @_;
      $dumper->$dump_method($chromosome, $fh);
      return;
    }, \%gz_options);
  }

  $self->_create_README();

  return;
}
| 138 | |
# Builds the SeqDumper used for all dumps, with the similarity, genscan,
# variation and repeat feature types disabled.
#
# Returns the configured Bio::EnsEMBL::Utils::SeqDumper instance.
sub _seq_dumper {
  my ($self) = @_;
  my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
  for my $feature_type (qw(similarity genscan variation repeat)) {
    $dumper->disable_feature_type($feature_type);
  }
  return $dumper;
}
| 148 | |
# Builds the full path of a dump file inside data_path().
#
# File name format looks like:
#   <species>.<assembly>.<release>.<section.name|section>.dat.gz
#   e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
#        Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
#
# Arguments:
#   $section - optional section label, e.g. 'nonchromosomal' or a
#              coordinate system name
#   $name    - optional name within the section, e.g. a sequence region name
#
# Returns the path as a string.
sub _generate_file_name {
  my ($self, $section, $name) = @_;

  # Loaded here so catfile() does not rely on some other module having
  # pulled in File::Spec first.
  require File::Spec;

  # Test for definedness and length rather than truth, so that a region
  # named "0" still contributes to the file name instead of being dropped.
  my @optional_bits = grep { defined $_ && length $_ } ($section, $name);

  my $file_name = join('.',
    $self->web_name(),
    $self->assembly(),
    $self->param('release'),
    @optional_bits,
    'dat', 'gz',
  );

  return File::Spec->catfile($self->data_path(), $file_name);
}
| 168 | |
# Writes a README file into data_path() describing the flat file dumps.
#
# The text is a fixed template into which the species' scientific name and
# the upper-cased 'type' parameter (the dump format) are interpolated.
# Returns nothing.
sub _create_README {
  my ($self) = @_;
  my $species = $self->scientific_name();
  my $format = uc($self->param('type'));

  # Interpolating heredoc: $format and $species are substituted below.
  my $readme = <<README;
#### README ####

IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.

-----------------------
$format FLATFILE DUMPS
-----------------------
This directory contains $species $format flatfile dumps. To ease
downloading of the files, the $format format entries are bundled
into groups of chromosomes and non-chromosomal regions.
All files are then compacted with gzip.

Ensembl provides an automatic reannotation of $species genomic data.
These data will be dumped in a number of forms - one of them being
$format flat files. As the annotation of this form comes from Ensembl,
and not the original sequence entry, the two annotations are
likely to be different.

$format flat file format dumping provides all the confirmed protein coding
genes known by Ensembl. Considerably more information is stored in Ensembl:
the flat file just gives a representation which is compatible with
existing tools.

The main body of the entry gives the same information as is in the main
$format flat file entry.

* ID - the $format id
* AC - the EMBL/GenBank/DDBJ accession number (only the primary
accession number used)
* SV - The accession.version pair which gives the exact reference to
a particular sequence
* CC - comment lines to help you interpret the entry

Currently the following features are dumped into the feature table of
the Ensembl entry:

* Transcripts as CDS entries. Each transcript has the following
attributes attached
o Transcript id - a stable id, which Ensembl will attempt to
preserve as sensibly as possible during updates of the data
o Gene id - indication of the gene that this transcript belongs
to. gene ids are stable and preserved as sensibly as possible
during updates of the data
o Translation - the peptide translation of the transcript.
* Exons as exon entries. Each exon has the following information
o Exon id. The exon id is stable and preserved as sensibly
as possible during sequence updates
o start_phase. The phase of the splice site at the 5' end
of the exon. Phase 0 means between two codons, phase 1
means between the first and the second base of the codon
(meaning that there are 2 bases until the reading frame of
the exon) and phase 2 means between the second and the third
base of the codon (one base until the reading frame starts).
o end_phase. The phase of the splice site at the 3' end of the
exon: same definition as above (though of course, being end_phase,
the position relative to the exon's reading frame is different
for phase 1 and 2).

We are considering other information that should be made dumpable. In
general we would prefer people to use database access over flat file
access if you want to do something serious with the data.

README

  # Loaded here so catfile() does not rely on some other module having
  # pulled in File::Spec first.
  require File::Spec;

  my $path = File::Spec->catfile($self->data_path(), 'README');
  work_with_file($path, 'w', sub {
    my ($fh) = @_;
    print $fh $readme;
    return;
  });
  return;
}
| 250 | |
| 251 | |
| 252 1; | |
| 253 |
