annotate variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @ 2:a5976b2dce6f

changing default values for ensembl database
author mahtabm
date Thu, 11 Apr 2013 17:15:42 +1000
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =pod
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 developers list at <dev@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 Bio::EnsEMBL::Pipeline::FASTA::DumpFile
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 The main workhorse of the FASTA dumping pipeline. This module has two
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 functions
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 =over 8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 =item 1 - Dumping Genomic DNA sequences in a memory efficient manner in unmasked, softmasked & hardmasked formats
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 =item 2 - Dumping Genes as cDNA, proteins and ncRNA transcripts (abinitio included)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 The script is responsible for creating the filenames of these target
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 files, taking data from the database and the formatting of the FASTA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 headers. It is also responsible for the creation of README files pertaining
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 to the type of dumps produced. The final files are all Gzipped at normal
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 levels of compression.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 B<N.B.> This code will remove any files already found in the target directory
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 on its first run as it assumes all data will be dumped in the one process. It
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 is selective of its directory meaning a rerun of DNA dumps will not cause
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 the protein/cdna files to be removed.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49 Allowed parameters are:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51 =over 8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 =item species - The species to dump
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 =item sequence_type_list - The data to dump. I<dna>, I<cdna> and I<ncrna> are allowed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 =item release - A required parameter for the version of Ensembl we are dumping for
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 =item db_types - Array reference of the database groups to use. Defaults to core
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 =item process_logic_names - Array reference of transcript logic names to only process (only produce dumps for these). Applied before skip_logic_names
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 =item skip_logic_names - Array reference of transcript logic names to skip over (we do not produce dumps for these)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 =item base_path - The base of the dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 =item dna_chunk_size - Indicates the number of 60bp chunks to retrieve and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 process when formatting FASTA files. Normally do not
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 touch
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 =item allow_appending - If the same file name is generated we will
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 append into that file rather than overwriting
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 =item overwrite_files - If the same file name is generated we will overwrite
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 that file rather than appending to it
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 package Bio::EnsEMBL::Pipeline::FASTA::DumpFile;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 use warnings;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 use base qw(Bio::EnsEMBL::Pipeline::FASTA::Base);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 use File::Spec;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 use IO::Compress::Gzip qw/gzip $GzipError/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 use IO::File;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92 use Bio::EnsEMBL::PaddedSlice;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 use Bio::EnsEMBL::Utils::BiotypeMapper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 use Bio::EnsEMBL::Utils::Exception qw/throw/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 use Bio::EnsEMBL::Utils::IO::FASTASerializer;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97 use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 my $DNA_INDEXING_FLOW = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 my $PEPTIDE_INDEXING_FLOW = 2;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 my $GENE_INDEXING_FLOW = 3;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Supplies the default parameter values for this module.
#
# Returns a newly built hash reference on every call, so the array and hash
# references it contains are never shared between calls.
sub param_defaults {
  my ($self) = @_;

  # Settings a user is expected to configure.
  my %user_configurable = (
    allow_appending     => 1,
    overwrite_files     => 0,
    dna_chunk_size      => 17000,
    skip_logic_names    => [],
    process_logic_names => [],
  );

  # Internal bookkeeping; DON'T MESS with these.
  my %internal = (
    # used to track if we need to reopen a file in append mode or not
    generated_files       => {},
    remove_files_from_dir => {},
    dataflows             => [],
  );

  return { %user_configurable, %internal };
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Prepares derived parameters before run() is called.
#
# Reads:  sequence_type_list, db_types, skip_logic_names, process_logic_names
# Writes: sequence_types (lookup hash), analyses (values of the
#         'repeat.analysis' meta key), db_types (defaulted to ['core']),
#         skip_logic / process_logic (lookup hashes) and the matching
#         *_active flags, which are only set when the list is non-empty.
sub fetch_input {
  my ($self) = @_;

  # Turn the requested sequence types into a lookup hash
  my %wanted_types;
  $wanted_types{$_} = 1 for @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%wanted_types);

  # Repeat analyses are read from the meta table of the default DBAdaptor
  my $meta_container = $self->get_DBAdaptor()->get_MetaContainer();
  $self->param('analyses', $meta_container->list_value_by_key('repeat.analysis'));

  # Fall back to the core database group when none was given
  my $db_types = $self->param('db_types') || ['core'];
  $self->param('db_types', $db_types);

  # Each row is: source list param, lookup hash param, "filter is active" param
  my @logic_name_filters = (
    [ 'skip_logic_names',    'skip_logic',    'skip_logic_active' ],
    [ 'process_logic_names', 'process_logic', 'process_logic_active' ],
  );
  foreach my $filter (@logic_name_filters) {
    my ($list_param, $lookup_param, $active_param) = @{$filter};
    my %lookup;
    $lookup{$_} = 1 for @{ $self->param($list_param) };
    $self->param($lookup_param, \%lookup);
    if (@{ $self->param($list_param) }) {
      $self->param($active_param, 1);
    }
  }

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Runs the dump once for every database group listed in the db_types param.
# A group for which no DBAdaptor can be found is reported via info() and
# skipped; the remaining groups are still processed.
sub run {
  my ($self) = @_;

  for my $db_type (@{ $self->param('db_types') }) {
    if ($self->get_DBAdaptor($db_type)) {
      $self->run_type($db_type);
    }
    else {
      $self->info("Cannot continue with %s as we cannot find a DBAdaptor", $db_type);
    }
  }

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Emits every dataflow collected in the 'dataflows' param. Each entry is an
# array reference whose elements are passed, in order, as the arguments of
# dataflow_output_id().
sub write_output {
  my ($self) = @_;

  my $pending = $self->param('dataflows');
  for my $idx (0 .. $#{$pending}) {
    my @flow_args = @{ $pending->[$idx] };
    $self->dataflow_output_id(@flow_args);
  }

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Produces all requested dumps for one database group ($type).
#
# The 'sequence_types' lookup decides what is dumped:
#   dna   - genomic DNA, only when the group's DBAdaptor is its own DNADB
#   cdna  - transcripts, their peptides and the prediction transcripts
#   ncrna - non-coding RNA transcripts
# A README is created for each kind of data actually produced. The DBAdaptor
# for the group is cleaned up at the end, and the result of that call is the
# return value of this method.
sub run_type {
  my ($self, $type) = @_;

  my $species   = $self->param('species');
  my $requested = $self->param('sequence_types');

  # dump file for each type on a per slice basis
  # types are dna,cDNA, peptide, ncRNA
  my $adaptor = $self->get_DBAdaptor($type);

  #Only run if we are told to & the current DBA is the same as the attached
  #DNADB by checking the Stringified ref
  my $dump_dna = $requested->{dna} && $adaptor eq $adaptor->dnadb();
  if ($dump_dna) {
    $self->info( "Starting dna dump for " . $species );
    $self->_dump_dna($type);
    $self->_create_README('dna');
  }

  #includes peptides whether you like it or not
  if ( $requested->{cdna} ) {
    $self->info( "Starting cdna dump for " . $species );
    my ($have_transcripts, $have_peptides) = $self->_dump_transcripts('cdna', $type);

    $self->info( "Starting prediction transcript dumps for " . $species );
    my ($have_pred_transcripts, $have_pred_proteins) = $self->_dump_prediction_transcripts($type);

    if ($have_transcripts || $have_pred_transcripts) {
      $self->_create_README('cdna');
    }
    if ($have_peptides || $have_pred_proteins) {
      $self->_create_README('pep');
    }
  }

  if ( $requested->{ncrna} ) {
    $self->info( "Starting ncRNA dump for " . $species );
    my ($have_ncrna) = $self->_dump_transcripts('ncrna', $type);
    if ($have_ncrna) {
      $self->_create_README('ncrna');
    }
  }

  return $self->cleanup_DBAdaptor($type);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 # Dump entire sequence, also dump data into chromosome files as appropriate
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Dumps the genomic DNA of every slice of database group $type.
#
# Slices are split into chromosomes and non-chromosomes. All non-chromosomal
# slices share one 'dna' file and one 'dna_sm' (soft-masked) file; every
# chromosome gets its own pair. In both cases the soft-masked file is then
# converted into a hard-masked file. Finally three dataflows (dna, dna_sm,
# dna_rm) are queued on the DNA indexing branch.
sub _dump_dna {
  my ($self, $type) = @_;

  my $filter_human = 1;
  my (@chromosomes, @non_chromosomes);
  foreach my $slice (@{ $self->get_Slices($type, $filter_human) }) {
    if ($slice->is_chromosome()) {
      push(@chromosomes, $slice);
    }
    else {
      push(@non_chromosomes, $slice);
    }
  }

  ############ NON CHROMOSOME WORK
  $self->info('Processing %d non-chromosome(s)', scalar(@non_chromosomes));
  if (@non_chromosomes) {
    my ($plain_file, $plain_fh, $plain_serializer)
      = $self->_generate_fasta_serializer( 'dna', 'nonchromosomal' );
    my ($sm_file, $sm_fh, $sm_serializer)
      = $self->_generate_fasta_serializer( 'dna_sm', 'nonchromosomal' );

    $self->_dump_slice($_, $plain_serializer, $sm_serializer) for @non_chromosomes;

    #Quick close of the SM FH to flush all data out to disk; skip gzipping &
    #leave that to the next call
    $self->tidy_file_handle($sm_fh, $sm_file, 1);
    my ($rm_fh, $rm_file) = $self->_convert_softmask_to_hardmask($sm_file, $sm_fh);

    $self->tidy_file_handle($plain_fh, $plain_file);
    $self->tidy_file_handle($sm_fh, $sm_file);
    $self->tidy_file_handle($rm_fh, $rm_file);
    $self->info('Dumped non-chromosomes');
  }

  ############ CHROMOSOME WORK
  $self->info('Processing %d chromosome(s)', scalar(@chromosomes));
  foreach my $chromosome (@chromosomes) {
    my $region_name = $chromosome->seq_region_name();

    my ($plain_file, $plain_fh, $plain_serializer)
      = $self->_generate_fasta_serializer( 'dna', 'chromosome', $region_name, undef);
    # repeat masked data too
    my ($sm_file, $sm_fh, $sm_serializer)
      = $self->_generate_fasta_serializer( 'dna_sm', 'chromosome', $region_name, undef);

    $self->_dump_slice($chromosome, $plain_serializer, $sm_serializer);

    #Quick close of the SM FH to flush all data out to disk; skip gzipping &
    #leave that to the next call
    $self->tidy_file_handle($sm_fh, $sm_file, 1);
    my ($rm_fh, $rm_file) = $self->_convert_softmask_to_hardmask($sm_file, $sm_fh);

    $self->tidy_file_handle($plain_fh, $plain_file);
    $self->tidy_file_handle($sm_fh, $sm_file);
    $self->tidy_file_handle($rm_fh, $rm_file);
  }
  $self->info("Dumped chromosomes");

  #input_id
  foreach my $data_type (qw/dna dna_sm dna_rm/) {
    my $input_id = { data_type => $data_type, species => $self->param('species') };
    push(@{ $self->param('dataflows') }, [ $input_id, $DNA_INDEXING_FLOW ]);
  }

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Writes one slice to two serialisers: the sequence as-is to $serialiser and
# the soft repeat-masked sequence to $rm_serialiser. Both are wrapped in a
# Bio::EnsEMBL::PaddedSlice before printing (to automatically pad with N's
# outside of known regions).
#
#   $s             - the slice to dump
#   $serialiser    - receives the padded slice
#   $rm_serialiser - receives the padded, soft-masked slice
sub _dump_slice {
  my ($self, $s, $serialiser, $rm_serialiser) = @_;

  my $is_chromosome = $s->is_chromosome();
  $self->info('Starting slice - %s:%d-%d', $s->seq_region_name(), $s->start(), $s->end());
  $self->info($is_chromosome ? ' Slice is a chromosome' : ' Slice is non-chromosomal');

  # Plain sequence first
  $serialiser->print_Seq( Bio::EnsEMBL::PaddedSlice->new(-SLICE => $s) );

  # Then the repeat-masked version, masked using the 'analyses' param with
  # the soft-mask flag switched on, and padded in the same way
  my $soft_mask = 1;
  my $masked    = $s->get_repeatmasked_seq($self->param('analyses'), $soft_mask);
  $rm_serialiser->print_Seq( Bio::EnsEMBL::PaddedSlice->new(-SLICE => $masked) );

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 #Assumes we are working with un-compressed files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Creates a hard-masked copy of an uncompressed soft-masked FASTA file.
#
# The output file name is the input name with '.dna_sm.' replaced by
# '.dna_rm.'. Header lines (those starting with '>') have their first
# 'dna_sm' rewritten to 'dna_rm'; on every other line the lowercase letters
# a, c, g, t and n are replaced by 'N'.
#
#   $soft_mask_file - path of the soft-masked file to read
#   $soft_mask_fh   - accepted for interface compatibility; not used here
#
# Returns the still-open hard-mask file handle and the hard-mask file name,
# or nothing if the soft-masked file does not exist.
# Throws if the output name cannot be derived, or the output file cannot be
# opened or written to.
sub _convert_softmask_to_hardmask {
  my ($self, $soft_mask_file, $soft_mask_fh) = @_;
  if(! -f $soft_mask_file) {
    $self->info('Skipping as the target file %s does not exist. Must have been deleted', $soft_mask_file);
    return;
  }

  my $hard_mask_file = $soft_mask_file;
  $hard_mask_file =~ s/\.dna_sm\./.dna_rm./;
  # If nothing was substituted the output would be the input file itself, and
  # opening it for writing would truncate the data before it is read.
  if($hard_mask_file eq $soft_mask_file) {
    throw("Cannot derive a hard-masked file name from '${soft_mask_file}' as it does not contain '.dna_sm.'");
  }

  my $hm_fh = IO::File->new($hard_mask_file, 'w');
  if(! $hm_fh) {
    throw("Cannot open hard-masked file '${hard_mask_file}' for writing: $!");
  }

  $self->info('Converting soft-masked file %s into hard-masked file %s', $soft_mask_file, $hard_mask_file);
  work_with_file($soft_mask_file, 'r', sub {
    my ($sm_fh) = @_;
    while(my $line = <$sm_fh>) {
      if(index($line, '>') == 0) {
        $line =~ s/dna_sm/dna_rm/;
      }
      else {
        # tr/// takes a plain character list, not a regex character class, so
        # no square brackets here.
        # NOTE(review): other lowercase (soft-masked) IUPAC codes are left
        # untouched, as before - confirm that is intended.
        $line =~ tr/acgtn/N/;
      }
      print {$hm_fh} $line
        or throw("Cannot write to hard-masked file '${hard_mask_file}': $!");
    }
    return;
  });

  return ($hm_fh, $hard_mask_file);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320 sub _dump_transcripts {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321 my ($self, $transcript_type, $type) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323 my $has_transcript_data = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324 my $has_protein_data = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326 my $transcript_level = ($transcript_type ne 'ncrna') ? 'all' : undef;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 my ( $filename, $fh, $transcript_serializer ) =
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328 $self->_generate_fasta_serializer( $transcript_type, $transcript_level );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330 my ( $peptide_filename, $pep_fh, $peptide_serializer );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332 # some cDNAs are translated, make a file to receive them.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333 if ( $transcript_type eq 'cdna') {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 ( $peptide_filename, $pep_fh, $peptide_serializer ) =
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335 $self->_generate_fasta_serializer( 'pep', 'all' );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
338 # work out what biotypes correspond to $transcript_type
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
339 my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
340 my $biotypes_list = $biotype_mapper->group_members($transcript_type);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
341
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
342 my $dba = $self->get_DBAdaptor($type);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343 my $gene_adaptor = $dba->get_GeneAdaptor();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345 # get all the transcripts that are $transcript_type e.g. cdna, ncrna,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346 foreach my $biotype ( @{$biotypes_list} ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 my $gene_list = $gene_adaptor->fetch_all_by_biotype($biotype);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348 $self->info("Biotype %s has %d gene(s)", $biotype, scalar( @{$gene_list} ));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 while ( my $gene = shift @{$gene_list} ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350 $self->fine( 'Gene %s', $gene->display_id );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351 my $transcript_list = $gene->get_all_Transcripts();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
352 foreach my $transcript ( @{$transcript_list} ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
353 $self->fine( 'Transcript %s', $transcript->display_id );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
354 next unless $self->ok_to_process_logic_name($transcript);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
355
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
356 # foreach transcripts of all genes with biotypes classed as cdna
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
357 my $transcript_seq = $transcript->seq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
358 $self->_create_display_id($transcript, $transcript_seq, $transcript_type);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
359 $transcript_serializer->print_Seq($transcript_seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
360 if ($biotype_mapper->member_of_group( $biotype, 'peptide_producing')) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
361 my $translation = $transcript->translation();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
362 if ($translation) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
363 my $translation_seq = $transcript->translate();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
364 $self->_create_display_id($translation, $translation_seq, $transcript_type);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
365 $peptide_serializer->print_Seq($translation_seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
366
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
367 $has_protein_data = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
368 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
369 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
370
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
371 $has_transcript_data = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
372 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
373 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
374 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
375
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
376 $self->tidy_file_handle( $fh, $filename );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
377 if ( $transcript_type eq 'cdna' ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
378 $self->tidy_file_handle( $pep_fh, $peptide_filename );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
379 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
380
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
381 if($has_protein_data) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
382 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($peptide_filename), species => $self->param('species') }, $PEPTIDE_INDEXING_FLOW]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
383 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
384 if($has_transcript_data) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
385 push(@{$self->param('dataflows')}, [{ file => $self->_final_filename($filename), species => $self->param('species') }, $GENE_INDEXING_FLOW]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
386 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
387
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
388 return ($has_transcript_data, $has_protein_data);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
389 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
390
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Dumps every prediction (ab initio) transcript as cDNA together with its
# peptide; all predicted transcripts are expected to have translations. The
# two files are only created when at least one prediction transcript was
# fetched. Any file which received a record is queued in the 'dataflows' param
# for indexing. Returns the list (has transcript data, has protein data).
sub _dump_prediction_transcripts {
  my ($self, $type) = @_;

  my $adaptor = $self->get_DBAdaptor($type)->get_PredictionTranscriptAdaptor();
  my $predictions = $adaptor->fetch_all();
  my $total = scalar(@{$predictions});
  $self->info('Found %d prediction transcript(s)', $total);

  # No records means nothing is written or flowed
  return (0, 0) if ! $total;

  my ($wrote_cdna, $wrote_pep) = (0, 0);
  my ( $cdna_file, $cdna_fh, $cdna_writer ) =
    $self->_generate_fasta_serializer( 'cdna', 'abinitio' );
  my ( $pep_file, $pep_fh, $pep_writer ) =
    $self->_generate_fasta_serializer( 'pep', 'abinitio' );

  # Consume the list so each record is dropped from it once handled
  while ( my $prediction = shift @{$predictions} ) {
    next if ! $self->ok_to_process_logic_name($prediction);

    $wrote_cdna = 1;
    my $cdna = $prediction->seq();
    $self->_create_display_id( $prediction, $cdna, 'cdna' );
    $cdna_writer->print_Seq($cdna);

    # The peptide is requested before the translation is checked for
    my $peptide = $prediction->translate();
    next if ! $prediction->translation();

    $wrote_pep = 1;
    $self->_create_display_id( $prediction, $peptide, 'pep' );
    $pep_writer->print_Seq($peptide);
  }

  $self->tidy_file_handle( $cdna_fh, $cdna_file );
  $self->tidy_file_handle( $pep_fh, $pep_file );

  # Peptides are flowed ahead of the transcripts
  push(
    @{$self->param('dataflows')},
    [
      { file => $self->_final_filename($pep_file), species => $self->param('species') },
      $PEPTIDE_INDEXING_FLOW
    ]
  ) if $wrote_pep;
  push(
    @{$self->param('dataflows')},
    [
      { file => $self->_final_filename($cdna_file), species => $self->param('species') },
      $GENE_INDEXING_FLOW
    ]
  ) if $wrote_cdna;

  return ($wrote_cdna, $wrote_pep);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
439
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Finishes off a dump file: an open handle is passed to the superclass and the
# file is then Gzipped.
#
# The superclass cleanup routines only work with an open file handle, so the
# call is only delegated *if* the handle is open; a true value from the
# superclass ends the call. The Gzip step is optional (see $no_gzip) and is
# also skipped when the source file does not exist, as it could have been
# unlinked by an earlier call.
#
# Arguments:
#   $fh      - handle the file was written through
#   $path    - location of the uncompressed file
#   $no_gzip - true to skip compression
#
# Returns 1 when the superclass reported true, 0 after Gzipping $path into
# "$path.gz" and deleting $path, and nothing when compression was skipped.
# Throws when the target was already generated this session and neither
# 'allow_appending' nor 'overwrite_files' is set, when Gzip fails or when the
# original file cannot be deleted.
sub tidy_file_handle {
  my ($self, $fh, $path, $no_gzip) = @_;

  # A missing handle is treated like a closed one rather than dying on undef
  if(defined $fh && $fh->opened()) {
    my $tidy = $self->SUPER::tidy_file_handle($fh, $path);
    return 1 if $tidy;
  }

  return if $no_gzip; #don't gzip if we were told to skip
  return if ! -f $path; #don't gzip if we had no file

  my $target = $path.".gz";
  $self->info('Gzipping "%s"', $path);
  my %args;
  if($self->param('generated_files')->{$target}) {
    if($self->param('allow_appending')) {
      $self->info('Going to append to the file %s as we have created two files of the same name in the same session', $target);
      $args{Append} = 1;
    }
    elsif($self->param('overwrite_files')) {
      $self->info('Overwriting the file %s as we have created two files of the same name in the same session', $target);
    }
    else {
      # Format the message ourselves; the placeholder was previously left
      # unfilled so the error never named the offending file
      $self->throw(sprintf('Cannot continue. The file %s has already been created this session. Fail!', $target));
    }
  }
  gzip($path => $target, %args) or throw "GZip error compressing $path to $target: $GzipError";
  $self->info('    Removing original file from filesystem');
  unlink $path or throw "Could not delete $path: $!";
  $self->info('    Finished');

  # Remember the target so a second file of the same name is detected
  $self->param('generated_files')->{$target} = 1;
  return 0;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
478
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Applies the logic name filters to a transcript, which is assumed to be ok
# unless proven otherwise. It is rejected when 'process_logic_active' is set
# and its logic name is missing from 'process_logic', or when
# 'skip_logic_active' is set and its logic name is found in 'skip_logic'.
# Both filters are always evaluated. Returns 1 to process and 0 to skip.
sub ok_to_process_logic_name {
  my ($self, $transcript) = @_;
  my $name = $transcript->analysis()->logic_name();
  my $verdict = 1;

  if($self->param('process_logic_active') && ! $self->param('process_logic')->{$name}) {
    $self->fine('Transcript %s has been filtered because logic_name %s is not in the active logic name list', $transcript->stable_id(), $name);
    $verdict = 0;
  }

  if($self->param('skip_logic_active') && $self->param('skip_logic')->{$name}) {
    $self->fine('Transcript %s has been filtered because logic_name %s is in the skip logic name list', $transcript->stable_id(), $name);
    $verdict = 0;
  }

  return $verdict;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
498
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Opens the dump file for the given data type, level and section for writing
# and wraps the handle in a FASTA serializer. The header formatter defaults to
# the one from _custom_header() and the chunk size is read from the
# 'dna_chunk_size' param. Returns the list (filename, handle, serializer).
sub _generate_fasta_serializer {
  my ( $self, $datatype, $level, $section, $header_formatter ) = @_;

  $header_formatter = $self->_custom_header() if ! $header_formatter;
  my $chunk_size = $self->param('dna_chunk_size');
  my $filename = $self->_generate_file_name( $datatype, $level, $section );

  my $handle = IO::File->new($filename, '>');
  throw "Cannot open $filename for writing: $!" if ! $handle;

  my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($handle, $header_formatter, $chunk_size);
  return ( $filename, $handle, $serializer );
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
509
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Builds the full path of a FASTA dump file.
#
#   _generate_file_name( data type, level,      section )
#                        dna        toplevel    undef
#                        dna        chromosome  6
#
# Level and section are optional and are left out of the name when false.
# File name format looks like:
#   <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa
# e.g. Homo_sapiens.GRCh37.64.dna_rm.chromosome.HG905_PATCH.fa
#      Homo_sapiens.GRCh37.64.dna.chromosome.20.fa
#      Ciona_savignyi.CSAV2.0.65.dna.toplevel.fa
#
# The directory comes from fasta_path() for the data type with any _rm or _sm
# designation removed, and is handed to _remove_files_from_dir() before the
# path is returned.
sub _generate_file_name {
  my ( $self, $data_type, $level, $section ) = @_;

  my $file_name = join( '.',
    $self->web_name(),
    $self->assembly(),
    $self->param('release'),
    lc($data_type),
    ( grep { $_ } $level, $section ),
    'fa'
  );

  # The repeatmask or softmask designation is not a path component
  ( my $dir_type = $data_type ) =~ s/_[rs]m$//;
  my $data_type_dir = $self->fasta_path($dir_type);
  $self->_remove_files_from_dir($data_type_dir);

  return File::Spec->catfile( $data_type_dir, $file_name );
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
539
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Clears out files left in a directory, but only on the first request for that
# directory: handled directories are flagged in the 'remove_files_from_dir'
# param and later calls for them return straight away.
sub _remove_files_from_dir {
  my ($self, $dir) = @_;

  # Already dealt with for this instance of the Process
  return if $self->param('remove_files_from_dir')->{$dir};

  $self->unlink_all_files($dir);
  $self->param('remove_files_from_dir')->{$dir} = 1;
  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
550
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Generates the FASTA header for a record and stores it as the display_id of
# $seq. The header is laid out as
#
#   <stable id> <type>:<status> <location> <attributes>
#
# where the attributes are space separated key:value pairs given in the order
# gene, transcript, gene_biotype, transcript_biotype (absent keys are left
# out). $object must be a Transcript, PredictionTranscript or Translation;
# anything else is thrown on. $type is the kind of dump being made, e.g. cdna,
# ncrna or pep. Returns nothing.
sub _create_display_id {
  my ($self, $object, $seq, $type) = @_;

  my ($stable_id, $location, $decoded_type, $decoded_status);
  my %attributes;

  if(check_ref($object, 'Bio::EnsEMBL::Transcript')) {
    # Shared by real and prediction transcripts
    $attributes{transcript_biotype} = $object->biotype();
    $stable_id = $object->stable_id();
    $location = $object->feature_Slice()->name();

    if(check_ref($object, 'Bio::EnsEMBL::PredictionTranscript')) {
      # No gene for a prediction; the status is taken from the logic name
      $decoded_type = $type;
      $decoded_status = lc($object->analysis()->logic_name());
      $attributes{transcript} = $stable_id if $type eq 'pep';
    }
    else {
      # Must be a real "transcript"
      my $gene = $object->get_Gene();
      $attributes{gene} = $gene->stable_id();
      $attributes{gene_biotype} = $gene->biotype();

      if($type eq 'ncrna') {
        # Type is the logic name and status the gene's biotype (taken from
        # original script)
        $decoded_type = lc($object->analysis()->logic_name());
        $decoded_status = $gene->biotype();
      }
      else {
        # Untranslated pseudogenes are labelled as such; everything else
        # reports the transcript's own status
        $decoded_type = $type;
        $decoded_status =
          ( $object->biotype() =~ /pseudogene/i && ! $object->translation() )
          ? 'pseudogene'
          : lc($object->status());
      }
    }
  }
  elsif(check_ref($object, 'Bio::EnsEMBL::Translation')) {
    # A translation is described through its transcript and gene
    my $parent = $object->transcript();
    my $gene = $parent->get_Gene();
    $stable_id = $object->stable_id();
    $location = $parent->feature_Slice()->name();
    %attributes = (
      gene => $gene->stable_id(),
      gene_biotype => $gene->biotype(),
      transcript => $parent->stable_id(),
      transcript_biotype => $parent->biotype()
    );
    $decoded_type = 'pep';
    $decoded_status = lc($parent->status());
  }
  else {
    throw sprintf( 'Do not understand how to format a display_id for type "%s"',
      ref($object) );
  }

  my @pairs;
  foreach my $key (qw/gene transcript gene_biotype transcript_biotype/) {
    next if ! exists $attributes{$key};
    push @pairs, $key.':'.$attributes{$key};
  }
  my $attr_str = join(q{ }, @pairs);

  $seq->display_id(
    sprintf( '%s %s:%s %s %s', $stable_id, $decoded_type, $decoded_status, $location, $attr_str )
  );

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
630
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Returns the code reference used to format FASTA header lines. It is given
# the object about to be written: anything which is not a Slice answers with
# its own display_id, while a Slice is described as
#
#   <seq region name> <dna type>:<coord system name> <slice name> <assembly exception type>
#
# with a dna type of dna, dna_sm or dna_rm.
sub _custom_header {
  my ($self) = @_;
  return sub {
    my ($record) = @_;

    return $record->display_id() unless $record->isa('Bio::EnsEMBL::Slice');

    #RMS means masked data. soft_mask() true means it was softmasked
    my $masking = q{};
    if($record->isa('Bio::EnsEMBL::RepeatMaskedSlice')) {
      $masking = $record->soft_mask() ? '_sm' : '_rm';
    }
    my $dna_type = 'dna'.$masking;

    my $region = $record->seq_region_name;
    my $coord_system = $record->coord_system->name;
    my $location = $record->name;
    my $exception = $record->assembly_exception_type();

    return sprintf('%s %s:%s %s %s', $region, $dna_type, $coord_system, $location, $exception);
  };
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
653
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Gives the name a dump file carries once it has been Gzipped: a name already
# ending in .gz comes back untouched and any other name has .gz appended.
sub _final_filename {
  my ($self, $filename) = @_;
  return ( $filename =~ /\.gz$/ ) ? $filename : $filename.'.gz';
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
659
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Looks up the single value held under the 'assembly.accession' key in the
# MetaContainer of the default DBAdaptor.
sub assembly_accession {
  my ($self) = @_;
  return $self->get_DBAdaptor()
              ->get_MetaContainer()
              ->single_value_by_key('assembly.accession');
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
665
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Looks up the single value held under the 'assembly.web_accession_type' key
# in the MetaContainer of the default DBAdaptor.
sub assembly_accession_type {
  my ($self) = @_;
  return $self->get_DBAdaptor()
              ->get_MetaContainer()
              ->single_value_by_key('assembly.web_accession_type');
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
671
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
672 sub _create_README {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
673
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
674 #Text for readme files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
675
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
676 my %text = (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
677 dna => <<'README',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
678 #######################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
679 Fasta DNA dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
680 #######################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
681
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
682 -----------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
683 FILE NAMES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
684 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
685 The files are consistently named following this pattern:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
686 <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
687
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
688 <species>: The systematic name of the species.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
689 <assembly>: The assembly build name.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
690 <release>: The release number.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
691 <sequence type>:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
692 * 'dna' - unmasked genomic DNA sequences.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
693 * 'dna_rm' - masked genomic DNA. Interspersed repeats and low
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
694 complexity regions are detected with the RepeatMasker tool and masked
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
695 by replacing repeats with 'N's.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
696 * 'dna_sm' - soft-masked genomic DNA. All repeats and low complexity regions
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
697 have been replaced with lowercased versions of their nucleic base
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
698 <id type> One of the following:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
699 * 'chromosome' - The top-level coordinate system in most species in Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
700 * 'nonchromosomal' - Contains DNA that has not been assigned a chromosome
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
701 * 'seqlevel' - This is usually sequence scaffolds, chunks or clones.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
702 -- 'scaffold' - Larger sequence contigs from the assembly of shorter
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
703 sequencing reads (often from whole genome shotgun, WGS) which could
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
704 not yet be assembled into chromosomes. Often more genome sequencing
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
705 is needed to narrow gaps and establish a tiling path.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
706 -- 'chunk' - While contig sequences can be assembled into large entities,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
707 they sometimes have to be artificially broken down into smaller entities
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
708 called 'chunks'. This is due to limitations in the annotation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
709 pipeline and the finite record size imposed by MySQL which stores the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
710 sequence and annotation information.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
711 -- 'clone' - In general this is the smallest sequence entity. It is often
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
712 identical to the sequence of one BAC clone, or sequence region
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
713 of one BAC clone which forms the tiling path.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
714 <id>: The actual sequence identifier. Depending on the <id type> the <id>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
715 could represent the name of a chromosome, a scaffold, a contig, a clone ..
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
716 Field is empty for seqlevel files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
717 fa: All files in these directories represent FASTA database files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
718 gz: All files are compacted with GNU Zip for storage efficiency.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
719
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
720 -----------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
721 TOPLEVEL
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
722 ----------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
723 These files contain the full sequence of the assembly in fasta format.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
724 They contain one chromosome per file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
725
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
726 EXAMPLES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
727 The genomic sequence of human chromosome 1:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
728 Homo_sapiens.GRCh37.57.dna.chromosome.1.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
729
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
730 The masked version of the genome sequence on human chromosome 1
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
731 (contains '_rm' or '_sm' in the name):
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
732 Homo_sapiens.GRCh37.57.dna_rm.chromosome.1.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
733 Homo_sapiens.GRCh37.57.dna_sm.chromosome.1.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
734
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
735 Non-chromosomal assembly sequences:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
736 e.g. mitochondrial genome, sequence contigs not yet mapped on chromosomes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
737 Homo_sapiens.GRCh37.57.dna.nonchromosomal.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
738 Homo_sapiens.GRCh37.57.dna_rm.nonchromosomal.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
739 Homo_sapiens.GRCh37.57.dna_sm.nonchromosomal.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
740
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
741
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
742 --------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
743 SPECIAL CASES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
744 --------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
745 Some chromosomes have alternate haplotypes which are presented in files with
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
746 the haplotype sequence only:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
747 Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR6_MHC_QBL.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
748 Homo_sapiens.GRCh37.56.dna_rm.chromosome.HSCHR17_1.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
749
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
750
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
751 Some species have sequenced Y chromosomes and the pseudoautosomal region (PAR)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
752 on the Y is annotated. By definition the PAR region is identical on the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
753 X and Y chromosome. We provide this sequence in the following way.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
754 -- The Y chromosome file contains the complete sequence of the PAR:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
755 Homo_sapiens.GRCh37.56.dna.chromosome.Y.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
756 -- The top level file includes only the unique portion of Y (i.e. the PAR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
757 region is N-masked):
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
758 Homo_sapiens.GRCh37.56.dna.toplevel.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
759
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
760 README
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
761
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
762 pep => <<'README',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
763 ####################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
764 Fasta Peptide dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
765 ####################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
766 These files hold the protein translations of Ensembl gene predictions.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
767
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
768 -----------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
769 FILE NAMES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
770 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
771 The files are consistently named following this pattern:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
772 <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
773
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
774 <species>: The systematic name of the species.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
775 <assembly>: The assembly build name.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
776 <release>: The release number.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
777 <sequence type>: pep for peptide sequences
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
778 <status>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
779 * 'pep.all' - the super-set of all translations resulting from Ensembl known
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
780 or novel gene predictions.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
781 * 'pep.abinitio' translations resulting from 'ab initio' gene
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
782 prediction algorithms such as SNAP and GENSCAN. In general, all
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
783 'ab initio' predictions are based solely on the genomic sequence and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
784 not any other experimental evidence. Therefore, not all GENSCAN
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
785 or SNAP predictions represent biologically real proteins.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
786 fa : All files in these directories represent FASTA database files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
787 gz : All files are compacted with GNU Zip for storage efficiency.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
788
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
789 EXAMPLES (Note: Most species do not have sequences for each different <status>)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
790 for Human:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
791 Homo_sapiens.NCBI36.40.pep.all.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
792 contains all known and novel peptides
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
793 Homo_sapiens.NCBI36.40.pep.abinitio.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
794 contains all abinitio predicted peptides
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
795
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
796 Difference between known and novel
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
797 ----------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
798 Protein models that can be mapped to species-specific entries in
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
799 Swiss-Prot, RefSeq or SPTrEMBL are referred to in Ensembl as
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
800 known genes. Those that cannot be mapped are called novel
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
801 (e.g. genes predicted on the basis of evidence from closely related species).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
802
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
803 For models annotated by HAVANA the status is set manually. Models that have
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
804 an HGNC name are referred to as known and the remaining models are referred to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
805 as novel.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
806
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
807 -------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
808 FASTA Sequence Header Lines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
809 ------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
810 The FASTA sequence header lines are designed to be consistent across
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
811 all types of Ensembl FASTA sequences. This gives enough information
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
812 for the sequence to be identified outside the context of the FASTA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
813 database file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
814
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
815 General format:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
816
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
817 >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
818
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
819 Example of Ensembl Peptide header:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
820
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
821 >ENSP00000328693 pep:novel chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
822 ^ ^ ^ ^ ^ ^ ^ ^
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
823 ID | | LOCATION GENE:stable gene ID | GENE: gene biotype TRANSCRIPT: transcript biotype
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
824 | STATUS TRANSCRIPT: stable transcript ID
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
825 SEQTYPE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
826
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
827 README
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
828
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
829 cdna => <<'README',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
830 ##################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
831 Fasta cDNA dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
832 #################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
833
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
834 These files hold the cDNA sequences corresponding to Ensembl gene predictions.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
835
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
836 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
837 FILE NAMES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
838 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
839 The files are consistently named following this pattern:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
840 <species>.<assembly>.<release>.<sequence type>.<status>.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
841
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
842 <species>: The systematic name of the species.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
843 <assembly>: The assembly build name.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
844 <release>: The release number.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
845 <sequence type>: cdna for cDNA sequences
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
846 <status>
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
847 * 'cdna.all' - the super-set of all transcripts resulting from
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
848 Ensembl known, novel and pseudo gene predictions (see more below).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
849 * 'cdna.abinitio' - transcripts resulting from 'ab initio' gene prediction
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
850 algorithms such as SNAP and GENSCAN. In general all 'ab initio'
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
851 predictions are solely based on the genomic sequence and do not
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
852 use other experimental evidence. Therefore, not all GENSCAN or SNAP
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
853 cDNA predictions represent biologically real cDNAs.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
854 Consequently, these predictions should be used with care.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
855
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
856 EXAMPLES (Note: Most species do not have sequences for each different <status>)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
857 for Human:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
858 Homo_sapiens.NCBI36.40.cdna.all.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
859 cDNA sequences for all transcripts: known, novel and pseudo
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
860 Homo_sapiens.NCBI36.40.cdna.abinitio.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
861 cDNA sequences for 'ab-initio' prediction transcripts.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
862
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
863 Difference between known and novel transcripts
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
864 -----------------------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
865 Transcript or protein models that can be mapped to species-specific entries
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
866 in Swiss-Prot, RefSeq or SPTrEMBL are referred to as known genes in Ensembl.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
867 Those that cannot be mapped are called novel genes (e.g. genes predicted on
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
868 the basis of evidence from closely related species).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
869
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
870 For models annotated by HAVANA the status is set manually. Models that have
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
871 an HGNC name are referred to as known and the remaining models are referred to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
872 as novel.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
873
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
874 -------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
875 FASTA Sequence Header Lines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
876 ------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
877 The FASTA sequence header lines are designed to be consistent across
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
878 all types of Ensembl FASTA sequences. This gives enough information
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
879 for the sequence to be identified outside the context of the FASTA file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
880
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
881 General format:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
882
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
883 >ID SEQTYPE:STATUS LOCATION GENE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
884
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
885 Example of an Ensembl cDNA header:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
886
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
887 >ENST00000289823 cdna:known chromosome:NCBI35:8:21922367:21927699:1 gene:ENSG00000158815 gene_biotype:protein_coding transcript_biotype:protein_coding
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
888 ^ ^ ^ ^ ^ ^ ^
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
889 ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
890 | STATUS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
891 SEQTYPE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
892
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
893
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
894 README
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
895
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
896 ncrna => <<'README',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
897 ##################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
898 Fasta RNA dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
899 #################
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
900
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
901 These files hold the transcript sequences corresponding to non-coding RNA genes (ncRNA).
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
902
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
903 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
904 FILE NAMES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
905 ------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
906 The files are consistently named following this pattern:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
907 <species>.<assembly>.<release>.<sequence type>.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
908
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
909 <species>: The systematic name of the species.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
910 <assembly>: The assembly build name.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
911 <release>: The release number.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
912 <sequence type>: ncrna for non-coding RNA sequences
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
913
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
914 EXAMPLES
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
915 for Human:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
916 Homo_sapiens.NCBI36.40.ncrna.fa.gz
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
917 Transcript sequences for all ncRNA gene types.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
918
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
919
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
920 -------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
921 FASTA Sequence Header Lines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
922 ------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
923 The FASTA sequence header lines are designed to be consistent across
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
924 all types of Ensembl FASTA sequences. This gives enough information
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
925 for the sequence to be identified outside the context of the FASTA file.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
926
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
927 General format:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
928
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
929 >ENST00000347977 ncrna:miRNA chromosome:NCBI35:1:217347790:217347874:-1 gene:ENSG00000195671 gene_biotype:ncRNA transcript_biotype:ncRNA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
930 ^ ^ ^ ^ ^ ^ ^
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
931 ID | | LOCATION GENE: gene stable ID GENE: gene biotype TRANSCRIPT: transcript biotype
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
932 | STATUS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
933 SEQTYPE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
934
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
935
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
936 README
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
937 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
938
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
939 my $warning = <<'README';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
940 #### README ####
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
941
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
942 IMPORTANT: Please note you can download correlation data tables,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
943 supported by Ensembl, via the highly customisable BioMart and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
944 EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
945 http://www.ebi.ac.uk/biomart/ for more information.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
946
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
947 README
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
948
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
949 my ( $self, $data_type ) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
950 my $base_path = $self->fasta_path();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
951 my $path = File::Spec->catfile( $base_path, $data_type, 'README' );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
952 my $accession = $self->assembly_accession();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
953 my $txt = $text{$data_type};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
954 throw "Cannot find README text for type $data_type" unless $txt;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
955
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
956 #Add accession information if it is available
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
957 if($data_type eq 'dna' && $accession) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
958 my $type = $self->assembly_accession_type();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
959 $warning .= <<EXTRA;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
960 The genome assembly represented here corresponds to $type
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
961 $accession
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
962
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
963 EXTRA
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
964 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
965
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
966 work_with_file($path, 'w', sub {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
967 my ($fh) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
968 print $fh $warning;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
969 print $fh $txt;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
970 return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
971 });
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
972 return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
973 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
974
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
975 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
976