comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive
24
25 =head1 SYNOPSIS
26
27 my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
28 -LOGGER => $logger,
29 -CONF => $conf,
30 -CACHE => $cache
31 );
32
33 # create gene and peptide archive
34 $archiver->create_archive($mapping_session_id);
35
36 # dump existing archive tables to file
37 my $num_entries =
38 $archiver->dump_table_to_file( 'source', 'gene_archive',
39 'gene_archive_existing.txt', 1 );
40
41 =head1 DESCRIPTION
42
43 This module creates the gene_archive and peptide_archive
44 tables. Data is written to a file as tab-delimited text for
45 loading into a MySQL database (this can be done manually, or using
46 StableIdmapper->upload_file_into_table()).
47
48 An archive entry for a given source gene is created if no target
49 gene exists, or if any of its transcripts or their translations
50 changed. Non-coding transcripts only have an entry in gene_archive (i.e.
51 without a corresponding peptide_archive entry).
52
53 =head1 METHODS
54
55 create_archive
56 dump_gene
57 dump_tuple
58 dump_nc_row
59 mapping_session_id
60
61 =cut
62
63
64 package Bio::EnsEMBL::IdMapping::Archiver;
65
66 use strict;
67 use warnings;
68 no warnings 'uninitialized';
69
70 use Bio::EnsEMBL::IdMapping::BaseObject;
71 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
72
73 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
74 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
75 use Digest::MD5 qw(md5_hex);
76
77
78 # instance variables
79 my $pa_id;
80
81
82 =head2 create_archive
83
84 Arg[1] : Int $mapping_session_id - the mapping_session_id for this run
85 Example : $archiver->create_archive($stable_id_mapper->mapping_session_id);
86 Description : Creates the gene_archive and peptide_archive tables and writes
87 the data to a tab-delimited file. The decision as to what to
88 archive is deferred to dump_gene(), see documentation there for
89 details.
90 Return type : none
91 Exceptions : Thrown on missing argument.
92 Caller : id_mapping.pl
93 Status : At Risk
94 : under development
95
96 =cut
97
98 sub create_archive {
99 my $self = shift;
100 my $mapping_session_id = shift;
101
102 # argument check
103 unless ($mapping_session_id) {
104 $self->logger->warning("No mapping_session_id set.");
105 }
106
107 $self->mapping_session_id($mapping_session_id);
108
109 # get filehandles to write gene and peptide archive
110 my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables');
111 my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables');
112
113 # get the currently highest peptide_archive_id from the source db
114 my $s_dba = $self->cache->get_DBAdaptor('source');
115 my $s_dbh = $s_dba->dbc->db_handle;
116 my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive);
117 $pa_id = $self->fetch_value_from_db($s_dbh, $sql);
118
119 unless ($pa_id) {
120 $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1);
121 $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1);
122 }
123
124 $pa_id++;
125 $self->logger->debug("Starting with peptide_archive_id $pa_id.\n");
126
127 # lookup hash of target gene stable IDs
128 my %target_genes = map { $_->stable_id => $_ }
129 values %{ $self->cache->get_by_name("genes_by_id", 'target') };
130
131 # loop over source genes and dump to archive (dump_gene() will decide whether
132 # to do full or partial dump)
133 foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id",
134 'source') }) {
135
136 $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id},
137 $ga_fh, $pa_fh);
138 }
139
140 close($ga_fh);
141 close($pa_fh);
142 }
143
144
145 =head2 dump_gene
146
147 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene
148 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene
149 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
150 Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data
151 Example : my $target_gene = $gene_mappings{$source_gene->stable_id};
152 $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh);
153 Description : Given a source gene, it will write a gene_achive and
154 peptide_achive entry for it if no target gene exists, or if any
155 of its transcripts or their translation changed.
156 Return type : none
157 Exceptions : none
158 Caller : create_archive()
159 Status : At Risk
160 : under development
161
162 =cut
163
164 sub dump_gene {
165 my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_;
166
167 # private method, so no argument check done for performance reasons
168
169 # deal with ncRNA differently
170 # hope this simple biotype regex is accurate enough...
171 my $is_ncRNA = 0;
172 $is_ncRNA = 1 if ($s_gene->biotype =~ /RNA/);
173
174 # loop over source transcripts
175 foreach my $s_tr (@{ $s_gene->get_all_Transcripts }) {
176 my $s_tl = $s_tr->translation;
177
178 # we got a coding transcript
179 if ($s_tl) {
180
181 # do a full dump of this gene if no target gene exists
182 if (! $t_gene) {
183 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
184
185 # otherwise, only dump if any transcript or its translation changed
186 } else {
187
188 my $changed_flag = 1;
189
190 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
191 my $t_tl = $t_tr->translation;
192 next unless ($t_tl);
193
194 if (($s_tr->stable_id eq $t_tr->stable_id) and
195 ($s_tl->stable_id eq $t_tl->stable_id) and
196 ($s_tl->seq eq $t_tl->seq)) {
197 $changed_flag = 0;
198 }
199 }
200
201 if ($changed_flag) {
202 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
203 }
204 }
205
206 # now deal with ncRNAs (they don't translate but we still want to archive
207 # them)
208 } elsif ($is_ncRNA) {
209
210 if (! $t_gene) {
211
212 $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
213
214 } else {
215
216 my $changed_flag = 1;
217
218 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
219 $changed_flag = 0 if ($s_tr->stable_id eq $t_tr->stable_id);
220 }
221
222 if ($changed_flag) {
223 $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
224 }
225
226 }
227 }
228 }
229 }
230
231
232 =head2 dump_tuple
233
234 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
235 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
236 Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation
237 Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data
238 Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data
239 Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
240 Description : Writes entry lines for gene_archive and peptide_archive.
241 Return type : none
242 Exceptions : none
243 Caller : dump_gene()
244 Status : At Risk
245 : under development
246
247 =cut
248
249 sub dump_tuple {
250 my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_;
251
252 # private method, so no argument check done for performance reasons
253
254 # gene archive
255 print $ga_fh join("\t",
256 $gene->stable_id,
257 $gene->version,
258 $tr->stable_id,
259 $tr->version,
260 $tl->stable_id,
261 $tl->version,
262 $pa_id,
263 $self->mapping_session_id
264 );
265 print $ga_fh "\n";
266
267 # peptide archive
268 my $pep_seq = $tl->seq;
269 print $pa_fh join("\t", $pa_id, md5_hex($pep_seq), $pep_seq);
270 print $pa_fh "\n";
271
272 # increment peptide_archive_id
273 $pa_id++;
274 }
275
276
277 =head2 dump_nc_row
278
279 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
280 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
281 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
282 Example : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh);
283 Description : Writes an entry line for gene_archive for non-coding
284 transcripts.
285 Return type : none
286 Exceptions : none
287 Caller : dump_gene()
288 Status : At Risk
289 : under development
290
291 =cut
292
293 sub dump_nc_row {
294 my ($self, $gene, $tr, $ga_fh) = @_;
295
296 # private method, so no argument check done for performance reasons
297
298 # gene archive
299 print $ga_fh join("\t",
300 $gene->stable_id,
301 $gene->version,
302 $tr->stable_id,
303 $tr->version,
304 '\N',
305 '\N',
306 '\N',
307 $self->mapping_session_id
308 );
309 print $ga_fh "\n";
310 }
311
312
313 =head2 mapping_session_id
314
315 Arg[1] : (optional) Int - mapping_session_id to set
316 Example : my $msi = $archiver->mapping_session_id;
317 Description : Getter/setter for mapping_session_id.
318 Return type : Int
319 Exceptions : none
320 Caller : create_archive()
321 Status : At Risk
322 : under development
323
324 =cut
325
326 sub mapping_session_id {
327 my $self = shift;
328 $self->{'_mapping_session_id'} = shift if (@_);
329 return $self->{'_mapping_session_id'};
330 }
331
332
333 1;
334