0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
|
|
28 -LOGGER => $logger,
|
|
29 -CONF => $conf,
|
|
30 -CACHE => $cache
|
|
31 );
|
|
32
|
|
33 # create gene and peptide archive
|
|
34 $archiver->create_archive($mapping_session_id);
|
|
35
|
|
36 # dump existing archive tables to file
|
|
37 my $num_entries =
|
|
38 $archiver->dump_table_to_file( 'source', 'gene_archive',
|
|
39 'gene_archive_existing.txt', 1 );
|
|
40
|
|
41 =head1 DESCRIPTION
|
|
42
|
|
43 This module creates the gene_archive and peptide_archive
|
|
44 tables. Data is written to a file as tab-delimited text for
|
|
45 loading into a MySQL database (this can be done manually, or using
|
|
46 StableIdmapper->upload_file_into_table()).
|
|
47
|
|
48 An archive entry for a given source gene is created if no target
|
|
49 gene exists, or if any of its transcripts or their translations
|
|
50 changed. Non-coding transcripts only have an entry in gene_archive (i.e.
|
|
51 without a corresponding peptide_archive entry).
|
|
52
|
|
53 =head1 METHODS
|
|
54
|
|
55 create_archive
|
|
56 dump_gene
|
|
57 dump_tuple
|
|
58 dump_nc_row
|
|
59 mapping_session_id
|
|
60
|
|
61 =cut
|
|
62
|
|
63
|
|
64 package Bio::EnsEMBL::IdMapping::Archiver;
|
|
65
|
|
66 use strict;
|
|
67 use warnings;
|
|
68 no warnings 'uninitialized';
|
|
69
|
|
70 use Bio::EnsEMBL::IdMapping::BaseObject;
|
|
71 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
|
|
72
|
|
73 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
74 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
|
|
75 use Digest::MD5 qw(md5_hex);
|
|
76
|
|
77
|
|
78 # instance variables
|
|
79 my $pa_id;
|
|
80
|
|
81
|
|
82 =head2 create_archive
|
|
83
|
|
84 Arg[1] : Int $mapping_session_id - the mapping_session_id for this run
|
|
85 Example : $archiver->create_archive($stable_id_mapper->mapping_session_id);
|
|
86 Description : Creates the gene_archive and peptide_archive tables and writes
|
|
87 the data to a tab-delimited file. The decision as to what to
|
|
88 archive is deferred to dump_gene(), see documentation there for
|
|
89 details.
|
|
90 Return type : none
|
|
91 Exceptions : Thrown on missing argument.
|
|
92 Caller : id_mapping.pl
|
|
93 Status : At Risk
|
|
94 : under development
|
|
95
|
|
96 =cut
|
|
97
|
|
98 sub create_archive {
|
|
99 my $self = shift;
|
|
100 my $mapping_session_id = shift;
|
|
101
|
|
102 # argument check
|
|
103 unless ($mapping_session_id) {
|
|
104 $self->logger->warning("No mapping_session_id set.");
|
|
105 }
|
|
106
|
|
107 $self->mapping_session_id($mapping_session_id);
|
|
108
|
|
109 # get filehandles to write gene and peptide archive
|
|
110 my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables');
|
|
111 my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables');
|
|
112
|
|
113 # get the currently highest peptide_archive_id from the source db
|
|
114 my $s_dba = $self->cache->get_DBAdaptor('source');
|
|
115 my $s_dbh = $s_dba->dbc->db_handle;
|
|
116 my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive);
|
|
117 $pa_id = $self->fetch_value_from_db($s_dbh, $sql);
|
|
118
|
|
119 unless ($pa_id) {
|
|
120 $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1);
|
|
121 $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1);
|
|
122 }
|
|
123
|
|
124 $pa_id++;
|
|
125 $self->logger->debug("Starting with peptide_archive_id $pa_id.\n");
|
|
126
|
|
127 # lookup hash of target gene stable IDs
|
|
128 my %target_genes = map { $_->stable_id => $_ }
|
|
129 values %{ $self->cache->get_by_name("genes_by_id", 'target') };
|
|
130
|
|
131 # loop over source genes and dump to archive (dump_gene() will decide whether
|
|
132 # to do full or partial dump)
|
|
133 foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id",
|
|
134 'source') }) {
|
|
135
|
|
136 $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id},
|
|
137 $ga_fh, $pa_fh);
|
|
138 }
|
|
139
|
|
140 close($ga_fh);
|
|
141 close($pa_fh);
|
|
142 }
|
|
143
|
|
144
|
|
145 =head2 dump_gene
|
|
146
|
|
147 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene
|
|
148 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene
|
|
149 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
|
|
150 Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data
|
|
151 Example : my $target_gene = $gene_mappings{$source_gene->stable_id};
|
|
152 $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh);
|
|
153 Description : Given a source gene, it will write a gene_achive and
|
|
154 peptide_achive entry for it if no target gene exists, or if any
|
|
155 of its transcripts or their translation changed.
|
|
156 Return type : none
|
|
157 Exceptions : none
|
|
158 Caller : create_archive()
|
|
159 Status : At Risk
|
|
160 : under development
|
|
161
|
|
162 =cut
|
|
163
|
|
164 sub dump_gene {
|
|
165 my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_;
|
|
166
|
|
167 # private method, so no argument check done for performance reasons
|
|
168
|
|
169 # deal with ncRNA differently
|
|
170 # hope this simple biotype regex is accurate enough...
|
|
171 my $is_ncRNA = 0;
|
|
172 $is_ncRNA = 1 if ($s_gene->biotype =~ /RNA/);
|
|
173
|
|
174 # loop over source transcripts
|
|
175 foreach my $s_tr (@{ $s_gene->get_all_Transcripts }) {
|
|
176 my $s_tl = $s_tr->translation;
|
|
177
|
|
178 # we got a coding transcript
|
|
179 if ($s_tl) {
|
|
180
|
|
181 # do a full dump of this gene if no target gene exists
|
|
182 if (! $t_gene) {
|
|
183 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
|
|
184
|
|
185 # otherwise, only dump if any transcript or its translation changed
|
|
186 } else {
|
|
187
|
|
188 my $changed_flag = 1;
|
|
189
|
|
190 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
|
|
191 my $t_tl = $t_tr->translation;
|
|
192 next unless ($t_tl);
|
|
193
|
|
194 if (($s_tr->stable_id eq $t_tr->stable_id) and
|
|
195 ($s_tl->stable_id eq $t_tl->stable_id) and
|
|
196 ($s_tl->seq eq $t_tl->seq)) {
|
|
197 $changed_flag = 0;
|
|
198 }
|
|
199 }
|
|
200
|
|
201 if ($changed_flag) {
|
|
202 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
|
|
203 }
|
|
204 }
|
|
205
|
|
206 # now deal with ncRNAs (they don't translate but we still want to archive
|
|
207 # them)
|
|
208 } elsif ($is_ncRNA) {
|
|
209
|
|
210 if (! $t_gene) {
|
|
211
|
|
212 $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
|
|
213
|
|
214 } else {
|
|
215
|
|
216 my $changed_flag = 1;
|
|
217
|
|
218 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) {
|
|
219 $changed_flag = 0 if ($s_tr->stable_id eq $t_tr->stable_id);
|
|
220 }
|
|
221
|
|
222 if ($changed_flag) {
|
|
223 $self->dump_nc_row($s_gene, $s_tr, $ga_fh);
|
|
224 }
|
|
225
|
|
226 }
|
|
227 }
|
|
228 }
|
|
229 }
|
|
230
|
|
231
|
|
232 =head2 dump_tuple
|
|
233
|
|
234 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
|
|
235 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
|
|
236 Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation
|
|
237 Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data
|
|
238 Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data
|
|
239 Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
|
|
240 Description : Writes entry lines for gene_archive and peptide_archive.
|
|
241 Return type : none
|
|
242 Exceptions : none
|
|
243 Caller : dump_gene()
|
|
244 Status : At Risk
|
|
245 : under development
|
|
246
|
|
247 =cut
|
|
248
|
|
249 sub dump_tuple {
|
|
250 my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_;
|
|
251
|
|
252 # private method, so no argument check done for performance reasons
|
|
253
|
|
254 # gene archive
|
|
255 print $ga_fh join("\t",
|
|
256 $gene->stable_id,
|
|
257 $gene->version,
|
|
258 $tr->stable_id,
|
|
259 $tr->version,
|
|
260 $tl->stable_id,
|
|
261 $tl->version,
|
|
262 $pa_id,
|
|
263 $self->mapping_session_id
|
|
264 );
|
|
265 print $ga_fh "\n";
|
|
266
|
|
267 # peptide archive
|
|
268 my $pep_seq = $tl->seq;
|
|
269 print $pa_fh join("\t", $pa_id, md5_hex($pep_seq), $pep_seq);
|
|
270 print $pa_fh "\n";
|
|
271
|
|
272 # increment peptide_archive_id
|
|
273 $pa_id++;
|
|
274 }
|
|
275
|
|
276
|
|
277 =head2 dump_nc_row
|
|
278
|
|
279 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
|
|
280 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript
|
|
281 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
|
|
282 Example : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh);
|
|
283 Description : Writes an entry line for gene_archive for non-coding
|
|
284 transcripts.
|
|
285 Return type : none
|
|
286 Exceptions : none
|
|
287 Caller : dump_gene()
|
|
288 Status : At Risk
|
|
289 : under development
|
|
290
|
|
291 =cut
|
|
292
|
|
293 sub dump_nc_row {
|
|
294 my ($self, $gene, $tr, $ga_fh) = @_;
|
|
295
|
|
296 # private method, so no argument check done for performance reasons
|
|
297
|
|
298 # gene archive
|
|
299 print $ga_fh join("\t",
|
|
300 $gene->stable_id,
|
|
301 $gene->version,
|
|
302 $tr->stable_id,
|
|
303 $tr->version,
|
|
304 '\N',
|
|
305 '\N',
|
|
306 '\N',
|
|
307 $self->mapping_session_id
|
|
308 );
|
|
309 print $ga_fh "\n";
|
|
310 }
|
|
311
|
|
312
|
|
313 =head2 mapping_session_id
|
|
314
|
|
315 Arg[1] : (optional) Int - mapping_session_id to set
|
|
316 Example : my $msi = $archiver->mapping_session_id;
|
|
317 Description : Getter/setter for mapping_session_id.
|
|
318 Return type : Int
|
|
319 Exceptions : none
|
|
320 Caller : create_archive()
|
|
321 Status : At Risk
|
|
322 : under development
|
|
323
|
|
324 =cut
|
|
325
|
|
326 sub mapping_session_id {
|
|
327 my $self = shift;
|
|
328 $self->{'_mapping_session_id'} = shift if (@_);
|
|
329 return $self->{'_mapping_session_id'};
|
|
330 }
|
|
331
|
|
332
|
|
333 1;
|
|
334
|