Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/Archiver.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new( | |
28 -LOGGER => $logger, | |
29 -CONF => $conf, | |
30 -CACHE => $cache | |
31 ); | |
32 | |
33 # create gene and peptide archive | |
34 $archiver->create_archive($mapping_session_id); | |
35 | |
36 # dump existing archive tables to file | |
37 my $num_entries = | |
38 $archiver->dump_table_to_file( 'source', 'gene_archive', | |
39 'gene_archive_existing.txt', 1 ); | |
40 | |
41 =head1 DESCRIPTION | |
42 | |
43 This module creates the gene_archive and peptide_archive | |
44 tables. Data is written to a file as tab-delimited text for | |
45 loading into a MySQL database (this can be done manually, or using | |
46 StableIdmapper->upload_file_into_table()). | |
47 | |
48 An archive entry for a given source gene is created if no target | |
49 gene exists, or if any of its transcripts or their translations | |
50 changed. Non-coding transcripts only have an entry in gene_archive (i.e. | |
51 without a corresponding peptide_archive entry). | |
52 | |
53 =head1 METHODS | |
54 | |
55 create_archive | |
56 dump_gene | |
57 dump_tuple | |
58 dump_nc_row | |
59 mapping_session_id | |
60 | |
61 =cut | |
62 | |
63 | |
64 package Bio::EnsEMBL::IdMapping::Archiver; | |
65 | |
66 use strict; | |
67 use warnings; | |
68 no warnings 'uninitialized'; | |
69 | |
70 use Bio::EnsEMBL::IdMapping::BaseObject; | |
71 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); | |
72 | |
73 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
74 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); | |
75 use Digest::MD5 qw(md5_hex); | |
76 | |
77 | |
78 # instance variables | |
79 my $pa_id; | |
80 | |
81 | |
82 =head2 create_archive | |
83 | |
84 Arg[1] : Int $mapping_session_id - the mapping_session_id for this run | |
85 Example : $archiver->create_archive($stable_id_mapper->mapping_session_id); | |
86 Description : Creates the gene_archive and peptide_archive tables and writes | |
87 the data to a tab-delimited file. The decision as to what to | |
88 archive is deferred to dump_gene(), see documentation there for | |
89 details. | |
90 Return type : none | |
91 Exceptions : Thrown on missing argument. | |
92 Caller : id_mapping.pl | |
93 Status : At Risk | |
94 : under development | |
95 | |
96 =cut | |
97 | |
98 sub create_archive { | |
99 my $self = shift; | |
100 my $mapping_session_id = shift; | |
101 | |
102 # argument check | |
103 unless ($mapping_session_id) { | |
104 $self->logger->warning("No mapping_session_id set."); | |
105 } | |
106 | |
107 $self->mapping_session_id($mapping_session_id); | |
108 | |
109 # get filehandles to write gene and peptide archive | |
110 my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables'); | |
111 my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables'); | |
112 | |
113 # get the currently highest peptide_archive_id from the source db | |
114 my $s_dba = $self->cache->get_DBAdaptor('source'); | |
115 my $s_dbh = $s_dba->dbc->db_handle; | |
116 my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive); | |
117 $pa_id = $self->fetch_value_from_db($s_dbh, $sql); | |
118 | |
119 unless ($pa_id) { | |
120 $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1); | |
121 $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1); | |
122 } | |
123 | |
124 $pa_id++; | |
125 $self->logger->debug("Starting with peptide_archive_id $pa_id.\n"); | |
126 | |
127 # lookup hash of target gene stable IDs | |
128 my %target_genes = map { $_->stable_id => $_ } | |
129 values %{ $self->cache->get_by_name("genes_by_id", 'target') }; | |
130 | |
131 # loop over source genes and dump to archive (dump_gene() will decide whether | |
132 # to do full or partial dump) | |
133 foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id", | |
134 'source') }) { | |
135 | |
136 $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id}, | |
137 $ga_fh, $pa_fh); | |
138 } | |
139 | |
140 close($ga_fh); | |
141 close($pa_fh); | |
142 } | |
143 | |
144 | |
145 =head2 dump_gene | |
146 | |
147 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene | |
148 Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene | |
149 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data | |
150 Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data | |
151 Example : my $target_gene = $gene_mappings{$source_gene->stable_id}; | |
152 $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh); | |
153 Description : Given a source gene, it will write a gene_achive and | |
154 peptide_achive entry for it if no target gene exists, or if any | |
155 of its transcripts or their translation changed. | |
156 Return type : none | |
157 Exceptions : none | |
158 Caller : create_archive() | |
159 Status : At Risk | |
160 : under development | |
161 | |
162 =cut | |
163 | |
164 sub dump_gene { | |
165 my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_; | |
166 | |
167 # private method, so no argument check done for performance reasons | |
168 | |
169 # deal with ncRNA differently | |
170 # hope this simple biotype regex is accurate enough... | |
171 my $is_ncRNA = 0; | |
172 $is_ncRNA = 1 if ($s_gene->biotype =~ /RNA/); | |
173 | |
174 # loop over source transcripts | |
175 foreach my $s_tr (@{ $s_gene->get_all_Transcripts }) { | |
176 my $s_tl = $s_tr->translation; | |
177 | |
178 # we got a coding transcript | |
179 if ($s_tl) { | |
180 | |
181 # do a full dump of this gene if no target gene exists | |
182 if (! $t_gene) { | |
183 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); | |
184 | |
185 # otherwise, only dump if any transcript or its translation changed | |
186 } else { | |
187 | |
188 my $changed_flag = 1; | |
189 | |
190 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) { | |
191 my $t_tl = $t_tr->translation; | |
192 next unless ($t_tl); | |
193 | |
194 if (($s_tr->stable_id eq $t_tr->stable_id) and | |
195 ($s_tl->stable_id eq $t_tl->stable_id) and | |
196 ($s_tl->seq eq $t_tl->seq)) { | |
197 $changed_flag = 0; | |
198 } | |
199 } | |
200 | |
201 if ($changed_flag) { | |
202 $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); | |
203 } | |
204 } | |
205 | |
206 # now deal with ncRNAs (they don't translate but we still want to archive | |
207 # them) | |
208 } elsif ($is_ncRNA) { | |
209 | |
210 if (! $t_gene) { | |
211 | |
212 $self->dump_nc_row($s_gene, $s_tr, $ga_fh); | |
213 | |
214 } else { | |
215 | |
216 my $changed_flag = 1; | |
217 | |
218 foreach my $t_tr (@{ $t_gene->get_all_Transcripts }) { | |
219 $changed_flag = 0 if ($s_tr->stable_id eq $t_tr->stable_id); | |
220 } | |
221 | |
222 if ($changed_flag) { | |
223 $self->dump_nc_row($s_gene, $s_tr, $ga_fh); | |
224 } | |
225 | |
226 } | |
227 } | |
228 } | |
229 } | |
230 | |
231 | |
232 =head2 dump_tuple | |
233 | |
234 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive | |
235 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript | |
236 Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation | |
237 Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data | |
238 Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data | |
239 Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh); | |
240 Description : Writes entry lines for gene_archive and peptide_archive. | |
241 Return type : none | |
242 Exceptions : none | |
243 Caller : dump_gene() | |
244 Status : At Risk | |
245 : under development | |
246 | |
247 =cut | |
248 | |
249 sub dump_tuple { | |
250 my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_; | |
251 | |
252 # private method, so no argument check done for performance reasons | |
253 | |
254 # gene archive | |
255 print $ga_fh join("\t", | |
256 $gene->stable_id, | |
257 $gene->version, | |
258 $tr->stable_id, | |
259 $tr->version, | |
260 $tl->stable_id, | |
261 $tl->version, | |
262 $pa_id, | |
263 $self->mapping_session_id | |
264 ); | |
265 print $ga_fh "\n"; | |
266 | |
267 # peptide archive | |
268 my $pep_seq = $tl->seq; | |
269 print $pa_fh join("\t", $pa_id, md5_hex($pep_seq), $pep_seq); | |
270 print $pa_fh "\n"; | |
271 | |
272 # increment peptide_archive_id | |
273 $pa_id++; | |
274 } | |
275 | |
276 | |
277 =head2 dump_nc_row | |
278 | |
279 Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive | |
280 Arg[2] : Bio::EnsEMBL::IdMapping::TinyTrancript $tr - its transcript | |
281 Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data | |
282 Example : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh); | |
283 Description : Writes an entry line for gene_archive for non-coding | |
284 transcripts. | |
285 Return type : none | |
286 Exceptions : none | |
287 Caller : dump_gene() | |
288 Status : At Risk | |
289 : under development | |
290 | |
291 =cut | |
292 | |
293 sub dump_nc_row { | |
294 my ($self, $gene, $tr, $ga_fh) = @_; | |
295 | |
296 # private method, so no argument check done for performance reasons | |
297 | |
298 # gene archive | |
299 print $ga_fh join("\t", | |
300 $gene->stable_id, | |
301 $gene->version, | |
302 $tr->stable_id, | |
303 $tr->version, | |
304 '\N', | |
305 '\N', | |
306 '\N', | |
307 $self->mapping_session_id | |
308 ); | |
309 print $ga_fh "\n"; | |
310 } | |
311 | |
312 | |
313 =head2 mapping_session_id | |
314 | |
315 Arg[1] : (optional) Int - mapping_session_id to set | |
316 Example : my $msi = $archiver->mapping_session_id; | |
317 Description : Getter/setter for mapping_session_id. | |
318 Return type : Int | |
319 Exceptions : none | |
320 Caller : create_archive() | |
321 Status : At Risk | |
322 : under development | |
323 | |
324 =cut | |
325 | |
326 sub mapping_session_id { | |
327 my $self = shift; | |
328 $self->{'_mapping_session_id'} = shift if (@_); | |
329 return $self->{'_mapping_session_id'}; | |
330 } | |
331 | |
332 | |
333 1; | |
334 |