comparison variant_effect_predictor/Bio/EnsEMBL/Utils/VegaCuration/Transcript.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 =head1 SYNOPSIS
24
25 =head1 DESCRIPTION
26
27 =head1 METHODS
28
29 =cut
30
31 package Bio::EnsEMBL::Utils::VegaCuration::Transcript;
32
33 use strict;
34 use warnings;
35 no warnings 'uninitialized';
36 use vars qw(@ISA);
37
38 use Bio::EnsEMBL::Utils::VegaCuration::Gene;
39 use Data::Dumper;
40
41 @ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene);
42
43
44 =head2 find_non_overlaps
45
46 Args : arrayref of B::E::Transcripts
47 Example : find_non_overlaps($all_transcripts)
48 Description: identifies any non-overlapping transcripts
49 Returntype : array refs of stable IDs
50 Exceptions : none
51
52 =cut
53
54 sub find_non_overlaps {
55 my $self = shift;
56 my ($all_transcripts) = @_;
57 my $non_overlaps = [];
58 foreach my $transcript1 (@{$all_transcripts}) {
59 foreach my $transcript2 (@{$all_transcripts}) {
60 if ($transcript1->end < $transcript2->start) {
61 push @{$non_overlaps}, $transcript1->stable_id;
62 push @{$non_overlaps}, $transcript2->stable_id;
63 }
64 }
65 }
66 return $non_overlaps;
67 }
68
69 =head2 check_remarks_and_update_names
70
71 Arg[1] : B::E::Gene (with potentially duplicated transcript names)
72 Arg[2] : counter 1 (no. of patched genes)
73 Arg[3] : counter 2 (no. of patched transcripts)
74 Example : $support->update_names($gene,\$c1,\$c2)
75 Description: - checks remarks and patches transcripts with identical names according to
76 CDS and length
77 Returntype : true | false (depending on whether patched or not), counter1, counter2
78
79 =cut
80
81 sub check_remarks_and_update_names {
82 my $self = shift;
83 my ($gene,$gene_c,$trans_c) = @_;
84 my $action = ($self->param('dry_run')) ? 'Would add' : 'Added';
85 my $aa = $gene->adaptor->db->get_AttributeAdaptor;
86 my $dbh = $gene->adaptor->db->dbc->db_handle;
87
88 #get list of IDs that have previously been sent to annotators
89 my $seen_genes = $self->get_havana_fragmented_loci_comments;
90
91 my $gsi = $gene->stable_id;
92 my $gid = $gene->dbID;
93 my $g_name;
94 my $study_more = 1;
95 eval {
96 $g_name = $gene->display_xref->display_id;
97 };
98 if ($@) {
99 $g_name = $gene->get_all_Attributes('name')->[0]->value;
100 }
101
102 #get existing gene remarks
103 my $remarks = [ map {$_->value} @{$gene->get_all_Attributes('remark')} ];
104
105 #shout if there is no remark to identify this as being fragmented
106 if ( grep {$_ eq 'fragmented locus' } @$remarks) {
107 $study_more = 0;
108 }
109 else {
110 $self->log_warning("Gene $gsi should have a fragmented locus remark\n");
111 }
112
113 ##patch transcript names according to length and CDS
114 $gene_c++;
115
116 #separate coding and non_coding transcripts
117 my $coding_trans = [];
118 my $noncoding_trans = [];
119 foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
120 if ($trans->translate) {
121 push @$coding_trans, $trans;
122 }
123 else {
124 push @$noncoding_trans, $trans;
125 }
126 }
127
128 #sort transcripts coding > non-coding, then on length
129 my $c = 0;
130 $self->log("\nPatching names according to CDS and length:\n",1);
131 foreach my $array_ref ($coding_trans,$noncoding_trans) {
132 foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
133 $trans_c++;
134 my $tsi = $trans->stable_id;
135 my $t_name;
136 eval {
137 $t_name = $trans->display_xref->display_id;
138 };
139 if ($@) {
140 $t_name = $trans->get_all_Attributes('name')->[0]->value;
141 }
142 $c++;
143 my $ext = sprintf("%03d", $c);
144 my $new_name = $g_name.'-'.$ext;
145 $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1);
146 if (! $self->param('dry_run')) {
147
148 # update transcript display xref
149 $dbh->do(qq(UPDATE xref x, external_db edb
150 SET x.display_label = "$new_name"
151 WHERE x.external_db_id = edb.external_db_id
152 AND x.dbprimary_acc = "$tsi"
153 AND edb.db_name = "Vega_transcript"));
154 }
155 }
156 }
157 return ($study_more,$gene_c,$trans_c);
158 }
159
160 =head2 check_names_and_overlap
161
162 Arg[1] : arayref of arrayrefs of duplicated names
163 Arg[2] : B::E::Gene (with potentially duplicated transcript names)
164 Arg[3] : FH (to log new duplicates)
165 Example : $support->check_names_and_overlap($transcripts,$gene,$fh)
166 Description: checks pairs of transcripts identified as having duplicate Vega names:
167 - to see if they have identical names in loutre (shouldn't have)
168 - distinguish between overlapping and non overlapping transcripts
169 Returntype : none
170
171 =cut
172
173 sub check_names_and_overlap {
174 my $self = shift;
175 my ($transcript_info,$gene,$n_flist_fh) = @_;
176 my $ta = $gene->adaptor->db->get_TranscriptAdaptor;
177 my $gsi = $gene->stable_id;
178 my $g_name = $gene->get_all_Attributes('name')->[0]->value;
179 foreach my $set (values %{$transcript_info} ) {
180 next if (scalar @{$set} == 1);
181 my $transcripts = [];
182 my $all_t_names;
183 my %ids_to_names;
184 foreach my $id1 (@{$set}) {
185 my ($name1,$tsi1) = split /\|/, $id1;
186 $ids_to_names{$tsi1} = $name1;
187 $all_t_names .= "$tsi1 [$name1] ";
188 my $t = $ta->fetch_by_stable_id($tsi1);
189 push @{$transcripts}, $t;
190 }
191
192 my $non_overlaps;
193 eval {
194 $non_overlaps = $self->find_non_overlaps($transcripts);
195 };
196 if ($@) {
197 $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
198 }
199
200 #if the transcripts don't overlap
201 elsif (@{$non_overlaps}) {
202 my $tsi_string;
203 foreach my $id (@{$non_overlaps}) {
204 my $string = " $id [ $ids_to_names{$id} ] ";
205 $tsi_string .= $string;
206 }
207
208 $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
209 #log gsi (to be sent to Havana)
210 print $n_flist_fh "$gsi\n";
211 }
212 #...otherwise if the transcripts do overlap
213 else {
214 $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
215 print $n_flist_fh "$gsi\n";
216 }
217 }
218 }
219
220 =head2 get_havana_fragmented_loci_comments
221
222 Args : none
223 Example : my $results = $support->get_havana_fragmented_loci_comments
224 Description: parses the HEREDOC containing Havana comments in this module
225 Returntype : hashref
226
227 =cut
228
229 sub get_havana_fragmented_loci_comments {
230 my $seen_genes;
231 while (<DATA>) {
232 next if /^\s+$/ or /#+/;
233 my ($obj,$comment) = split /=/;
234 $obj =~ s/^\s+|\s+$//g;
235 $comment =~ s/^\s+|\s+$//g;
236 $seen_genes->{$obj} = $comment;
237 }
238 return $seen_genes;
239 }
240
241
242
243 #details of genes with duplicated transcript names that have already been reported to Havana
244 #identified as either fragmented or as being OK to patch
245 __DATA__
246
247 OTTMUSG00000005478 = fragmented
248 OTTMUSG00000001936 = fragmented
249 OTTMUSG00000017081 = fragmented
250 OTTMUSG00000011441 = fragmented
251 OTTMUSG00000013335 = fragmented
252 OTTMUSG00000011654 = fragmented
253 OTTMUSG00000001835 = fragmented
254 OTTHUMG00000035221 = fragmented
255 OTTHUMG00000037378 = fragmented
256 OTTHUMG00000060732 = fragmented
257 OTTHUMG00000132441 = fragmented
258 OTTHUMG00000031383 = fragmented
259 OTTHUMG00000012716 = fragmented
260 OTTHUMG00000031102 = fragmented
261 OTTHUMG00000148816 = fragmented
262 OTTHUMG00000149059 = fragmented
263 OTTHUMG00000149221 = fragmented
264 OTTHUMG00000149326 = fragmented
265 OTTHUMG00000149644 = fragmented
266 OTTHUMG00000149574 = fragmented
267 OTTHUMG00000058101 = fragmented
268
269 OTTHUMG00000150119 = OK
270 OTTHUMG00000149850 = OK
271 OTTHUMG00000058101 = OK
272 OTTHUMG00000058907 = OK
273
274 OTTMUSG00000011654 = fragmented
275 OTTMUSG00000019369 = fragmented
276 OTTMUSG00000017081 = fragmented
277 OTTMUSG00000001835 = fragmented
278 OTTMUSG00000011499 = fragmented
279 OTTMUSG00000013335 = fragmented
280 OTTMUSG00000008023 = fragmented
281 OTTMUSG00000019369 = fragmented
282
283
284 OTTMUSG00000022266
285 OTTMUSG00000006697
286
287
288
289
290
291 OTTMUSG00000012302 =
292 OTTMUSG00000013368 =
293 OTTMUSG00000015766 =
294 OTTMUSG00000016025 =
295 OTTMUSG00000001066 =
296 OTTMUSG00000016331 =
297 OTTMUSG00000006935 =
298 OTTMUSG00000007263 =
299 OTTMUSG00000000304 =
300 OTTMUSG00000009150 =
301 OTTMUSG00000008023 =
302 OTTMUSG00000017077 =
303 OTTMUSG00000003440 =
304 OTTMUSG00000016310 =
305 OTTMUSG00000026199 =
306 OTTMUSG00000028423 =
307 OTTMUSG00000007427 =