0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 =head1 SYNOPSIS
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 =head1 METHODS
|
|
28
|
|
29 =cut
|
|
30
|
|
31 package Bio::EnsEMBL::Utils::VegaCuration::Transcript;
|
|
32
|
|
33 use strict;
|
|
34 use warnings;
|
|
35 no warnings 'uninitialized';
|
|
36 use vars qw(@ISA);
|
|
37
|
|
38 use Bio::EnsEMBL::Utils::VegaCuration::Gene;
|
|
39 use Data::Dumper;
|
|
40
|
|
41 @ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Gene);
|
|
42
|
|
43
|
|
44 =head2 find_non_overlaps
|
|
45
|
|
46 Args : arrayref of B::E::Transcripts
|
|
47 Example : find_non_overlaps($all_transcripts)
|
|
48 Description: identifies any non-overlapping transcripts
|
|
49 Returntype : array refs of stable IDs
|
|
50 Exceptions : none
|
|
51
|
|
52 =cut
|
|
53
|
|
54 sub find_non_overlaps {
|
|
55 my $self = shift;
|
|
56 my ($all_transcripts) = @_;
|
|
57 my $non_overlaps = [];
|
|
58 foreach my $transcript1 (@{$all_transcripts}) {
|
|
59 foreach my $transcript2 (@{$all_transcripts}) {
|
|
60 if ($transcript1->end < $transcript2->start) {
|
|
61 push @{$non_overlaps}, $transcript1->stable_id;
|
|
62 push @{$non_overlaps}, $transcript2->stable_id;
|
|
63 }
|
|
64 }
|
|
65 }
|
|
66 return $non_overlaps;
|
|
67 }
|
|
68
|
|
69 =head2 check_remarks_and_update_names
|
|
70
|
|
71 Arg[1] : B::E::Gene (with potentially duplicated transcript names)
|
|
72 Arg[2] : counter 1 (no. of patched genes)
|
|
73 Arg[3] : counter 2 (no. of patched transcripts)
|
|
74 Example : $support->update_names($gene,\$c1,\$c2)
|
|
75 Description: - checks remarks and patches transcripts with identical names according to
|
|
76 CDS and length
|
|
77 Returntype : true | false (depending on whether patched or not), counter1, counter2
|
|
78
|
|
79 =cut
|
|
80
|
|
81 sub check_remarks_and_update_names {
|
|
82 my $self = shift;
|
|
83 my ($gene,$gene_c,$trans_c) = @_;
|
|
84 my $action = ($self->param('dry_run')) ? 'Would add' : 'Added';
|
|
85 my $aa = $gene->adaptor->db->get_AttributeAdaptor;
|
|
86 my $dbh = $gene->adaptor->db->dbc->db_handle;
|
|
87
|
|
88 #get list of IDs that have previously been sent to annotators
|
|
89 my $seen_genes = $self->get_havana_fragmented_loci_comments;
|
|
90
|
|
91 my $gsi = $gene->stable_id;
|
|
92 my $gid = $gene->dbID;
|
|
93 my $g_name;
|
|
94 my $study_more = 1;
|
|
95 eval {
|
|
96 $g_name = $gene->display_xref->display_id;
|
|
97 };
|
|
98 if ($@) {
|
|
99 $g_name = $gene->get_all_Attributes('name')->[0]->value;
|
|
100 }
|
|
101
|
|
102 #get existing gene remarks
|
|
103 my $remarks = [ map {$_->value} @{$gene->get_all_Attributes('remark')} ];
|
|
104
|
|
105 #shout if there is no remark to identify this as being fragmented
|
|
106 if ( grep {$_ eq 'fragmented locus' } @$remarks) {
|
|
107 $study_more = 0;
|
|
108 }
|
|
109 else {
|
|
110 $self->log_warning("Gene $gsi should have a fragmented locus remark\n");
|
|
111 }
|
|
112
|
|
113 ##patch transcript names according to length and CDS
|
|
114 $gene_c++;
|
|
115
|
|
116 #separate coding and non_coding transcripts
|
|
117 my $coding_trans = [];
|
|
118 my $noncoding_trans = [];
|
|
119 foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
|
|
120 if ($trans->translate) {
|
|
121 push @$coding_trans, $trans;
|
|
122 }
|
|
123 else {
|
|
124 push @$noncoding_trans, $trans;
|
|
125 }
|
|
126 }
|
|
127
|
|
128 #sort transcripts coding > non-coding, then on length
|
|
129 my $c = 0;
|
|
130 $self->log("\nPatching names according to CDS and length:\n",1);
|
|
131 foreach my $array_ref ($coding_trans,$noncoding_trans) {
|
|
132 foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
|
|
133 $trans_c++;
|
|
134 my $tsi = $trans->stable_id;
|
|
135 my $t_name;
|
|
136 eval {
|
|
137 $t_name = $trans->display_xref->display_id;
|
|
138 };
|
|
139 if ($@) {
|
|
140 $t_name = $trans->get_all_Attributes('name')->[0]->value;
|
|
141 }
|
|
142 $c++;
|
|
143 my $ext = sprintf("%03d", $c);
|
|
144 my $new_name = $g_name.'-'.$ext;
|
|
145 $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1);
|
|
146 if (! $self->param('dry_run')) {
|
|
147
|
|
148 # update transcript display xref
|
|
149 $dbh->do(qq(UPDATE xref x, external_db edb
|
|
150 SET x.display_label = "$new_name"
|
|
151 WHERE x.external_db_id = edb.external_db_id
|
|
152 AND x.dbprimary_acc = "$tsi"
|
|
153 AND edb.db_name = "Vega_transcript"));
|
|
154 }
|
|
155 }
|
|
156 }
|
|
157 return ($study_more,$gene_c,$trans_c);
|
|
158 }
|
|
159
|
|
160 =head2 check_names_and_overlap
|
|
161
|
|
162 Arg[1] : arayref of arrayrefs of duplicated names
|
|
163 Arg[2] : B::E::Gene (with potentially duplicated transcript names)
|
|
164 Arg[3] : FH (to log new duplicates)
|
|
165 Example : $support->check_names_and_overlap($transcripts,$gene,$fh)
|
|
166 Description: checks pairs of transcripts identified as having duplicate Vega names:
|
|
167 - to see if they have identical names in loutre (shouldn't have)
|
|
168 - distinguish between overlapping and non overlapping transcripts
|
|
169 Returntype : none
|
|
170
|
|
171 =cut
|
|
172
|
|
173 sub check_names_and_overlap {
|
|
174 my $self = shift;
|
|
175 my ($transcript_info,$gene,$n_flist_fh) = @_;
|
|
176 my $ta = $gene->adaptor->db->get_TranscriptAdaptor;
|
|
177 my $gsi = $gene->stable_id;
|
|
178 my $g_name = $gene->get_all_Attributes('name')->[0]->value;
|
|
179 foreach my $set (values %{$transcript_info} ) {
|
|
180 next if (scalar @{$set} == 1);
|
|
181 my $transcripts = [];
|
|
182 my $all_t_names;
|
|
183 my %ids_to_names;
|
|
184 foreach my $id1 (@{$set}) {
|
|
185 my ($name1,$tsi1) = split /\|/, $id1;
|
|
186 $ids_to_names{$tsi1} = $name1;
|
|
187 $all_t_names .= "$tsi1 [$name1] ";
|
|
188 my $t = $ta->fetch_by_stable_id($tsi1);
|
|
189 push @{$transcripts}, $t;
|
|
190 }
|
|
191
|
|
192 my $non_overlaps;
|
|
193 eval {
|
|
194 $non_overlaps = $self->find_non_overlaps($transcripts);
|
|
195 };
|
|
196 if ($@) {
|
|
197 $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
|
|
198 }
|
|
199
|
|
200 #if the transcripts don't overlap
|
|
201 elsif (@{$non_overlaps}) {
|
|
202 my $tsi_string;
|
|
203 foreach my $id (@{$non_overlaps}) {
|
|
204 my $string = " $id [ $ids_to_names{$id} ] ";
|
|
205 $tsi_string .= $string;
|
|
206 }
|
|
207
|
|
208 $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
|
|
209 #log gsi (to be sent to Havana)
|
|
210 print $n_flist_fh "$gsi\n";
|
|
211 }
|
|
212 #...otherwise if the transcripts do overlap
|
|
213 else {
|
|
214 $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
|
|
215 print $n_flist_fh "$gsi\n";
|
|
216 }
|
|
217 }
|
|
218 }
|
|
219
|
|
220 =head2 get_havana_fragmented_loci_comments
|
|
221
|
|
222 Args : none
|
|
223 Example : my $results = $support->get_havana_fragmented_loci_comments
|
|
224 Description: parses the HEREDOC containing Havana comments in this module
|
|
225 Returntype : hashref
|
|
226
|
|
227 =cut
|
|
228
|
|
229 sub get_havana_fragmented_loci_comments {
|
|
230 my $seen_genes;
|
|
231 while (<DATA>) {
|
|
232 next if /^\s+$/ or /#+/;
|
|
233 my ($obj,$comment) = split /=/;
|
|
234 $obj =~ s/^\s+|\s+$//g;
|
|
235 $comment =~ s/^\s+|\s+$//g;
|
|
236 $seen_genes->{$obj} = $comment;
|
|
237 }
|
|
238 return $seen_genes;
|
|
239 }
|
|
240
|
|
241
|
|
242
|
|
243 #details of genes with duplicated transcript names that have already been reported to Havana
|
|
244 #identified as either fragmented or as being OK to patch
|
|
245 __DATA__
|
|
246
|
|
247 OTTMUSG00000005478 = fragmented
|
|
248 OTTMUSG00000001936 = fragmented
|
|
249 OTTMUSG00000017081 = fragmented
|
|
250 OTTMUSG00000011441 = fragmented
|
|
251 OTTMUSG00000013335 = fragmented
|
|
252 OTTMUSG00000011654 = fragmented
|
|
253 OTTMUSG00000001835 = fragmented
|
|
254 OTTHUMG00000035221 = fragmented
|
|
255 OTTHUMG00000037378 = fragmented
|
|
256 OTTHUMG00000060732 = fragmented
|
|
257 OTTHUMG00000132441 = fragmented
|
|
258 OTTHUMG00000031383 = fragmented
|
|
259 OTTHUMG00000012716 = fragmented
|
|
260 OTTHUMG00000031102 = fragmented
|
|
261 OTTHUMG00000148816 = fragmented
|
|
262 OTTHUMG00000149059 = fragmented
|
|
263 OTTHUMG00000149221 = fragmented
|
|
264 OTTHUMG00000149326 = fragmented
|
|
265 OTTHUMG00000149644 = fragmented
|
|
266 OTTHUMG00000149574 = fragmented
|
|
267 OTTHUMG00000058101 = fragmented
|
|
268
|
|
269 OTTHUMG00000150119 = OK
|
|
270 OTTHUMG00000149850 = OK
|
|
271 OTTHUMG00000058101 = OK
|
|
272 OTTHUMG00000058907 = OK
|
|
273
|
|
274 OTTMUSG00000011654 = fragmented
|
|
275 OTTMUSG00000019369 = fragmented
|
|
276 OTTMUSG00000017081 = fragmented
|
|
277 OTTMUSG00000001835 = fragmented
|
|
278 OTTMUSG00000011499 = fragmented
|
|
279 OTTMUSG00000013335 = fragmented
|
|
280 OTTMUSG00000008023 = fragmented
|
|
281 OTTMUSG00000019369 = fragmented
|
|
282
|
|
283
|
|
284 OTTMUSG00000022266
|
|
285 OTTMUSG00000006697
|
|
286
|
|
287
|
|
288
|
|
289
|
|
290
|
|
291 OTTMUSG00000012302 =
|
|
292 OTTMUSG00000013368 =
|
|
293 OTTMUSG00000015766 =
|
|
294 OTTMUSG00000016025 =
|
|
295 OTTMUSG00000001066 =
|
|
296 OTTMUSG00000016331 =
|
|
297 OTTMUSG00000006935 =
|
|
298 OTTMUSG00000007263 =
|
|
299 OTTMUSG00000000304 =
|
|
300 OTTMUSG00000009150 =
|
|
301 OTTMUSG00000008023 =
|
|
302 OTTMUSG00000017077 =
|
|
303 OTTMUSG00000003440 =
|
|
304 OTTMUSG00000016310 =
|
|
305 OTTMUSG00000026199 =
|
|
306 OTTMUSG00000028423 =
|
|
307 OTTMUSG00000007427 =
|