0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::TranscriptFactory - Module having the fset2transcript*
|
|
24 subroutines
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 use Bio::EnsEMBL::TranscriptFactory;
|
|
29
|
|
30 my $transcript = Bio::EnsEMBL::TranscriptFactory::fset2transcript($fset, $contig);
|
|
31
|
|
32 =head1 DESCRIPTION
|
|
33
|
|
34 Module containing the subroutines fset2transcript*,
|
|
35 which create transcripts from features (formerly housed in
|
|
36 Bio::EnsEMBL::DBSQL::Utils).
|
|
37
|
|
38 =head1 METHODS
|
|
39
|
|
40 =cut
|
|
41
|
|
42 package Bio::EnsEMBL::TranscriptFactory;
|
|
43
|
|
44 use strict;
|
|
45
|
|
46 use Bio::EnsEMBL::Exon;
|
|
47 use Bio::EnsEMBL::Translation;
|
|
48 use Bio::EnsEMBL::Transcript;
|
|
49
|
|
# fset2transcript
#
# Builds a Bio::EnsEMBL::Transcript (with a Translation attached) from a
# feature set whose sub-features already carry a phase: one exon is created
# per sub-feature.
#
#   Arg [1]    : $genscan - the feature set; expected to be a
#                Bio::EnsEMBL::SeqFeatureI. Must provide seqname(),
#                sub_SeqFeature() and throw().
#   Arg [2]    : $contig - object the exons live on. Must provide id() and
#                primary_seq().
#   Returntype : Bio::EnsEMBL::Transcript
#   Exceptions : calls $genscan->throw() when the feature set has no
#                sub-features. A wrong $genscan type only produces a warning.
sub fset2transcript {
    my ($genscan, $contig) = @_;

    # A type mismatch is reported but deliberately not fatal. The message is
    # sent to STDERR so that it cannot end up mixed into data on STDOUT.
    unless ($genscan->isa("Bio::EnsEMBL::SeqFeatureI")) {
        warn "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";
    }

    my $transcript = Bio::EnsEMBL::Transcript->new;
    $transcript->temporary_id($contig->id . "." . $genscan->seqname);

    # Build one fully populated exon per sub-feature. Exons are attached to
    # the transcript only further down, once coordinates, strand and order
    # are known.
    my @exons;
    foreach my $f ($genscan->sub_SeqFeature) {
        my $exon = Bio::EnsEMBL::Exon->new;
        $exon->contig($contig);
        $exon->start($f->start);
        $exon->end($f->end);
        $exon->strand($f->strand);
        $exon->phase($f->phase);
        $exon->end_phase(($exon->phase + $exon->length) % 3);
        $exon->slice($contig->primary_seq);
        push @exons, $exon;
    }

    unless (@exons) {
        $genscan->throw("Got a 0 exon genscan");
    }

    # Order exons 5' -> 3': ascending start on the forward strand,
    # descending start otherwise. The strand of the first exon decides.
    if ($exons[0]->strand == 1) {
        @exons = sort { $a->start <=> $b->start } @exons;
    }
    else {
        @exons = sort { $b->start <=> $a->start } @exons;
    }

    # The translation spans the whole transcript: from the first base of the
    # first exon to the last base of the last exon.
    my $translation = Bio::EnsEMBL::Translation->new;
    $translation->start(1);
    $translation->end($exons[-1]->length);
    $translation->start_Exon($exons[0]);
    $translation->end_Exon($exons[-1]);

    # Chain the phases: every exon after the first starts in the phase the
    # previous exon ended in.
    # NOTE(review): end_phase is not recomputed after the phase is
    # overridden here, so it still reflects the phase taken from the
    # feature - confirm this is intended.
    my $endphase = $exons[0]->end_phase;
    foreach my $exon (@exons[1 .. $#exons]) {
        $exon->phase($endphase);
        $endphase = $exon->end_phase;
    }

    # Attach the exons only now, fully populated and in transcript order.
    foreach my $exon (@exons) {
        $transcript->add_Exon($exon);
    }

    $transcript->translation($translation);

    return $transcript;
}
|
|
118
|
|
# fset2transcript_guess_phases
#
# Builds a transcript from a feature set and tries the three possible start
# phases (0, 1, 2) in turn, keeping the first one whose translation contains
# no stop codon ('*').
#
#   Arg [1]    : $fset - feature set; must provide id() and sub_SeqFeature().
#   Arg [2]    : $contig - object the exons live on; must provide id().
#   Returntype : Bio::EnsEMBL::Transcript on success. When every start phase
#                yields a stop codon, a bare return (undef in scalar context,
#                empty list in list context).
#   Exceptions : dies when the feature set has no sub-features.
sub fset2transcript_guess_phases {
    my ($fset, $contig) = @_;

    my $transcript = Bio::EnsEMBL::Transcript->new;
    $transcript->temporary_id($contig->id . "." . $fset->id);

    # One exon per sub-feature. The phase of the feature is not copied:
    # every exon phase is assigned by the guessing loop below.
    my @exons;
    foreach my $f ($fset->sub_SeqFeature) {
        my $exon = Bio::EnsEMBL::Exon->new;
        $exon->contig($contig);
        $exon->start($f->start);
        $exon->end($f->end);
        $exon->strand($f->strand);
        $exon->slice($contig);
        push @exons, $exon;
    }

    # Fail with a clear message instead of an obscure "method on undefined
    # value" error from the strand lookup below.
    unless (@exons) {
        die "fset2transcript_guess_phases: feature set has no sub-features";
    }

    # Order exons 5' -> 3': ascending start on the forward strand,
    # descending start otherwise. The strand of the first exon decides.
    if ($exons[0]->strand == 1) {
        @exons = sort { $a->start <=> $b->start } @exons;
    }
    else {
        @exons = sort { $b->start <=> $a->start } @exons;
    }

    # The translation runs from the first base of the first exon to the last
    # base of the last exon.
    my $translation = Bio::EnsEMBL::Translation->new;
    $translation->start(1);
    $translation->end($exons[-1]->end - $exons[-1]->start + 1);
    $translation->start_Exon($exons[0]);
    $translation->end_Exon($exons[-1]);
    $transcript->translation($translation);

    foreach my $startphase (0 .. 2) {
        my $endphase = $startphase;

        foreach my $exon (@exons) {
            $exon->phase($endphase);

            # Exons are attached to the transcript once, on the first pass;
            # later passes only re-phase the same exon objects.
            $transcript->add_Exon($exon) if $startphase == 0;

            $endphase = $exon->end_phase(($exon->phase + $exon->length) % 3);
        }

        # First frame without a stop codon wins.
        return $transcript if $transcript->translate->seq !~ /\*/;
    }

    # No frame is free of stop codons: return "nothing" explicitly rather
    # than falling off the end of the sub.
    return;
}
|
|
198
|
|
# fset2transcript_3frame
#
# Builds three transcripts from the same feature set, one for each start
# phase 0, 1 and 2. Each transcript gets its own freshly created exons and
# its own translation.
#
#   Arg [1]    : $fset - feature set; must provide id() and sub_SeqFeature().
#   Arg [2]    : $contig - object the exons live on; must provide id().
#   Returntype : list of three Bio::EnsEMBL::Transcript objects, in start
#                phase order 0, 1, 2.
sub fset2transcript_3frame {
    my ($fset, $contig) = @_;

    # Order the features 5' -> 3': ascending start on the forward strand,
    # descending start otherwise. The strand of the first feature decides.
    my @features = $fset->sub_SeqFeature;
    @features = ($features[0]->strand == 1)
        ? (sort { $a->start <=> $b->start } @features)
        : (sort { $b->start <=> $a->start } @features);

    my @transcripts;

    for my $startphase (0 .. 2) {
        my $transcript = Bio::EnsEMBL::Transcript->new;
        $transcript->temporary_id($contig->id . "." . $startphase);

        # Running phase: starts at the frame under test and is carried from
        # the end of one exon to the start of the next.
        my $phase       = $startphase;
        my $exon_number = 0;
        my @exons;

        for my $feature (@features) {
            $exon_number++;

            my $exon = Bio::EnsEMBL::Exon->new;
            $exon->seqname($feature->seqname);
            $exon->temporary_id($contig->id . ".$exon_number");
            $exon->contig($contig);
            $exon->start($feature->start);
            $exon->end($feature->end);
            $exon->strand($feature->strand);
            $exon->slice($contig);
            $exon->phase($phase);
            $exon->end_phase(($exon->phase + $exon->length) % 3);
            $phase = $exon->end_phase;

            $transcript->add_Exon($exon);
            push @exons, $exon;
        }

        # Missing ids are replaced by empty strings when building the
        # translation identifier.
        my $contig_id = defined($contig->id) ? $contig->id : "";
        my $fset_id   = defined($fset->id)   ? $fset->id   : "";

        # The translation runs from the first base of the first exon to the
        # last base of the last exon.
        my $first_exon  = $exons[0];
        my $last_exon   = $exons[-1];
        my $translation = Bio::EnsEMBL::Translation->new;
        $translation->temporary_id($contig_id . "." . $fset_id);
        $translation->start(1);
        $translation->end($last_exon->end - $last_exon->start + 1);
        $translation->start_Exon($first_exon);
        $translation->end_Exon($last_exon);
        $transcript->translation($translation);

        push @transcripts, $transcript;
    }

    return @transcripts;
}
|
|
277
|
|
278
|
|
# fset2transcript_with_seq
#
# Builds a Bio::EnsEMBL::Transcript from a feature set, attaching each exon
# to the supplied sequence object. Exons are added in the order the
# sub-features are returned; no sorting is done and no translation is
# attached.
#
#   Arg [1]    : $genscan - the feature set; expected to be a
#                Bio::EnsEMBL::SeqFeatureI. Must provide seqname() and
#                sub_SeqFeature().
#   Arg [2]    : $seq - sequence object; expected to be a Bio::PrimarySeqI
#                or Bio::SeqI. Must provide id().
#   Returntype : Bio::EnsEMBL::Transcript
#   Exceptions : none raised here; a wrong argument type only produces a
#                warning.
sub fset2transcript_with_seq {
    my ($genscan, $seq) = @_;

    # Type mismatches are reported but deliberately not fatal. The messages
    # are sent to STDERR so that they cannot end up mixed into data on
    # STDOUT.
    unless ($genscan->isa("Bio::EnsEMBL::SeqFeatureI")) {
        warn "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";
    }
    unless ($seq->isa("Bio::PrimarySeqI") || $seq->isa("Bio::SeqI")) {
        warn "$seq must be Bio::SeqI or a Bio::PrimarySeqI\n";
    }

    my $transcript = Bio::EnsEMBL::Transcript->new;
    $transcript->temporary_id($seq->id . "." . $genscan->seqname);

    foreach my $f ($genscan->sub_SeqFeature) {
        my $exon = Bio::EnsEMBL::Exon->new;
        $exon->contig($seq);
        $exon->start($f->start);
        $exon->end($f->end);
        $exon->strand($f->strand);
        $exon->phase($f->phase);
        $exon->end_phase(($exon->phase + $exon->length) % 3);
        $exon->slice($seq);

        # Attach the exon once it is fully populated.
        $transcript->add_Exon($exon);
    }

    return $transcript;
}
|
|
322
|
|
323
|
|
324
|
|
325 1;
|