Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/TranscriptFactory.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::TranscriptFactory - Module having the fset2transcript* | |
24 subroutines | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 use Bio::EnsEMBL::TranscriptFactory; | |
29 | |
30 &Bio::EnsEMBL::TranscriptFactory::fset2transcript($fset_id); | |
31 | |
32 =head1 DESCRIPTION | |
33 | |
34 Module containing the subroutines fset2transcript*, | |
35 which create transcripts from features (formally housed in | |
36 Bio::EnsEMBL::DBSQL::Utils). | |
37 | |
38 =head1 METHODS | |
39 | |
40 =cut | |
41 | |
42 package Bio::EnsEMBL::TranscriptFactory; | |
43 | |
44 use strict; | |
45 | |
46 use Bio::EnsEMBL::Exon; | |
47 use Bio::EnsEMBL::Translation; | |
48 use Bio::EnsEMBL::Transcript; | |
49 | |
50 sub fset2transcript { | |
51 my ($genscan,$contig)=@_; | |
52 | |
53 | |
54 unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) | |
55 {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} | |
56 | |
57 my $transcript = new Bio::EnsEMBL::Transcript; | |
58 $transcript->temporary_id($contig->id . "." . $genscan->seqname); | |
59 | |
60 my @exons; | |
61 my $count= 1; | |
62 | |
63 foreach my $f ($genscan->sub_SeqFeature) { | |
64 | |
65 my $exon = new Bio::EnsEMBL::Exon; | |
66 $transcript->add_Exon($exon); | |
67 $exon->contig ($contig); | |
68 $exon->start ($f->start); | |
69 $exon->end ($f->end ); | |
70 $exon->strand ($f->strand); | |
71 $exon->phase ($f->phase); | |
72 $exon->end_phase( ($exon->phase + $exon->length)%3 ); | |
73 #$exon->score($f->score); | |
74 # $exon->p_value($f->p_value); | |
75 $exon->slice($contig->primary_seq); | |
76 | |
77 push(@exons,$exon); | |
78 $count++; | |
79 | |
80 } | |
81 | |
82 if( $count == 1 ) { | |
83 $genscan->throw("Got a 0 exon genscan"); | |
84 } | |
85 | |
86 my $translation = new Bio::EnsEMBL::Translation; | |
87 # | |
88 # This code got changed due to Translation convention changing. Should work... | |
89 # | |
90 | |
91 if ($exons[0]->strand == 1) { | |
92 @exons = sort {$a->start <=> $b->start} @exons; | |
93 } else { | |
94 @exons = sort {$b->start <=> $a->start} @exons; | |
95 } | |
96 | |
97 $translation->start(1); | |
98 $translation->end($exons[scalar(@exons)-1]->length); | |
99 | |
100 $translation->start_Exon($exons[0]); | |
101 $translation->end_Exon($exons[$#exons]); | |
102 | |
103 my $endphase = $exons[0]->end_phase; | |
104 | |
105 foreach my $exon (@exons) { | |
106 | |
107 if ( $exon == $exons[0] ){ | |
108 next; | |
109 } | |
110 $exon->phase ($endphase); | |
111 $endphase = $exon->end_phase; | |
112 } | |
113 | |
114 $transcript->translation($translation); | |
115 | |
116 return $transcript; | |
117 } | |
118 | |
119 sub fset2transcript_guess_phases { | |
120 my ($fset,$contig) = @_; | |
121 | |
122 my $transcript = new Bio::EnsEMBL::Transcript; | |
123 | |
124 $transcript->temporary_id($contig->id . "." . $fset->id); | |
125 | |
126 | |
127 my @exons; | |
128 my $count = 1; | |
129 | |
130 foreach my $f ($fset->sub_SeqFeature) { | |
131 | |
132 my $exon = new Bio::EnsEMBL::Exon; | |
133 $exon->contig ($contig); | |
134 $exon->start ($f->start); | |
135 $exon->end ($f->end ); | |
136 $exon->strand ($f->strand); | |
137 #$exon->score($f->score); | |
138 # $exon->p_value($f->p_value); | |
139 $exon->slice($contig); | |
140 $exon->phase($f->phase); | |
141 push(@exons,$exon); | |
142 $count++; | |
143 | |
144 } | |
145 | |
146 my $translation = new Bio::EnsEMBL::Translation; | |
147 | |
148 if ($exons[0]->strand == 1) { | |
149 @exons = sort {$a->start <=> $b->start} @exons; | |
150 } else { | |
151 @exons = sort {$b->start <=> $a->start} @exons; | |
152 } | |
153 | |
154 $translation->start (1); | |
155 $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); | |
156 $translation->start_Exon($exons[0]); | |
157 $translation->end_Exon($exons[$#exons]); | |
158 $transcript->translation($translation); | |
159 | |
160 my $endphase = 0; | |
161 | |
162 foreach my $exon (@exons) { | |
163 | |
164 $exon ->phase ($endphase); | |
165 $transcript->add_Exon($exon); | |
166 | |
167 $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); | |
168 | |
169 } | |
170 | |
171 | |
172 if ($transcript->translate->seq !~ /\*/) { | |
173 return $transcript; | |
174 } | |
175 | |
176 $endphase = 1; | |
177 | |
178 foreach my $exon (@exons) { | |
179 $exon->phase($endphase); | |
180 $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); | |
181 } | |
182 | |
183 if ($transcript->translate->seq !~ /\*/) { | |
184 return $transcript; | |
185 } | |
186 | |
187 $endphase = 2; | |
188 | |
189 foreach my $exon (@exons) { | |
190 $exon->phase($endphase); | |
191 $endphase = $exon->end_phase(($exon->phase + $exon->length)%3); | |
192 } | |
193 | |
194 if ($transcript->translate->seq !~ /\*/) { | |
195 return $transcript; | |
196 } | |
197 } | |
198 | |
199 sub fset2transcript_3frame { | |
200 my ($fset,$contig) = @_; | |
201 | |
202 my @f = $fset->sub_SeqFeature; | |
203 | |
204 if ($f[0]->strand == 1) { | |
205 @f = sort {$a->start <=> $b->start} @f; | |
206 } else { | |
207 @f = sort {$b->start <=> $a->start} @f; | |
208 } | |
209 | |
210 my @transcripts; | |
211 | |
212 my $startphase = 0; | |
213 | |
214 while ($startphase < 3) { | |
215 my $endphase = $startphase; | |
216 | |
217 my $transcript = new Bio::EnsEMBL::Transcript; | |
218 | |
219 push(@transcripts,$transcript); | |
220 | |
221 $transcript->temporary_id($contig->id . "." . $endphase); | |
222 | |
223 my $count = 1; | |
224 my @exons; | |
225 | |
226 | |
227 foreach my $f (@f) { | |
228 #print "exon seqname = ".$f->seqname."\n"; | |
229 my $exon = new Bio::EnsEMBL::Exon; | |
230 #print STDERR "exon ".$f->gffstring."\n"; | |
231 push(@exons,$exon); | |
232 $exon->seqname($f->seqname); | |
233 $exon->temporary_id ($contig->id . ".$count"); | |
234 $exon->contig ($contig); | |
235 $exon->start ($f->start); | |
236 $exon->end ($f->end ); | |
237 $exon->strand ($f->strand); | |
238 $exon->slice($contig); | |
239 $exon->phase ($endphase); | |
240 $exon->end_phase( ($exon->phase + $exon->length)%3 ); | |
241 #$exon->score ($f->score); | |
242 # $exon->p_value ($f->p_value); | |
243 $endphase = $exon->end_phase; | |
244 | |
245 $transcript->add_Exon($exon); | |
246 $count++; | |
247 | |
248 #print STDERR "Added exon start " . $exon->start . " end " . $exon->end . " strand " . $exon->strand . " score " . $exon->score . " pvalue " . $exon->p_value . "\n"; | |
249 } | |
250 | |
251 my $translation = new Bio::EnsEMBL::Translation; | |
252 | |
253 my $contig_id = ""; | |
254 my $fset_id = ""; | |
255 | |
256 if (defined($contig->id)) { | |
257 $contig_id = $contig->id; | |
258 } | |
259 if (defined($fset->id)) { | |
260 $fset_id = $fset->id; | |
261 } | |
262 | |
263 $translation->temporary_id($contig_id . "." . $fset_id); | |
264 $translation->start (1); | |
265 $translation->end ($exons[$#exons]->end - $exons[$#exons]->start + 1); | |
266 $translation->start_Exon($exons[0]); | |
267 $translation->end_Exon ($exons[$#exons]); | |
268 $transcript->translation($translation); | |
269 | |
270 # print STDERR "Phase $startphase " . $transcript->translate->seq . "\n"; | |
271 | |
272 $startphase++; | |
273 } | |
274 #print "finshed fset2transcript_3frame\n"; | |
275 return @transcripts; | |
276 } | |
277 | |
278 | |
279 sub fset2transcript_with_seq { | |
280 my ($genscan,$seq)=@_; | |
281 | |
282 | |
283 unless ($genscan->isa ("Bio::EnsEMBL::SeqFeatureI")) | |
284 {print "$genscan must be Bio::EnsEMBL::SeqFeatureI\n";} | |
285 unless ($seq->isa ("Bio::PrimarySeqI") || $seq->isa ("Bio::SeqI")) | |
286 {print "$seq must be Bio::SeqI or a Bio::PrimarySeqI\n";} | |
287 | |
288 #print STDERR "running fset2transcript\n"; | |
289 my $transcript = new Bio::EnsEMBL::Transcript; | |
290 $transcript->temporary_id($seq->id . "." . $genscan->seqname); | |
291 | |
292 my @exons; | |
293 my $count= 1; | |
294 | |
295 foreach my $f ($genscan->sub_SeqFeature) { | |
296 | |
297 my $exon = new Bio::EnsEMBL::Exon; | |
298 $exon->contig ($seq); | |
299 $exon->start ($f->start); | |
300 $exon->end ($f->end ); | |
301 $exon->strand ($f->strand); | |
302 $exon->phase ($f->phase); | |
303 $exon->end_phase( ($exon->phase + $exon->length)%3 ); | |
304 #$exon->score ($f->score); | |
305 #print STDERR "contig is a = ".$seq."\n"; | |
306 $exon->slice($seq); | |
307 | |
308 push(@exons,$exon); | |
309 $count++; | |
310 | |
311 } | |
312 | |
313 foreach my $exon (@exons) { | |
314 | |
315 $transcript->add_Exon($exon); | |
316 | |
317 | |
318 } | |
319 return $transcript; | |
320 | |
321 } | |
322 | |
323 | |
324 | |
325 1; |