Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Assembly/IO/ace.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ | |
2 # | |
3 ## BioPerl module for Bio::Assembly::IO::ace | |
4 # | |
5 # Copyright by Robson F. de Souza | |
6 # | |
7 # You may distribute this module under the same terms as perl itself | |
8 | |
9 # POD documentation - main docs before the code | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::Assembly::IO::ace - module to load phrap ACE files. | |
14 | |
15 =head1 SYNOPSYS | |
16 | |
17 # Building an input stream | |
18 use Bio::Assembly::IO; | |
19 | |
20 # Assembly loading methods | |
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1", | |
22 -format=>"ace"); | |
23 | |
24 $assembly = $io->next_assembly; | |
25 | |
26 =head1 DESCRIPTION | |
27 | |
28 This package loads the ACE files from the (phred/phrap/consed) package | |
29 by Phill Green. It was written to be used as a driver module for | |
30 Bio::Assembly::IO input/output. | |
31 | |
32 =head2 Implemention | |
33 | |
34 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by | |
35 Bio::Assembly::Contig objects. In addition to default | |
36 "_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig | |
37 objects loaded by this module will have the following special feature | |
38 classes in their feature collection: | |
39 | |
40 "_align_clipping:$seqID" : location of subsequence in sequence $seqID | |
41 which is aligned to the contig | |
42 | |
43 "_quality_clipping:$seqID" : location of good quality subsequence for | |
44 sequence $seqID | |
45 | |
46 "consensus tags", as they are called in Consed's documentation, is | |
47 equivalent to a bioperl sequence feature and, therefore, are added to | |
48 the feature collection using their type field (see Consed's README.txt | |
49 file) as primary tag. | |
50 | |
51 "read tags" are also sequence features and are stored as | |
52 sub_SeqFeatures of the sequence's coordinate feature (the | |
53 corresponding "_aligned_coord:$seqID" feature, easily accessed through | |
54 get_seq_coord() method). | |
55 | |
56 "whole assembly tags" have no start and end, as they are not | |
57 associated to any particular sequence in the assembly, and are added | |
58 to the assembly's annotation collection using phrap as tag. | |
59 | |
60 =head1 FEEDBACK | |
61 | |
62 =head2 Mailing Lists | |
63 | |
64 User feedback is an integral part of the evolution of this and other | |
65 Bioperl modules. Send your comments and suggestions preferably to the | |
66 Bioperl mailing lists Your participation is much appreciated. | |
67 | |
68 bioperl-l@bioperl.org - General discussion | |
69 http://bio.perl.org/MailList.html - About the mailing lists | |
70 | |
71 =head2 Reporting Bugs | |
72 | |
73 Report bugs to the Bioperl bug tracking system to help us keep track | |
74 the bugs and their resolution. Bug reports can be submitted via email | |
75 or the web: | |
76 | |
77 bioperl-bugs@bio.perl.org | |
78 http://bugzilla.bioperl.org/ | |
79 | |
80 =head1 AUTHOR - Robson Francisco de Souza | |
81 | |
82 Email rfsouza@citri.iq.usp.br | |
83 | |
84 =head1 APPENDIX | |
85 | |
86 The rest of the documentation details each of the object | |
87 methods. Internal methods are usually preceded with a _ | |
88 | |
89 =cut | |
90 | |
91 package Bio::Assembly::IO::ace; | |
92 | |
93 use strict; | |
94 use vars qw(@ISA); | |
95 | |
96 use Bio::Assembly::IO; | |
97 use Bio::Assembly::Scaffold; | |
98 use Bio::Assembly::Contig; | |
99 use Bio::LocatableSeq; | |
100 use Bio::Annotation::SimpleValue; | |
101 use Bio::Seq::PrimaryQual; | |
102 use Bio::SeqFeature::Generic; | |
103 | |
104 @ISA = qw(Bio::Assembly::IO); | |
105 | |
106 =head1 Parser methods | |
107 | |
108 =head2 next_assembly | |
109 | |
110 Title : next_assembly | |
111 Usage : $unigene = $stream->next_assembly() | |
112 Function: returns the next assembly in the stream | |
113 Returns : Bio::Assembly::Scaffold object | |
114 Args : NONE | |
115 | |
116 =cut | |
117 | |
118 sub next_assembly { | |
119 my $self = shift; # Object reference | |
120 local $/="\n"; | |
121 | |
122 # Resetting assembly data structure | |
123 my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); | |
124 | |
125 # Looping over all ACE file lines | |
126 my ($contigOBJ,$read_name); | |
127 my $read_data = {}; # Temporary holder for read data | |
128 while ($_ = $self->_readline) { # By now, ACE files hold a single assembly | |
129 chomp; | |
130 | |
131 # Loading assembly information (ASsembly field) | |
132 # (/^AS (\d+) (\d+)/) && do { | |
133 # $assembly->_set_nof_contigs($1); | |
134 # $assembly->_set_nof_sequences_in_contigs($2); | |
135 # }; | |
136 | |
137 # Loading contig sequence (COntig sequence field) | |
138 (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found! | |
139 my $contigID = $1; | |
140 $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID); | |
141 # $contigOBJ->set_nof_bases($2); # Contig length in base pairs | |
142 # $contigOBJ->set_nof_reads($3); # Number of reads in this contig | |
143 # $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly | |
144 my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default) | |
145 $contigOBJ->strand($ori); | |
146 my $consensus_sequence = undef; | |
147 while ($_ = $self->_readline) { # Looping over contig lines | |
148 chomp; # Drop <ENTER> (\n) on current line | |
149 last if (/^$/); # Stop if empty line (contig end) is found | |
150 s/\*/-/g; # Forcing '-' as gap symbol | |
151 $consensus_sequence .= $_; | |
152 } | |
153 | |
154 my $consensus_length = length($consensus_sequence); | |
155 $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence, | |
156 -start=>1, | |
157 -end=>$consensus_length, | |
158 -id=>$contigID); | |
159 $contigOBJ->set_consensus_sequence($consensus_sequence); | |
160 $assembly->add_contig($contigOBJ); | |
161 }; | |
162 | |
163 # Loading contig qualities... (Base Quality field) | |
164 /^BQ/ && do { | |
165 my $consensus = $contigOBJ->get_consensus_sequence()->seq(); | |
166 my ($i,$j,@tmp); | |
167 my @quality = (); | |
168 $j = 0; | |
169 while ($_ = $self->_readline) { | |
170 chomp; | |
171 last if (/^$/); | |
172 @tmp = grep { /^\d+$/ } split(/\s+/); | |
173 $i = 0; | |
174 my $previous = 0; | |
175 my $next = 0; | |
176 while ($i<=$#tmp) { | |
177 # IF base is a gap, quality is the average for neighbouring sites | |
178 if (substr($consensus,$j,1) eq '-') { | |
179 $previous = $tmp[$i-1] unless ($i == 0); | |
180 if ($i < $#tmp) { | |
181 $next = $tmp[$i+1]; | |
182 } else { | |
183 $next = 0; | |
184 } | |
185 push(@quality,int(($previous+$next)/2)); | |
186 } else { | |
187 push(@quality,$tmp[$i]); | |
188 $i++; | |
189 } | |
190 $j++; | |
191 } | |
192 } | |
193 | |
194 my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), | |
195 -id=>$contigOBJ->id()); | |
196 $contigOBJ->set_consensus_quality($qual); | |
197 }; | |
198 | |
199 # Loading read info... (Assembled From field) | |
200 /^AF (\S+) (C|U) (-*\d+)/ && do { | |
201 $read_name = $1; my $ori = $2; | |
202 $read_data->{$read_name}{'padded_start'} = $3; # aligned start | |
203 $ori = ( $ori eq 'U' ? 1 : -1); | |
204 $read_data->{$read_name}{'strand'} = $ori; | |
205 }; | |
206 | |
207 # Loading base segments definitions (Base Segment field) | |
208 # /^BS (\d+) (\d+) (\S+)/ && do { | |
209 # if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) { | |
210 # $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2; | |
211 # } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 } | |
212 # }; | |
213 | |
214 # Loading reads... (ReaD sequence field | |
215 /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do { | |
216 $read_name = $1; | |
217 $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases | |
218 $read_data->{$read_name}{'contig'} = $contigOBJ; | |
219 # $read_data->{$read_name}{'number_of_read_info_items'} = $3; | |
220 # $read_data->{$read_name}{'number_of_tags'} = $4; | |
221 my $read_sequence; | |
222 while ($_ = $self->_readline) { | |
223 chomp; | |
224 last if (/^$/); | |
225 s/\*/-/g; # Forcing '-' as gap symbol | |
226 $read_sequence .= $_; # aligned read sequence | |
227 } | |
228 my $read = Bio::LocatableSeq->new(-seq=>$read_sequence, | |
229 -start=>1, | |
230 -end=>$read_data->{$read_name}{'length'}, | |
231 -strand=>$read_data->{$read_name}{'strand'}, | |
232 -id=>$read_name, | |
233 -primary_id=>$read_name, | |
234 -alphabet=>'dna'); | |
235 | |
236 # Adding read location and sequence to contig ("gapped consensus" coordinates) | |
237 my $padded_start = $read_data->{$read_name}{'padded_start'}; | |
238 my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1; | |
239 my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start, | |
240 -end=>$padded_end, | |
241 -strand=>$read_data->{$read_name}{'strand'}, | |
242 -tag => { 'contig' => $contigOBJ->id } | |
243 ); | |
244 $contigOBJ->set_seq_coord($coord,$read); | |
245 }; | |
246 | |
247 # Loading read trimming and alignment ranges... | |
248 /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do { | |
249 my $qual_start = $1; my $qual_end = $2; | |
250 my $align_start = $3; my $align_end = $4; | |
251 | |
252 unless ($align_start == -1 && $align_end == -1) { | |
253 $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start); | |
254 $align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end); | |
255 my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start, | |
256 -end=>$align_end, | |
257 -strand=>$read_data->{$read_name}{'strand'}, | |
258 -primary=>"_align_clipping:$read_name"); | |
259 $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); | |
260 $contigOBJ->add_features([ $align_feat ], 0); | |
261 } | |
262 | |
263 unless ($qual_start == -1 && $qual_end == -1) { | |
264 $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start); | |
265 $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end); | |
266 my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start, | |
267 -end=>$qual_end, | |
268 -strand=>$read_data->{$read_name}{'strand'}, | |
269 -primary=>"_quality_clipping:$read_name"); | |
270 $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); | |
271 $contigOBJ->add_features([ $qual_feat ], 0); | |
272 } | |
273 }; | |
274 | |
275 # Loading read description (DeScription fields) | |
276 # /^DS / && do { | |
277 # /CHEM: (\S+)/ && do { | |
278 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1; | |
279 # }; | |
280 # /CHROMAT_FILE: (\S+)/ && do { | |
281 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1; | |
282 # }; | |
283 # /DIRECTION: (\w+)/ && do { | |
284 # my $ori = $1; | |
285 # if ($ori eq 'rev') { $ori = 'C' } | |
286 # elsif ($ori eq 'fwd') { $ori = 'U' } | |
287 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori; | |
288 # }; | |
289 # /DYE: (\S+)/ && do { | |
290 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1; | |
291 # }; | |
292 # /PHD_FILE: (\S+)/ && do { | |
293 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1; | |
294 # }; | |
295 # /TEMPLATE: (\S+)/ && do { | |
296 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1; | |
297 # }; | |
298 # /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do { | |
299 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1; | |
300 # }; | |
301 # }; | |
302 | |
303 # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features) | |
304 /^CT\s*\{/ && do { | |
305 my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); | |
306 $contigID =~ s/^Contig//i; | |
307 my $extra_info = undef; | |
308 while ($_ = $self->_readline) { | |
309 last if (/\}/); | |
310 $extra_info .= $_; | |
311 } | |
312 my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start, | |
313 -end=>$end, | |
314 -primary=>$type, | |
315 -tag=>{ 'source' => $source, | |
316 'creation_date' => $date, | |
317 'extra_info' => $extra_info | |
318 }); | |
319 $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1); | |
320 }; | |
321 | |
322 # Loading read tags | |
323 /^RT\s*\{/ && do { | |
324 my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); | |
325 my $extra_info = undef; | |
326 while ($_ = $self->_readline) { | |
327 last if (/\}/); | |
328 $extra_info .= $_; | |
329 } | |
330 $start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start); | |
331 $end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end); | |
332 my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start, | |
333 -end=>$end, | |
334 -primary=>$type, | |
335 -tag=>{ 'source' => $source, | |
336 'creation_date' => $date, | |
337 'extra_info' => $extra_info | |
338 }); | |
339 my $contig = $read_data->{$readID}{'contig'}; | |
340 my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) ); | |
341 $coord->add_sub_SeqFeature($read_tag); | |
342 }; | |
343 | |
344 # Loading read tags | |
345 /^WA\s*\{/ && do { | |
346 my ($type,$source,$date) = split(' ',$self->_readline); | |
347 my $extra_info = undef; | |
348 while ($_ = $self->_readline) { | |
349 last if (/\}/); | |
350 $extra_info = $_; | |
351 } | |
352 #? push(@line,\@extra_info); | |
353 my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source, | |
354 "DATE:",$date,"DATA:",$extra_info); | |
355 $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag); | |
356 $assembly->annotation->add_Annotation('phrap',$assembly_tag); | |
357 }; | |
358 | |
359 } # while ($_ = $self->_readline) | |
360 | |
361 $assembly->update_seq_list(); | |
362 return $assembly; | |
363 } | |
364 | |
365 =head2 write_assembly | |
366 | |
367 Title : write_assembly | |
368 Usage : $ass_io->write_assembly($assembly) | |
369 Function: Write the assembly object in Phrap compatible ACE format | |
370 Returns : 1 on success, 0 for error | |
371 Args : A Bio::Assembly::Scaffold object | |
372 | |
373 =cut | |
374 | |
375 sub write_assemebly { | |
376 my $self = shift; | |
377 | |
378 $self->throw("Writing phrap ACE files is not implemented yet! Sorry..."); | |
379 } | |
380 | |
381 1; | |
382 | |
383 __END__ |