comparison variant_effect_predictor/Bio/Assembly/IO/ace.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 # $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
2 #
3 ## BioPerl module for Bio::Assembly::IO::ace
4 #
5 # Copyright by Robson F. de Souza
6 #
7 # You may distribute this module under the same terms as perl itself
8
9 # POD documentation - main docs before the code
10
11 =head1 NAME
12
13 Bio::Assembly::IO::ace - module to load phrap ACE files.
14
15 =head1 SYNOPSYS
16
17 # Building an input stream
18 use Bio::Assembly::IO;
19
20 # Assembly loading methods
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1",
22 -format=>"ace");
23
24 $assembly = $io->next_assembly;
25
26 =head1 DESCRIPTION
27
28 This package loads the ACE files from the (phred/phrap/consed) package
29 by Phill Green. It was written to be used as a driver module for
30 Bio::Assembly::IO input/output.
31
32 =head2 Implemention
33
34 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
35 Bio::Assembly::Contig objects. In addition to default
36 "_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig
37 objects loaded by this module will have the following special feature
38 classes in their feature collection:
39
40 "_align_clipping:$seqID" : location of subsequence in sequence $seqID
41 which is aligned to the contig
42
43 "_quality_clipping:$seqID" : location of good quality subsequence for
44 sequence $seqID
45
46 "consensus tags", as they are called in Consed's documentation, is
47 equivalent to a bioperl sequence feature and, therefore, are added to
48 the feature collection using their type field (see Consed's README.txt
49 file) as primary tag.
50
51 "read tags" are also sequence features and are stored as
52 sub_SeqFeatures of the sequence's coordinate feature (the
53 corresponding "_aligned_coord:$seqID" feature, easily accessed through
54 get_seq_coord() method).
55
56 "whole assembly tags" have no start and end, as they are not
57 associated to any particular sequence in the assembly, and are added
58 to the assembly's annotation collection using phrap as tag.
59
60 =head1 FEEDBACK
61
62 =head2 Mailing Lists
63
64 User feedback is an integral part of the evolution of this and other
65 Bioperl modules. Send your comments and suggestions preferably to the
66 Bioperl mailing lists Your participation is much appreciated.
67
68 bioperl-l@bioperl.org - General discussion
69 http://bio.perl.org/MailList.html - About the mailing lists
70
71 =head2 Reporting Bugs
72
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via email
75 or the web:
76
77 bioperl-bugs@bio.perl.org
78 http://bugzilla.bioperl.org/
79
80 =head1 AUTHOR - Robson Francisco de Souza
81
82 Email rfsouza@citri.iq.usp.br
83
84 =head1 APPENDIX
85
86 The rest of the documentation details each of the object
87 methods. Internal methods are usually preceded with a _
88
89 =cut
90
91 package Bio::Assembly::IO::ace;
92
93 use strict;
94 use vars qw(@ISA);
95
96 use Bio::Assembly::IO;
97 use Bio::Assembly::Scaffold;
98 use Bio::Assembly::Contig;
99 use Bio::LocatableSeq;
100 use Bio::Annotation::SimpleValue;
101 use Bio::Seq::PrimaryQual;
102 use Bio::SeqFeature::Generic;
103
104 @ISA = qw(Bio::Assembly::IO);
105
106 =head1 Parser methods
107
108 =head2 next_assembly
109
110 Title : next_assembly
111 Usage : $unigene = $stream->next_assembly()
112 Function: returns the next assembly in the stream
113 Returns : Bio::Assembly::Scaffold object
114 Args : NONE
115
116 =cut
117
118 sub next_assembly {
119 my $self = shift; # Object reference
120 local $/="\n";
121
122 # Resetting assembly data structure
123 my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
124
125 # Looping over all ACE file lines
126 my ($contigOBJ,$read_name);
127 my $read_data = {}; # Temporary holder for read data
128 while ($_ = $self->_readline) { # By now, ACE files hold a single assembly
129 chomp;
130
131 # Loading assembly information (ASsembly field)
132 # (/^AS (\d+) (\d+)/) && do {
133 # $assembly->_set_nof_contigs($1);
134 # $assembly->_set_nof_sequences_in_contigs($2);
135 # };
136
137 # Loading contig sequence (COntig sequence field)
138 (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found!
139 my $contigID = $1;
140 $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
141 # $contigOBJ->set_nof_bases($2); # Contig length in base pairs
142 # $contigOBJ->set_nof_reads($3); # Number of reads in this contig
143 # $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly
144 my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default)
145 $contigOBJ->strand($ori);
146 my $consensus_sequence = undef;
147 while ($_ = $self->_readline) { # Looping over contig lines
148 chomp; # Drop <ENTER> (\n) on current line
149 last if (/^$/); # Stop if empty line (contig end) is found
150 s/\*/-/g; # Forcing '-' as gap symbol
151 $consensus_sequence .= $_;
152 }
153
154 my $consensus_length = length($consensus_sequence);
155 $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
156 -start=>1,
157 -end=>$consensus_length,
158 -id=>$contigID);
159 $contigOBJ->set_consensus_sequence($consensus_sequence);
160 $assembly->add_contig($contigOBJ);
161 };
162
163 # Loading contig qualities... (Base Quality field)
164 /^BQ/ && do {
165 my $consensus = $contigOBJ->get_consensus_sequence()->seq();
166 my ($i,$j,@tmp);
167 my @quality = ();
168 $j = 0;
169 while ($_ = $self->_readline) {
170 chomp;
171 last if (/^$/);
172 @tmp = grep { /^\d+$/ } split(/\s+/);
173 $i = 0;
174 my $previous = 0;
175 my $next = 0;
176 while ($i<=$#tmp) {
177 # IF base is a gap, quality is the average for neighbouring sites
178 if (substr($consensus,$j,1) eq '-') {
179 $previous = $tmp[$i-1] unless ($i == 0);
180 if ($i < $#tmp) {
181 $next = $tmp[$i+1];
182 } else {
183 $next = 0;
184 }
185 push(@quality,int(($previous+$next)/2));
186 } else {
187 push(@quality,$tmp[$i]);
188 $i++;
189 }
190 $j++;
191 }
192 }
193
194 my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
195 -id=>$contigOBJ->id());
196 $contigOBJ->set_consensus_quality($qual);
197 };
198
199 # Loading read info... (Assembled From field)
200 /^AF (\S+) (C|U) (-*\d+)/ && do {
201 $read_name = $1; my $ori = $2;
202 $read_data->{$read_name}{'padded_start'} = $3; # aligned start
203 $ori = ( $ori eq 'U' ? 1 : -1);
204 $read_data->{$read_name}{'strand'} = $ori;
205 };
206
207 # Loading base segments definitions (Base Segment field)
208 # /^BS (\d+) (\d+) (\S+)/ && do {
209 # if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) {
210 # $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2;
211 # } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 }
212 # };
213
214 # Loading reads... (ReaD sequence field
215 /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
216 $read_name = $1;
217 $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
218 $read_data->{$read_name}{'contig'} = $contigOBJ;
219 # $read_data->{$read_name}{'number_of_read_info_items'} = $3;
220 # $read_data->{$read_name}{'number_of_tags'} = $4;
221 my $read_sequence;
222 while ($_ = $self->_readline) {
223 chomp;
224 last if (/^$/);
225 s/\*/-/g; # Forcing '-' as gap symbol
226 $read_sequence .= $_; # aligned read sequence
227 }
228 my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
229 -start=>1,
230 -end=>$read_data->{$read_name}{'length'},
231 -strand=>$read_data->{$read_name}{'strand'},
232 -id=>$read_name,
233 -primary_id=>$read_name,
234 -alphabet=>'dna');
235
236 # Adding read location and sequence to contig ("gapped consensus" coordinates)
237 my $padded_start = $read_data->{$read_name}{'padded_start'};
238 my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1;
239 my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
240 -end=>$padded_end,
241 -strand=>$read_data->{$read_name}{'strand'},
242 -tag => { 'contig' => $contigOBJ->id }
243 );
244 $contigOBJ->set_seq_coord($coord,$read);
245 };
246
247 # Loading read trimming and alignment ranges...
248 /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
249 my $qual_start = $1; my $qual_end = $2;
250 my $align_start = $3; my $align_end = $4;
251
252 unless ($align_start == -1 && $align_end == -1) {
253 $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
254 $align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
255 my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
256 -end=>$align_end,
257 -strand=>$read_data->{$read_name}{'strand'},
258 -primary=>"_align_clipping:$read_name");
259 $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
260 $contigOBJ->add_features([ $align_feat ], 0);
261 }
262
263 unless ($qual_start == -1 && $qual_end == -1) {
264 $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
265 $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
266 my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
267 -end=>$qual_end,
268 -strand=>$read_data->{$read_name}{'strand'},
269 -primary=>"_quality_clipping:$read_name");
270 $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
271 $contigOBJ->add_features([ $qual_feat ], 0);
272 }
273 };
274
275 # Loading read description (DeScription fields)
276 # /^DS / && do {
277 # /CHEM: (\S+)/ && do {
278 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1;
279 # };
280 # /CHROMAT_FILE: (\S+)/ && do {
281 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1;
282 # };
283 # /DIRECTION: (\w+)/ && do {
284 # my $ori = $1;
285 # if ($ori eq 'rev') { $ori = 'C' }
286 # elsif ($ori eq 'fwd') { $ori = 'U' }
287 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori;
288 # };
289 # /DYE: (\S+)/ && do {
290 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1;
291 # };
292 # /PHD_FILE: (\S+)/ && do {
293 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1;
294 # };
295 # /TEMPLATE: (\S+)/ && do {
296 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1;
297 # };
298 # /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
299 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1;
300 # };
301 # };
302
303 # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features)
304 /^CT\s*\{/ && do {
305 my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
306 $contigID =~ s/^Contig//i;
307 my $extra_info = undef;
308 while ($_ = $self->_readline) {
309 last if (/\}/);
310 $extra_info .= $_;
311 }
312 my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
313 -end=>$end,
314 -primary=>$type,
315 -tag=>{ 'source' => $source,
316 'creation_date' => $date,
317 'extra_info' => $extra_info
318 });
319 $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
320 };
321
322 # Loading read tags
323 /^RT\s*\{/ && do {
324 my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
325 my $extra_info = undef;
326 while ($_ = $self->_readline) {
327 last if (/\}/);
328 $extra_info .= $_;
329 }
330 $start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
331 $end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
332 my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
333 -end=>$end,
334 -primary=>$type,
335 -tag=>{ 'source' => $source,
336 'creation_date' => $date,
337 'extra_info' => $extra_info
338 });
339 my $contig = $read_data->{$readID}{'contig'};
340 my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
341 $coord->add_sub_SeqFeature($read_tag);
342 };
343
344 # Loading read tags
345 /^WA\s*\{/ && do {
346 my ($type,$source,$date) = split(' ',$self->_readline);
347 my $extra_info = undef;
348 while ($_ = $self->_readline) {
349 last if (/\}/);
350 $extra_info = $_;
351 }
352 #? push(@line,\@extra_info);
353 my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
354 "DATE:",$date,"DATA:",$extra_info);
355 $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
356 $assembly->annotation->add_Annotation('phrap',$assembly_tag);
357 };
358
359 } # while ($_ = $self->_readline)
360
361 $assembly->update_seq_list();
362 return $assembly;
363 }
364
365 =head2 write_assembly
366
367 Title : write_assembly
368 Usage : $ass_io->write_assembly($assembly)
369 Function: Write the assembly object in Phrap compatible ACE format
370 Returns : 1 on success, 0 for error
371 Args : A Bio::Assembly::Scaffold object
372
373 =cut
374
375 sub write_assemebly {
376 my $self = shift;
377
378 $self->throw("Writing phrap ACE files is not implemented yet! Sorry...");
379 }
380
381 1;
382
383 __END__