0
|
1 # $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
|
|
2 #
|
|
3 ## BioPerl module for Bio::Assembly::IO::ace
|
|
4 #
|
|
5 # Copyright by Robson F. de Souza
|
|
6 #
|
|
7 # You may distribute this module under the same terms as perl itself
|
|
8
|
|
9 # POD documentation - main docs before the code
|
|
10
|
|
11 =head1 NAME
|
|
12
|
|
13 Bio::Assembly::IO::ace - module to load phrap ACE files.
|
|
14
|
|
15 =head1 SYNOPSYS
|
|
16
|
|
17 # Building an input stream
|
|
18 use Bio::Assembly::IO;
|
|
19
|
|
20 # Assembly loading methods
|
|
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1",
|
|
22 -format=>"ace");
|
|
23
|
|
24 $assembly = $io->next_assembly;
|
|
25
|
|
26 =head1 DESCRIPTION
|
|
27
|
|
28 This package loads the ACE files from the (phred/phrap/consed) package
|
|
29 by Phill Green. It was written to be used as a driver module for
|
|
30 Bio::Assembly::IO input/output.
|
|
31
|
|
32 =head2 Implemention
|
|
33
|
|
34 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
|
|
35 Bio::Assembly::Contig objects. In addition to default
|
|
36 "_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig
|
|
37 objects loaded by this module will have the following special feature
|
|
38 classes in their feature collection:
|
|
39
|
|
40 "_align_clipping:$seqID" : location of subsequence in sequence $seqID
|
|
41 which is aligned to the contig
|
|
42
|
|
43 "_quality_clipping:$seqID" : location of good quality subsequence for
|
|
44 sequence $seqID
|
|
45
|
|
46 "consensus tags", as they are called in Consed's documentation, is
|
|
47 equivalent to a bioperl sequence feature and, therefore, are added to
|
|
48 the feature collection using their type field (see Consed's README.txt
|
|
49 file) as primary tag.
|
|
50
|
|
51 "read tags" are also sequence features and are stored as
|
|
52 sub_SeqFeatures of the sequence's coordinate feature (the
|
|
53 corresponding "_aligned_coord:$seqID" feature, easily accessed through
|
|
54 get_seq_coord() method).
|
|
55
|
|
56 "whole assembly tags" have no start and end, as they are not
|
|
57 associated to any particular sequence in the assembly, and are added
|
|
58 to the assembly's annotation collection using phrap as tag.
|
|
59
|
|
60 =head1 FEEDBACK
|
|
61
|
|
62 =head2 Mailing Lists
|
|
63
|
|
64 User feedback is an integral part of the evolution of this and other
|
|
65 Bioperl modules. Send your comments and suggestions preferably to the
|
|
66 Bioperl mailing lists Your participation is much appreciated.
|
|
67
|
|
68 bioperl-l@bioperl.org - General discussion
|
|
69 http://bio.perl.org/MailList.html - About the mailing lists
|
|
70
|
|
71 =head2 Reporting Bugs
|
|
72
|
|
73 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
74 the bugs and their resolution. Bug reports can be submitted via email
|
|
75 or the web:
|
|
76
|
|
77 bioperl-bugs@bio.perl.org
|
|
78 http://bugzilla.bioperl.org/
|
|
79
|
|
80 =head1 AUTHOR - Robson Francisco de Souza
|
|
81
|
|
82 Email rfsouza@citri.iq.usp.br
|
|
83
|
|
84 =head1 APPENDIX
|
|
85
|
|
86 The rest of the documentation details each of the object
|
|
87 methods. Internal methods are usually preceded with a _
|
|
88
|
|
89 =cut
|
|
90
|
|
91 package Bio::Assembly::IO::ace;
|
|
92
|
|
93 use strict;
|
|
94 use vars qw(@ISA);
|
|
95
|
|
96 use Bio::Assembly::IO;
|
|
97 use Bio::Assembly::Scaffold;
|
|
98 use Bio::Assembly::Contig;
|
|
99 use Bio::LocatableSeq;
|
|
100 use Bio::Annotation::SimpleValue;
|
|
101 use Bio::Seq::PrimaryQual;
|
|
102 use Bio::SeqFeature::Generic;
|
|
103
|
|
104 @ISA = qw(Bio::Assembly::IO);
|
|
105
|
|
106 =head1 Parser methods
|
|
107
|
|
108 =head2 next_assembly
|
|
109
|
|
110 Title : next_assembly
|
|
111 Usage : $unigene = $stream->next_assembly()
|
|
112 Function: returns the next assembly in the stream
|
|
113 Returns : Bio::Assembly::Scaffold object
|
|
114 Args : NONE
|
|
115
|
|
116 =cut
|
|
117
|
|
118 sub next_assembly {
|
|
119 my $self = shift; # Object reference
|
|
120 local $/="\n";
|
|
121
|
|
122 # Resetting assembly data structure
|
|
123 my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
|
|
124
|
|
125 # Looping over all ACE file lines
|
|
126 my ($contigOBJ,$read_name);
|
|
127 my $read_data = {}; # Temporary holder for read data
|
|
128 while ($_ = $self->_readline) { # By now, ACE files hold a single assembly
|
|
129 chomp;
|
|
130
|
|
131 # Loading assembly information (ASsembly field)
|
|
132 # (/^AS (\d+) (\d+)/) && do {
|
|
133 # $assembly->_set_nof_contigs($1);
|
|
134 # $assembly->_set_nof_sequences_in_contigs($2);
|
|
135 # };
|
|
136
|
|
137 # Loading contig sequence (COntig sequence field)
|
|
138 (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found!
|
|
139 my $contigID = $1;
|
|
140 $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
|
|
141 # $contigOBJ->set_nof_bases($2); # Contig length in base pairs
|
|
142 # $contigOBJ->set_nof_reads($3); # Number of reads in this contig
|
|
143 # $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly
|
|
144 my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default)
|
|
145 $contigOBJ->strand($ori);
|
|
146 my $consensus_sequence = undef;
|
|
147 while ($_ = $self->_readline) { # Looping over contig lines
|
|
148 chomp; # Drop <ENTER> (\n) on current line
|
|
149 last if (/^$/); # Stop if empty line (contig end) is found
|
|
150 s/\*/-/g; # Forcing '-' as gap symbol
|
|
151 $consensus_sequence .= $_;
|
|
152 }
|
|
153
|
|
154 my $consensus_length = length($consensus_sequence);
|
|
155 $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
|
|
156 -start=>1,
|
|
157 -end=>$consensus_length,
|
|
158 -id=>$contigID);
|
|
159 $contigOBJ->set_consensus_sequence($consensus_sequence);
|
|
160 $assembly->add_contig($contigOBJ);
|
|
161 };
|
|
162
|
|
163 # Loading contig qualities... (Base Quality field)
|
|
164 /^BQ/ && do {
|
|
165 my $consensus = $contigOBJ->get_consensus_sequence()->seq();
|
|
166 my ($i,$j,@tmp);
|
|
167 my @quality = ();
|
|
168 $j = 0;
|
|
169 while ($_ = $self->_readline) {
|
|
170 chomp;
|
|
171 last if (/^$/);
|
|
172 @tmp = grep { /^\d+$/ } split(/\s+/);
|
|
173 $i = 0;
|
|
174 my $previous = 0;
|
|
175 my $next = 0;
|
|
176 while ($i<=$#tmp) {
|
|
177 # IF base is a gap, quality is the average for neighbouring sites
|
|
178 if (substr($consensus,$j,1) eq '-') {
|
|
179 $previous = $tmp[$i-1] unless ($i == 0);
|
|
180 if ($i < $#tmp) {
|
|
181 $next = $tmp[$i+1];
|
|
182 } else {
|
|
183 $next = 0;
|
|
184 }
|
|
185 push(@quality,int(($previous+$next)/2));
|
|
186 } else {
|
|
187 push(@quality,$tmp[$i]);
|
|
188 $i++;
|
|
189 }
|
|
190 $j++;
|
|
191 }
|
|
192 }
|
|
193
|
|
194 my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
|
|
195 -id=>$contigOBJ->id());
|
|
196 $contigOBJ->set_consensus_quality($qual);
|
|
197 };
|
|
198
|
|
199 # Loading read info... (Assembled From field)
|
|
200 /^AF (\S+) (C|U) (-*\d+)/ && do {
|
|
201 $read_name = $1; my $ori = $2;
|
|
202 $read_data->{$read_name}{'padded_start'} = $3; # aligned start
|
|
203 $ori = ( $ori eq 'U' ? 1 : -1);
|
|
204 $read_data->{$read_name}{'strand'} = $ori;
|
|
205 };
|
|
206
|
|
207 # Loading base segments definitions (Base Segment field)
|
|
208 # /^BS (\d+) (\d+) (\S+)/ && do {
|
|
209 # if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) {
|
|
210 # $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2;
|
|
211 # } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 }
|
|
212 # };
|
|
213
|
|
214 # Loading reads... (ReaD sequence field
|
|
215 /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
|
|
216 $read_name = $1;
|
|
217 $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
|
|
218 $read_data->{$read_name}{'contig'} = $contigOBJ;
|
|
219 # $read_data->{$read_name}{'number_of_read_info_items'} = $3;
|
|
220 # $read_data->{$read_name}{'number_of_tags'} = $4;
|
|
221 my $read_sequence;
|
|
222 while ($_ = $self->_readline) {
|
|
223 chomp;
|
|
224 last if (/^$/);
|
|
225 s/\*/-/g; # Forcing '-' as gap symbol
|
|
226 $read_sequence .= $_; # aligned read sequence
|
|
227 }
|
|
228 my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
|
|
229 -start=>1,
|
|
230 -end=>$read_data->{$read_name}{'length'},
|
|
231 -strand=>$read_data->{$read_name}{'strand'},
|
|
232 -id=>$read_name,
|
|
233 -primary_id=>$read_name,
|
|
234 -alphabet=>'dna');
|
|
235
|
|
236 # Adding read location and sequence to contig ("gapped consensus" coordinates)
|
|
237 my $padded_start = $read_data->{$read_name}{'padded_start'};
|
|
238 my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1;
|
|
239 my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
|
|
240 -end=>$padded_end,
|
|
241 -strand=>$read_data->{$read_name}{'strand'},
|
|
242 -tag => { 'contig' => $contigOBJ->id }
|
|
243 );
|
|
244 $contigOBJ->set_seq_coord($coord,$read);
|
|
245 };
|
|
246
|
|
247 # Loading read trimming and alignment ranges...
|
|
248 /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
|
|
249 my $qual_start = $1; my $qual_end = $2;
|
|
250 my $align_start = $3; my $align_end = $4;
|
|
251
|
|
252 unless ($align_start == -1 && $align_end == -1) {
|
|
253 $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
|
|
254 $align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
|
|
255 my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
|
|
256 -end=>$align_end,
|
|
257 -strand=>$read_data->{$read_name}{'strand'},
|
|
258 -primary=>"_align_clipping:$read_name");
|
|
259 $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
|
|
260 $contigOBJ->add_features([ $align_feat ], 0);
|
|
261 }
|
|
262
|
|
263 unless ($qual_start == -1 && $qual_end == -1) {
|
|
264 $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
|
|
265 $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
|
|
266 my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
|
|
267 -end=>$qual_end,
|
|
268 -strand=>$read_data->{$read_name}{'strand'},
|
|
269 -primary=>"_quality_clipping:$read_name");
|
|
270 $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
|
|
271 $contigOBJ->add_features([ $qual_feat ], 0);
|
|
272 }
|
|
273 };
|
|
274
|
|
275 # Loading read description (DeScription fields)
|
|
276 # /^DS / && do {
|
|
277 # /CHEM: (\S+)/ && do {
|
|
278 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1;
|
|
279 # };
|
|
280 # /CHROMAT_FILE: (\S+)/ && do {
|
|
281 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1;
|
|
282 # };
|
|
283 # /DIRECTION: (\w+)/ && do {
|
|
284 # my $ori = $1;
|
|
285 # if ($ori eq 'rev') { $ori = 'C' }
|
|
286 # elsif ($ori eq 'fwd') { $ori = 'U' }
|
|
287 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori;
|
|
288 # };
|
|
289 # /DYE: (\S+)/ && do {
|
|
290 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1;
|
|
291 # };
|
|
292 # /PHD_FILE: (\S+)/ && do {
|
|
293 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1;
|
|
294 # };
|
|
295 # /TEMPLATE: (\S+)/ && do {
|
|
296 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1;
|
|
297 # };
|
|
298 # /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
|
|
299 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1;
|
|
300 # };
|
|
301 # };
|
|
302
|
|
303 # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features)
|
|
304 /^CT\s*\{/ && do {
|
|
305 my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
|
|
306 $contigID =~ s/^Contig//i;
|
|
307 my $extra_info = undef;
|
|
308 while ($_ = $self->_readline) {
|
|
309 last if (/\}/);
|
|
310 $extra_info .= $_;
|
|
311 }
|
|
312 my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
|
|
313 -end=>$end,
|
|
314 -primary=>$type,
|
|
315 -tag=>{ 'source' => $source,
|
|
316 'creation_date' => $date,
|
|
317 'extra_info' => $extra_info
|
|
318 });
|
|
319 $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
|
|
320 };
|
|
321
|
|
322 # Loading read tags
|
|
323 /^RT\s*\{/ && do {
|
|
324 my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
|
|
325 my $extra_info = undef;
|
|
326 while ($_ = $self->_readline) {
|
|
327 last if (/\}/);
|
|
328 $extra_info .= $_;
|
|
329 }
|
|
330 $start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
|
|
331 $end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
|
|
332 my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
|
|
333 -end=>$end,
|
|
334 -primary=>$type,
|
|
335 -tag=>{ 'source' => $source,
|
|
336 'creation_date' => $date,
|
|
337 'extra_info' => $extra_info
|
|
338 });
|
|
339 my $contig = $read_data->{$readID}{'contig'};
|
|
340 my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
|
|
341 $coord->add_sub_SeqFeature($read_tag);
|
|
342 };
|
|
343
|
|
344 # Loading read tags
|
|
345 /^WA\s*\{/ && do {
|
|
346 my ($type,$source,$date) = split(' ',$self->_readline);
|
|
347 my $extra_info = undef;
|
|
348 while ($_ = $self->_readline) {
|
|
349 last if (/\}/);
|
|
350 $extra_info = $_;
|
|
351 }
|
|
352 #? push(@line,\@extra_info);
|
|
353 my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
|
|
354 "DATE:",$date,"DATA:",$extra_info);
|
|
355 $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
|
|
356 $assembly->annotation->add_Annotation('phrap',$assembly_tag);
|
|
357 };
|
|
358
|
|
359 } # while ($_ = $self->_readline)
|
|
360
|
|
361 $assembly->update_seq_list();
|
|
362 return $assembly;
|
|
363 }
|
|
364
|
|
365 =head2 write_assembly
|
|
366
|
|
367 Title : write_assembly
|
|
368 Usage : $ass_io->write_assembly($assembly)
|
|
369 Function: Write the assembly object in Phrap compatible ACE format
|
|
370 Returns : 1 on success, 0 for error
|
|
371 Args : A Bio::Assembly::Scaffold object
|
|
372
|
|
373 =cut
|
|
374
|
|
375 sub write_assemebly {
|
|
376 my $self = shift;
|
|
377
|
|
378 $self->throw("Writing phrap ACE files is not implemented yet! Sorry...");
|
|
379 }
|
|
380
|
|
381 1;
|
|
382
|
|
383 __END__
|