0
|
1 # $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
|
|
2 #
|
|
3 # BioPerl driver for phrap.out file
|
|
4 #
|
|
5 # Copyright by Robson F. de Souza
|
|
6 #
|
|
7 # You may distribute this module under the same terms as perl itself
|
|
8 #
|
|
9 # POD documentation - main docs before the code
|
|
10
|
|
11 =head1 NAME
|
|
12
|
|
13 Bio::Assembly::IO::phrap - driver to load phrap.out files.
|
|
14
|
|
15 =head1 SYNOPSYS
|
|
16
|
|
17 # Building an input stream
|
|
18 use Bio::Assembly::IO;
|
|
19
|
|
20 # Assembly loading methods
|
|
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out",
|
|
22 -format=>"phrap");
|
|
23
|
|
24 $assembly = $io->next_assembly;
|
|
25
|
|
26 =head1 DESCRIPTION
|
|
27
|
|
28 This package was developed to load the phrap.out files from the
|
|
29 (phred/phrap/consed) package by Phill Green. This files contain just
|
|
30 the messages printed to standard out by phrap when building an
|
|
31 assembly. This output is redirected by phredPhrap perl-script to a
|
|
32 file in the project's directory and hold some bit of information
|
|
33 regarding assembly quality, connections between contigs and clone's
|
|
34 position inside contigs. It should be noted that such files have no
|
|
35 data about the sequence. neither for contig consensus nor for any
|
|
36 aligned sequence. Anyway, such information may be loaded from Fasta
|
|
37 files in the projects directory and added to the assembly object
|
|
38 later.
|
|
39
|
|
40 Note that, because no sequence is loaded for the contig consensus and
|
|
41 locations for aligned sequences are only given in "ungapped consensus"
|
|
42 coordinates in a phrap.out file, you can't make coordinate changes in
|
|
43 assemblies loaded by pharp.pm, unless you add an aligned
|
|
44 coordinates for each sequence to each contig's features collection
|
|
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and
|
|
46 L<Bio::Assembly::Contig::Feature_collection>..
|
|
47
|
|
48 This driver also loads singlets into the assembly contigs as Bio::Seq
|
|
49 objects, altough without their sequence strings. It also adds a
|
|
50 feature for the entire sequence, thus storing the singlet length in
|
|
51 its end position, and adds a tag '_nof_trimmed_nonX' to the feature,
|
|
52 which stores the number of non-vector bases in the singlet.
|
|
53
|
|
54 =head2 Implementation
|
|
55
|
|
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
|
|
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig
|
|
58 "_aligned_coord:$seqID" feature class, therefore you can't make
|
|
59 coordinate changes in contigs loaded by this module. Contig objects
|
|
60 created by this module will have the following special feature
|
|
61 classes, identified by their primary tags, in their features
|
|
62 collection:
|
|
63
|
|
64 "_main_contig_feature:$ID" : main feature for contig $ID. This
|
|
65 feature is used to store information
|
|
66 about the entire consensus
|
|
67 sequence. This feature always start at
|
|
68 base 1 and its end position is the
|
|
69 consensus sequence length. A tag,
|
|
70 'trimmed_length' holds the length of the
|
|
71 trimmed good quality region inside the
|
|
72 consensus sequence.
|
|
73
|
|
74 "_covered_region:$index" : coordinates for valid clones inside the
|
|
75 contig. $index is the covered region
|
|
76 number, starting at 1 for the covered
|
|
77 region closest to the consensus sequence
|
|
78 first base.
|
|
79
|
|
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped
|
|
81 consensus" coordinates (consensus
|
|
82 sequence without gaps). Primary and
|
|
83 secondary scores, indel and
|
|
84 substitutions statistics are stored as
|
|
85 feature tags.
|
|
86
|
|
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be
|
|
88 used as the unique id for each
|
|
89 clone. These features have six tags:
|
|
90 '_1st_name', which is the id of the
|
|
91 upstream (5') aligned sequence
|
|
92 delimiting the clone; '_1st_strand', the
|
|
93 upstream sequence strand in the
|
|
94 alignment; '_2nd_name', downstream (3')
|
|
95 sequence id; '_2nd_strand', the
|
|
96 downstream sequence strand in the
|
|
97 alignment; '_length', unaligned clone
|
|
98 length; '_rejected', a boolean flag,
|
|
99 which is false if the clone is valid and
|
|
100 true if it was rejected.
|
|
101
|
|
102 All coordinates for the features above are expressed as "ungapped
|
|
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>..
|
|
104
|
|
105 =head2 Feature collection
|
|
106
|
|
107 #
|
|
108
|
|
109 =head1 FEEDBACK
|
|
110
|
|
111 =head2 Mailing Lists
|
|
112
|
|
113 User feedback is an integral part of the evolution of this and other
|
|
114 Bioperl modules. Send your comments and suggestions preferably to the
|
|
115 Bioperl mailing lists Your participation is much appreciated.
|
|
116
|
|
117 bioperl-l@bioperl.org - General discussion
|
|
118 http://bio.perl.org/MailList.html - About the mailing lists
|
|
119
|
|
120 =head2 Reporting Bugs
|
|
121
|
|
122 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
123 the bugs and their resolution. Bug reports can be submitted via email
|
|
124 or the web:
|
|
125
|
|
126 bioperl-bugs@bio.perl.org
|
|
127 http://bugzilla.bioperl.org/
|
|
128
|
|
129
|
|
130 =head1 AUTHOR - Robson Francisco de Souza
|
|
131
|
|
132 Email rfsouza@citri.iq.usp.br
|
|
133
|
|
134 head1 APPENDIX
|
|
135
|
|
136 The rest of the documentation details each of the object
|
|
137 methods. Internal methods are usually preceded with a _
|
|
138
|
|
139 =cut
|
|
140
|
|
141 package Bio::Assembly::IO::phrap;
|
|
142
|
|
143 use strict;
|
|
144 use vars qw(@ISA);
|
|
145
|
|
146 use Bio::Assembly::IO;
|
|
147 use Bio::Assembly::Scaffold;
|
|
148 use Bio::Assembly::Contig;
|
|
149 use Bio::LocatableSeq;
|
|
150 use Bio::Seq;
|
|
151 use Bio::SeqFeature::Generic;
|
|
152
|
|
153 @ISA = qw(Bio::Assembly::IO);
|
|
154
|
|
155 =head2 next_assembly
|
|
156
|
|
157 Title : next_assembly
|
|
158 Usage : $unigene = $stream->next_assembly()
|
|
159 Function: returns the next assembly in the stream
|
|
160 Returns : Bio::Assembly::Scaffold object
|
|
161 Args : NONE
|
|
162
|
|
163 =cut
|
|
164
|
|
165 sub next_assembly {
|
|
166 my $self = shift; # Package reference
|
|
167
|
|
168 # Resetting assembly data structure
|
|
169 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
|
|
170
|
|
171 # Looping over all phrap out file lines
|
|
172 my ($contigOBJ);
|
|
173 while ($_ = $self->_readline) {
|
|
174 chomp;
|
|
175
|
|
176 # Loading exact dupicated reads list
|
|
177 # /Exact duplicate reads:/ && do {
|
|
178 # my @exact_dupl;
|
|
179 # while (<FILE>) {
|
|
180 # last if (/^\s*$/);
|
|
181 # /(\S+)\s+(\S+)/ && do {
|
|
182 # push(@exact_dupl,[$1,$2]);
|
|
183 # };
|
|
184 # $self->{'assembly'}{'exact_dupl_reads'} =
|
|
185 # new Data::Table(\@exact_dupl,['included','excluded'],0);
|
|
186 # }
|
|
187 # };
|
|
188
|
|
189 # Loading singlets reads data
|
|
190 /^(\d+) isolated singletons/ && do {
|
|
191 while ($_ = $self->_readline) {
|
|
192 chomp;
|
|
193 last if (/^$/);
|
|
194 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) {
|
|
195 my $seqID = $1; my $length = $2;
|
|
196 my $nof_trimmed_nonX = $3;
|
|
197 my $seq = new Bio::Seq(-strand=>1,
|
|
198 -primary_id=>$seqID);
|
|
199 my $f = Bio::SeqFeature::Generic->new
|
|
200 (-start=>1, -end=>$seq->length(),
|
|
201 -primary=>$seq->primary_id(),
|
|
202 -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX }
|
|
203 );
|
|
204 $seq->add_SeqFeature($f);
|
|
205 $Assembly->add_singlet($seq);
|
|
206 }
|
|
207 }
|
|
208 };
|
|
209
|
|
210 # Loading contig information
|
|
211 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do {
|
|
212 my $nof_reads = $2; my $length = $3; my $trimmed_length = $4;
|
|
213 $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap');
|
|
214 my $feat = Bio::SeqFeature::Generic->new(-start=>1,
|
|
215 -end=>$length,
|
|
216 -primary=>"_main_contig_feature:".$contigOBJ->id(),
|
|
217 -tag=>{ '_trimmed_length' => $trimmed_length }
|
|
218 );
|
|
219 $contigOBJ->add_features([ $feat ],1);
|
|
220 $Assembly->add_contig($contigOBJ);
|
|
221 };
|
|
222
|
|
223 # Loading read information
|
|
224 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do {
|
|
225 my $strand = ($1 eq 'C' ? -1 : 1);
|
|
226 my $readID = $4; my $start = $2; my $end = $3;
|
|
227 my $primary_score = $5; my $secondary_score = $6;
|
|
228 my $substitutions = $7; my $deletions = $8; my $insertions = $9;
|
|
229 my $seq = Bio::LocatableSeq->new(-start=>$start,
|
|
230 -end=>$end,
|
|
231 -strand=>$strand,
|
|
232 -id=>$readID,
|
|
233 -primary_id=>$readID,
|
|
234 -alphabet=>'dna');
|
|
235 my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start,
|
|
236 -end=>$end,
|
|
237 -primary=>"_unalign_coord:$readID",
|
|
238 -tag=>{'_primary_score'=>$primary_score,
|
|
239 '_secondary_score'=>$secondary_score,
|
|
240 '_substitutions'=>$substitutions,
|
|
241 '_insertions'=>,$insertions,
|
|
242 '_deletions'=>$deletions }
|
|
243 );
|
|
244 $unalign_coord->attach_seq($seq);
|
|
245 $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]);
|
|
246 };
|
|
247
|
|
248 # Loading INTERNAL clones description
|
|
249 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do {
|
|
250 my $contigID = $1;
|
|
251 my $contig = $Assembly->get_contig_by_id($contigID);
|
|
252 while ($_ = $self->_readline) {
|
|
253 my (@data,$rejected,$c1_strand,$c2_strand);
|
|
254
|
|
255 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do {
|
|
256 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 }
|
|
257 $c1_strand = ($data[1] eq 'C' ? -1 : 1);
|
|
258 $c2_strand = ($data[3] eq 'C' ? -1 : 1);
|
|
259 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/;
|
|
260 my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6],
|
|
261 -end=>$data[7],
|
|
262 -strand=>0,
|
|
263 -primary=>"_internal_clone:$clone_name",
|
|
264 -tag=>{'_1st_strand'=>,$c1_strand,
|
|
265 '_2nd_strand'=>,$c2_strand,
|
|
266 '_1st_name'=>$data[2],
|
|
267 '_2nd_name'=>$data[4],
|
|
268 '_length'=>$data[5],
|
|
269 '_rejected'=>$rejected
|
|
270 }
|
|
271 );
|
|
272 $contig->add_features([ $clone ]);
|
|
273 };
|
|
274
|
|
275 /Covered regions:/ && do {
|
|
276 my %coord = /(\d+)/g; my $i = 0;
|
|
277 foreach my $start (sort { $a <=> $b } keys %coord) {
|
|
278 my $cov = Bio::SeqFeature::Generic->new(-start=>$start,
|
|
279 -end=>$coord{$start},
|
|
280 -primary=>'_covered_region:'.++$i
|
|
281 );
|
|
282 # 1: attach feature to contig consensus, if any
|
|
283 $contig->add_features([ $cov ],1);
|
|
284 }
|
|
285 last; # exit while loop
|
|
286 }; # /Covered regions:/
|
|
287
|
|
288 } # while ($_ = $self->_readline)
|
|
289 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/
|
|
290
|
|
291 } # while ($_ = $self->_readline)
|
|
292 return $Assembly;
|
|
293 }
|
|
294
|
|
295 =head2 write_assembly
|
|
296
|
|
297 Title : write_assembly
|
|
298 Usage : $ass_io->write_assembly($assembly)
|
|
299 Function: Write the assembly object in Phrap compatible ACE format
|
|
300 Returns : 1 on success, 0 for error
|
|
301 Args : A Bio::Assembly::Scaffold object
|
|
302
|
|
303 =cut
|
|
304
|
|
305 sub write_assemebly {
|
|
306 my $self = shift;
|
|
307
|
|
308 $self->throw("Writing phrap.out files is not implemented yet! Sorry...");
|
|
309 }
|
|
310
|
|
311 1;
|
|
312
|
|
313 __END__
|