comparison variant_effect_predictor/Bio/Assembly/IO/phrap.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 # $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
2 #
3 # BioPerl driver for phrap.out file
4 #
5 # Copyright by Robson F. de Souza
6 #
7 # You may distribute this module under the same terms as perl itself
8 #
9 # POD documentation - main docs before the code
10
11 =head1 NAME
12
13 Bio::Assembly::IO::phrap - driver to load phrap.out files.
14
15 =head1 SYNOPSYS
16
17 # Building an input stream
18 use Bio::Assembly::IO;
19
20 # Assembly loading methods
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out",
22 -format=>"phrap");
23
24 $assembly = $io->next_assembly;
25
26 =head1 DESCRIPTION
27
28 This package was developed to load the phrap.out files from the
29 (phred/phrap/consed) package by Phill Green. This files contain just
30 the messages printed to standard out by phrap when building an
31 assembly. This output is redirected by phredPhrap perl-script to a
32 file in the project's directory and hold some bit of information
33 regarding assembly quality, connections between contigs and clone's
34 position inside contigs. It should be noted that such files have no
35 data about the sequence. neither for contig consensus nor for any
36 aligned sequence. Anyway, such information may be loaded from Fasta
37 files in the projects directory and added to the assembly object
38 later.
39
40 Note that, because no sequence is loaded for the contig consensus and
41 locations for aligned sequences are only given in "ungapped consensus"
42 coordinates in a phrap.out file, you can't make coordinate changes in
43 assemblies loaded by pharp.pm, unless you add an aligned
44 coordinates for each sequence to each contig's features collection
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and
46 L<Bio::Assembly::Contig::Feature_collection>..
47
48 This driver also loads singlets into the assembly contigs as Bio::Seq
49 objects, altough without their sequence strings. It also adds a
50 feature for the entire sequence, thus storing the singlet length in
51 its end position, and adds a tag '_nof_trimmed_nonX' to the feature,
52 which stores the number of non-vector bases in the singlet.
53
54 =head2 Implementation
55
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig
58 "_aligned_coord:$seqID" feature class, therefore you can't make
59 coordinate changes in contigs loaded by this module. Contig objects
60 created by this module will have the following special feature
61 classes, identified by their primary tags, in their features
62 collection:
63
64 "_main_contig_feature:$ID" : main feature for contig $ID. This
65 feature is used to store information
66 about the entire consensus
67 sequence. This feature always start at
68 base 1 and its end position is the
69 consensus sequence length. A tag,
70 'trimmed_length' holds the length of the
71 trimmed good quality region inside the
72 consensus sequence.
73
74 "_covered_region:$index" : coordinates for valid clones inside the
75 contig. $index is the covered region
76 number, starting at 1 for the covered
77 region closest to the consensus sequence
78 first base.
79
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped
81 consensus" coordinates (consensus
82 sequence without gaps). Primary and
83 secondary scores, indel and
84 substitutions statistics are stored as
85 feature tags.
86
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be
88 used as the unique id for each
89 clone. These features have six tags:
90 '_1st_name', which is the id of the
91 upstream (5') aligned sequence
92 delimiting the clone; '_1st_strand', the
93 upstream sequence strand in the
94 alignment; '_2nd_name', downstream (3')
95 sequence id; '_2nd_strand', the
96 downstream sequence strand in the
97 alignment; '_length', unaligned clone
98 length; '_rejected', a boolean flag,
99 which is false if the clone is valid and
100 true if it was rejected.
101
102 All coordinates for the features above are expressed as "ungapped
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>..
104
105 =head2 Feature collection
106
107 #
108
109 =head1 FEEDBACK
110
111 =head2 Mailing Lists
112
113 User feedback is an integral part of the evolution of this and other
114 Bioperl modules. Send your comments and suggestions preferably to the
115 Bioperl mailing lists Your participation is much appreciated.
116
117 bioperl-l@bioperl.org - General discussion
118 http://bio.perl.org/MailList.html - About the mailing lists
119
120 =head2 Reporting Bugs
121
122 Report bugs to the Bioperl bug tracking system to help us keep track
123 the bugs and their resolution. Bug reports can be submitted via email
124 or the web:
125
126 bioperl-bugs@bio.perl.org
127 http://bugzilla.bioperl.org/
128
129
130 =head1 AUTHOR - Robson Francisco de Souza
131
132 Email rfsouza@citri.iq.usp.br
133
134 head1 APPENDIX
135
136 The rest of the documentation details each of the object
137 methods. Internal methods are usually preceded with a _
138
139 =cut
140
141 package Bio::Assembly::IO::phrap;
142
143 use strict;
144 use vars qw(@ISA);
145
146 use Bio::Assembly::IO;
147 use Bio::Assembly::Scaffold;
148 use Bio::Assembly::Contig;
149 use Bio::LocatableSeq;
150 use Bio::Seq;
151 use Bio::SeqFeature::Generic;
152
153 @ISA = qw(Bio::Assembly::IO);
154
155 =head2 next_assembly
156
157 Title : next_assembly
158 Usage : $unigene = $stream->next_assembly()
159 Function: returns the next assembly in the stream
160 Returns : Bio::Assembly::Scaffold object
161 Args : NONE
162
163 =cut
164
165 sub next_assembly {
166 my $self = shift; # Package reference
167
168 # Resetting assembly data structure
169 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
170
171 # Looping over all phrap out file lines
172 my ($contigOBJ);
173 while ($_ = $self->_readline) {
174 chomp;
175
176 # Loading exact dupicated reads list
177 # /Exact duplicate reads:/ && do {
178 # my @exact_dupl;
179 # while (<FILE>) {
180 # last if (/^\s*$/);
181 # /(\S+)\s+(\S+)/ && do {
182 # push(@exact_dupl,[$1,$2]);
183 # };
184 # $self->{'assembly'}{'exact_dupl_reads'} =
185 # new Data::Table(\@exact_dupl,['included','excluded'],0);
186 # }
187 # };
188
189 # Loading singlets reads data
190 /^(\d+) isolated singletons/ && do {
191 while ($_ = $self->_readline) {
192 chomp;
193 last if (/^$/);
194 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) {
195 my $seqID = $1; my $length = $2;
196 my $nof_trimmed_nonX = $3;
197 my $seq = new Bio::Seq(-strand=>1,
198 -primary_id=>$seqID);
199 my $f = Bio::SeqFeature::Generic->new
200 (-start=>1, -end=>$seq->length(),
201 -primary=>$seq->primary_id(),
202 -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX }
203 );
204 $seq->add_SeqFeature($f);
205 $Assembly->add_singlet($seq);
206 }
207 }
208 };
209
210 # Loading contig information
211 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do {
212 my $nof_reads = $2; my $length = $3; my $trimmed_length = $4;
213 $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap');
214 my $feat = Bio::SeqFeature::Generic->new(-start=>1,
215 -end=>$length,
216 -primary=>"_main_contig_feature:".$contigOBJ->id(),
217 -tag=>{ '_trimmed_length' => $trimmed_length }
218 );
219 $contigOBJ->add_features([ $feat ],1);
220 $Assembly->add_contig($contigOBJ);
221 };
222
223 # Loading read information
224 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do {
225 my $strand = ($1 eq 'C' ? -1 : 1);
226 my $readID = $4; my $start = $2; my $end = $3;
227 my $primary_score = $5; my $secondary_score = $6;
228 my $substitutions = $7; my $deletions = $8; my $insertions = $9;
229 my $seq = Bio::LocatableSeq->new(-start=>$start,
230 -end=>$end,
231 -strand=>$strand,
232 -id=>$readID,
233 -primary_id=>$readID,
234 -alphabet=>'dna');
235 my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start,
236 -end=>$end,
237 -primary=>"_unalign_coord:$readID",
238 -tag=>{'_primary_score'=>$primary_score,
239 '_secondary_score'=>$secondary_score,
240 '_substitutions'=>$substitutions,
241 '_insertions'=>,$insertions,
242 '_deletions'=>$deletions }
243 );
244 $unalign_coord->attach_seq($seq);
245 $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]);
246 };
247
248 # Loading INTERNAL clones description
249 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do {
250 my $contigID = $1;
251 my $contig = $Assembly->get_contig_by_id($contigID);
252 while ($_ = $self->_readline) {
253 my (@data,$rejected,$c1_strand,$c2_strand);
254
255 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do {
256 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 }
257 $c1_strand = ($data[1] eq 'C' ? -1 : 1);
258 $c2_strand = ($data[3] eq 'C' ? -1 : 1);
259 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/;
260 my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6],
261 -end=>$data[7],
262 -strand=>0,
263 -primary=>"_internal_clone:$clone_name",
264 -tag=>{'_1st_strand'=>,$c1_strand,
265 '_2nd_strand'=>,$c2_strand,
266 '_1st_name'=>$data[2],
267 '_2nd_name'=>$data[4],
268 '_length'=>$data[5],
269 '_rejected'=>$rejected
270 }
271 );
272 $contig->add_features([ $clone ]);
273 };
274
275 /Covered regions:/ && do {
276 my %coord = /(\d+)/g; my $i = 0;
277 foreach my $start (sort { $a <=> $b } keys %coord) {
278 my $cov = Bio::SeqFeature::Generic->new(-start=>$start,
279 -end=>$coord{$start},
280 -primary=>'_covered_region:'.++$i
281 );
282 # 1: attach feature to contig consensus, if any
283 $contig->add_features([ $cov ],1);
284 }
285 last; # exit while loop
286 }; # /Covered regions:/
287
288 } # while ($_ = $self->_readline)
289 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/
290
291 } # while ($_ = $self->_readline)
292 return $Assembly;
293 }
294
295 =head2 write_assembly
296
297 Title : write_assembly
298 Usage : $ass_io->write_assembly($assembly)
299 Function: Write the assembly object in Phrap compatible ACE format
300 Returns : 1 on success, 0 for error
301 Args : A Bio::Assembly::Scaffold object
302
303 =cut
304
305 sub write_assemebly {
306 my $self = shift;
307
308 $self->throw("Writing phrap.out files is not implemented yet! Sorry...");
309 }
310
311 1;
312
313 __END__