Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Assembly/IO/phrap.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ | |
2 # | |
3 # BioPerl driver for phrap.out file | |
4 # | |
5 # Copyright by Robson F. de Souza | |
6 # | |
7 # You may distribute this module under the same terms as perl itself | |
8 # | |
9 # POD documentation - main docs before the code | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::Assembly::IO::phrap - driver to load phrap.out files. | |
14 | |
15 =head1 SYNOPSYS | |
16 | |
17 # Building an input stream | |
18 use Bio::Assembly::IO; | |
19 | |
20 # Assembly loading methods | |
21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out", | |
22 -format=>"phrap"); | |
23 | |
24 $assembly = $io->next_assembly; | |
25 | |
26 =head1 DESCRIPTION | |
27 | |
28 This package was developed to load the phrap.out files from the | |
29 (phred/phrap/consed) package by Phill Green. This files contain just | |
30 the messages printed to standard out by phrap when building an | |
31 assembly. This output is redirected by phredPhrap perl-script to a | |
32 file in the project's directory and hold some bit of information | |
33 regarding assembly quality, connections between contigs and clone's | |
34 position inside contigs. It should be noted that such files have no | |
35 data about the sequence. neither for contig consensus nor for any | |
36 aligned sequence. Anyway, such information may be loaded from Fasta | |
37 files in the projects directory and added to the assembly object | |
38 later. | |
39 | |
40 Note that, because no sequence is loaded for the contig consensus and | |
41 locations for aligned sequences are only given in "ungapped consensus" | |
42 coordinates in a phrap.out file, you can't make coordinate changes in | |
43 assemblies loaded by pharp.pm, unless you add an aligned | |
44 coordinates for each sequence to each contig's features collection | |
45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and | |
46 L<Bio::Assembly::Contig::Feature_collection>.. | |
47 | |
48 This driver also loads singlets into the assembly contigs as Bio::Seq | |
49 objects, altough without their sequence strings. It also adds a | |
50 feature for the entire sequence, thus storing the singlet length in | |
51 its end position, and adds a tag '_nof_trimmed_nonX' to the feature, | |
52 which stores the number of non-vector bases in the singlet. | |
53 | |
54 =head2 Implementation | |
55 | |
56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by | |
57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig | |
58 "_aligned_coord:$seqID" feature class, therefore you can't make | |
59 coordinate changes in contigs loaded by this module. Contig objects | |
60 created by this module will have the following special feature | |
61 classes, identified by their primary tags, in their features | |
62 collection: | |
63 | |
64 "_main_contig_feature:$ID" : main feature for contig $ID. This | |
65 feature is used to store information | |
66 about the entire consensus | |
67 sequence. This feature always start at | |
68 base 1 and its end position is the | |
69 consensus sequence length. A tag, | |
70 'trimmed_length' holds the length of the | |
71 trimmed good quality region inside the | |
72 consensus sequence. | |
73 | |
74 "_covered_region:$index" : coordinates for valid clones inside the | |
75 contig. $index is the covered region | |
76 number, starting at 1 for the covered | |
77 region closest to the consensus sequence | |
78 first base. | |
79 | |
80 "_unalign_coord:$seqID" : location of a sequence in "ungapped | |
81 consensus" coordinates (consensus | |
82 sequence without gaps). Primary and | |
83 secondary scores, indel and | |
84 substitutions statistics are stored as | |
85 feature tags. | |
86 | |
87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be | |
88 used as the unique id for each | |
89 clone. These features have six tags: | |
90 '_1st_name', which is the id of the | |
91 upstream (5') aligned sequence | |
92 delimiting the clone; '_1st_strand', the | |
93 upstream sequence strand in the | |
94 alignment; '_2nd_name', downstream (3') | |
95 sequence id; '_2nd_strand', the | |
96 downstream sequence strand in the | |
97 alignment; '_length', unaligned clone | |
98 length; '_rejected', a boolean flag, | |
99 which is false if the clone is valid and | |
100 true if it was rejected. | |
101 | |
102 All coordinates for the features above are expressed as "ungapped | |
103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>.. | |
104 | |
105 =head2 Feature collection | |
106 | |
107 # | |
108 | |
109 =head1 FEEDBACK | |
110 | |
111 =head2 Mailing Lists | |
112 | |
113 User feedback is an integral part of the evolution of this and other | |
114 Bioperl modules. Send your comments and suggestions preferably to the | |
115 Bioperl mailing lists Your participation is much appreciated. | |
116 | |
117 bioperl-l@bioperl.org - General discussion | |
118 http://bio.perl.org/MailList.html - About the mailing lists | |
119 | |
120 =head2 Reporting Bugs | |
121 | |
122 Report bugs to the Bioperl bug tracking system to help us keep track | |
123 the bugs and their resolution. Bug reports can be submitted via email | |
124 or the web: | |
125 | |
126 bioperl-bugs@bio.perl.org | |
127 http://bugzilla.bioperl.org/ | |
128 | |
129 | |
130 =head1 AUTHOR - Robson Francisco de Souza | |
131 | |
132 Email rfsouza@citri.iq.usp.br | |
133 | |
134 head1 APPENDIX | |
135 | |
136 The rest of the documentation details each of the object | |
137 methods. Internal methods are usually preceded with a _ | |
138 | |
139 =cut | |
140 | |
141 package Bio::Assembly::IO::phrap; | |
142 | |
143 use strict; | |
144 use vars qw(@ISA); | |
145 | |
146 use Bio::Assembly::IO; | |
147 use Bio::Assembly::Scaffold; | |
148 use Bio::Assembly::Contig; | |
149 use Bio::LocatableSeq; | |
150 use Bio::Seq; | |
151 use Bio::SeqFeature::Generic; | |
152 | |
153 @ISA = qw(Bio::Assembly::IO); | |
154 | |
155 =head2 next_assembly | |
156 | |
157 Title : next_assembly | |
158 Usage : $unigene = $stream->next_assembly() | |
159 Function: returns the next assembly in the stream | |
160 Returns : Bio::Assembly::Scaffold object | |
161 Args : NONE | |
162 | |
163 =cut | |
164 | |
165 sub next_assembly { | |
166 my $self = shift; # Package reference | |
167 | |
168 # Resetting assembly data structure | |
169 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); | |
170 | |
171 # Looping over all phrap out file lines | |
172 my ($contigOBJ); | |
173 while ($_ = $self->_readline) { | |
174 chomp; | |
175 | |
176 # Loading exact dupicated reads list | |
177 # /Exact duplicate reads:/ && do { | |
178 # my @exact_dupl; | |
179 # while (<FILE>) { | |
180 # last if (/^\s*$/); | |
181 # /(\S+)\s+(\S+)/ && do { | |
182 # push(@exact_dupl,[$1,$2]); | |
183 # }; | |
184 # $self->{'assembly'}{'exact_dupl_reads'} = | |
185 # new Data::Table(\@exact_dupl,['included','excluded'],0); | |
186 # } | |
187 # }; | |
188 | |
189 # Loading singlets reads data | |
190 /^(\d+) isolated singletons/ && do { | |
191 while ($_ = $self->_readline) { | |
192 chomp; | |
193 last if (/^$/); | |
194 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) { | |
195 my $seqID = $1; my $length = $2; | |
196 my $nof_trimmed_nonX = $3; | |
197 my $seq = new Bio::Seq(-strand=>1, | |
198 -primary_id=>$seqID); | |
199 my $f = Bio::SeqFeature::Generic->new | |
200 (-start=>1, -end=>$seq->length(), | |
201 -primary=>$seq->primary_id(), | |
202 -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX } | |
203 ); | |
204 $seq->add_SeqFeature($f); | |
205 $Assembly->add_singlet($seq); | |
206 } | |
207 } | |
208 }; | |
209 | |
210 # Loading contig information | |
211 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do { | |
212 my $nof_reads = $2; my $length = $3; my $trimmed_length = $4; | |
213 $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap'); | |
214 my $feat = Bio::SeqFeature::Generic->new(-start=>1, | |
215 -end=>$length, | |
216 -primary=>"_main_contig_feature:".$contigOBJ->id(), | |
217 -tag=>{ '_trimmed_length' => $trimmed_length } | |
218 ); | |
219 $contigOBJ->add_features([ $feat ],1); | |
220 $Assembly->add_contig($contigOBJ); | |
221 }; | |
222 | |
223 # Loading read information | |
224 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do { | |
225 my $strand = ($1 eq 'C' ? -1 : 1); | |
226 my $readID = $4; my $start = $2; my $end = $3; | |
227 my $primary_score = $5; my $secondary_score = $6; | |
228 my $substitutions = $7; my $deletions = $8; my $insertions = $9; | |
229 my $seq = Bio::LocatableSeq->new(-start=>$start, | |
230 -end=>$end, | |
231 -strand=>$strand, | |
232 -id=>$readID, | |
233 -primary_id=>$readID, | |
234 -alphabet=>'dna'); | |
235 my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start, | |
236 -end=>$end, | |
237 -primary=>"_unalign_coord:$readID", | |
238 -tag=>{'_primary_score'=>$primary_score, | |
239 '_secondary_score'=>$secondary_score, | |
240 '_substitutions'=>$substitutions, | |
241 '_insertions'=>,$insertions, | |
242 '_deletions'=>$deletions } | |
243 ); | |
244 $unalign_coord->attach_seq($seq); | |
245 $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]); | |
246 }; | |
247 | |
248 # Loading INTERNAL clones description | |
249 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do { | |
250 my $contigID = $1; | |
251 my $contig = $Assembly->get_contig_by_id($contigID); | |
252 while ($_ = $self->_readline) { | |
253 my (@data,$rejected,$c1_strand,$c2_strand); | |
254 | |
255 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do { | |
256 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 } | |
257 $c1_strand = ($data[1] eq 'C' ? -1 : 1); | |
258 $c2_strand = ($data[3] eq 'C' ? -1 : 1); | |
259 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/; | |
260 my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6], | |
261 -end=>$data[7], | |
262 -strand=>0, | |
263 -primary=>"_internal_clone:$clone_name", | |
264 -tag=>{'_1st_strand'=>,$c1_strand, | |
265 '_2nd_strand'=>,$c2_strand, | |
266 '_1st_name'=>$data[2], | |
267 '_2nd_name'=>$data[4], | |
268 '_length'=>$data[5], | |
269 '_rejected'=>$rejected | |
270 } | |
271 ); | |
272 $contig->add_features([ $clone ]); | |
273 }; | |
274 | |
275 /Covered regions:/ && do { | |
276 my %coord = /(\d+)/g; my $i = 0; | |
277 foreach my $start (sort { $a <=> $b } keys %coord) { | |
278 my $cov = Bio::SeqFeature::Generic->new(-start=>$start, | |
279 -end=>$coord{$start}, | |
280 -primary=>'_covered_region:'.++$i | |
281 ); | |
282 # 1: attach feature to contig consensus, if any | |
283 $contig->add_features([ $cov ],1); | |
284 } | |
285 last; # exit while loop | |
286 }; # /Covered regions:/ | |
287 | |
288 } # while ($_ = $self->_readline) | |
289 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ | |
290 | |
291 } # while ($_ = $self->_readline) | |
292 return $Assembly; | |
293 } | |
294 | |
295 =head2 write_assembly | |
296 | |
297 Title : write_assembly | |
298 Usage : $ass_io->write_assembly($assembly) | |
299 Function: Write the assembly object in Phrap compatible ACE format | |
300 Returns : 1 on success, 0 for error | |
301 Args : A Bio::Assembly::Scaffold object | |
302 | |
303 =cut | |
304 | |
305 sub write_assemebly { | |
306 my $self = shift; | |
307 | |
308 $self->throw("Writing phrap.out files is not implemented yet! Sorry..."); | |
309 } | |
310 | |
311 1; | |
312 | |
313 __END__ |