Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/IO/phrap.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl driver for phrap.out file | |
| 4 # | |
| 5 # Copyright by Robson F. de Souza | |
| 6 # | |
| 7 # You may distribute this module under the same terms as perl itself | |
| 8 # | |
| 9 # POD documentation - main docs before the code | |
| 10 | |
| 11 =head1 NAME | |
| 12 | |
| 13 Bio::Assembly::IO::phrap - driver to load phrap.out files. | |
| 14 | |
| 15 =head1 SYNOPSYS | |
| 16 | |
| 17 # Building an input stream | |
| 18 use Bio::Assembly::IO; | |
| 19 | |
| 20 # Assembly loading methods | |
| 21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out", | |
| 22 -format=>"phrap"); | |
| 23 | |
| 24 $assembly = $io->next_assembly; | |
| 25 | |
| 26 =head1 DESCRIPTION | |
| 27 | |
| 28 This package was developed to load the phrap.out files from the | |
| 29 (phred/phrap/consed) package by Phill Green. This files contain just | |
| 30 the messages printed to standard out by phrap when building an | |
| 31 assembly. This output is redirected by phredPhrap perl-script to a | |
| 32 file in the project's directory and hold some bit of information | |
| 33 regarding assembly quality, connections between contigs and clone's | |
| 34 position inside contigs. It should be noted that such files have no | |
| 35 data about the sequence. neither for contig consensus nor for any | |
| 36 aligned sequence. Anyway, such information may be loaded from Fasta | |
| 37 files in the projects directory and added to the assembly object | |
| 38 later. | |
| 39 | |
| 40 Note that, because no sequence is loaded for the contig consensus and | |
| 41 locations for aligned sequences are only given in "ungapped consensus" | |
| 42 coordinates in a phrap.out file, you can't make coordinate changes in | |
| 43 assemblies loaded by pharp.pm, unless you add an aligned | |
| 44 coordinates for each sequence to each contig's features collection | |
| 45 yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and | |
| 46 L<Bio::Assembly::Contig::Feature_collection>.. | |
| 47 | |
| 48 This driver also loads singlets into the assembly contigs as Bio::Seq | |
| 49 objects, altough without their sequence strings. It also adds a | |
| 50 feature for the entire sequence, thus storing the singlet length in | |
| 51 its end position, and adds a tag '_nof_trimmed_nonX' to the feature, | |
| 52 which stores the number of non-vector bases in the singlet. | |
| 53 | |
| 54 =head2 Implementation | |
| 55 | |
| 56 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by | |
| 57 Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig | |
| 58 "_aligned_coord:$seqID" feature class, therefore you can't make | |
| 59 coordinate changes in contigs loaded by this module. Contig objects | |
| 60 created by this module will have the following special feature | |
| 61 classes, identified by their primary tags, in their features | |
| 62 collection: | |
| 63 | |
| 64 "_main_contig_feature:$ID" : main feature for contig $ID. This | |
| 65 feature is used to store information | |
| 66 about the entire consensus | |
| 67 sequence. This feature always start at | |
| 68 base 1 and its end position is the | |
| 69 consensus sequence length. A tag, | |
| 70 'trimmed_length' holds the length of the | |
| 71 trimmed good quality region inside the | |
| 72 consensus sequence. | |
| 73 | |
| 74 "_covered_region:$index" : coordinates for valid clones inside the | |
| 75 contig. $index is the covered region | |
| 76 number, starting at 1 for the covered | |
| 77 region closest to the consensus sequence | |
| 78 first base. | |
| 79 | |
| 80 "_unalign_coord:$seqID" : location of a sequence in "ungapped | |
| 81 consensus" coordinates (consensus | |
| 82 sequence without gaps). Primary and | |
| 83 secondary scores, indel and | |
| 84 substitutions statistics are stored as | |
| 85 feature tags. | |
| 86 | |
| 87 "_internal_clones:$cloneID" : clones inside contigs $cloneID should be | |
| 88 used as the unique id for each | |
| 89 clone. These features have six tags: | |
| 90 '_1st_name', which is the id of the | |
| 91 upstream (5') aligned sequence | |
| 92 delimiting the clone; '_1st_strand', the | |
| 93 upstream sequence strand in the | |
| 94 alignment; '_2nd_name', downstream (3') | |
| 95 sequence id; '_2nd_strand', the | |
| 96 downstream sequence strand in the | |
| 97 alignment; '_length', unaligned clone | |
| 98 length; '_rejected', a boolean flag, | |
| 99 which is false if the clone is valid and | |
| 100 true if it was rejected. | |
| 101 | |
| 102 All coordinates for the features above are expressed as "ungapped | |
| 103 consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>.. | |
| 104 | |
| 105 =head2 Feature collection | |
| 106 | |
| 107 # | |
| 108 | |
| 109 =head1 FEEDBACK | |
| 110 | |
| 111 =head2 Mailing Lists | |
| 112 | |
| 113 User feedback is an integral part of the evolution of this and other | |
| 114 Bioperl modules. Send your comments and suggestions preferably to the | |
| 115 Bioperl mailing lists Your participation is much appreciated. | |
| 116 | |
| 117 bioperl-l@bioperl.org - General discussion | |
| 118 http://bio.perl.org/MailList.html - About the mailing lists | |
| 119 | |
| 120 =head2 Reporting Bugs | |
| 121 | |
| 122 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 123 the bugs and their resolution. Bug reports can be submitted via email | |
| 124 or the web: | |
| 125 | |
| 126 bioperl-bugs@bio.perl.org | |
| 127 http://bugzilla.bioperl.org/ | |
| 128 | |
| 129 | |
| 130 =head1 AUTHOR - Robson Francisco de Souza | |
| 131 | |
| 132 Email rfsouza@citri.iq.usp.br | |
| 133 | |
| 134 head1 APPENDIX | |
| 135 | |
| 136 The rest of the documentation details each of the object | |
| 137 methods. Internal methods are usually preceded with a _ | |
| 138 | |
| 139 =cut | |
| 140 | |
| 141 package Bio::Assembly::IO::phrap; | |
| 142 | |
| 143 use strict; | |
| 144 use vars qw(@ISA); | |
| 145 | |
| 146 use Bio::Assembly::IO; | |
| 147 use Bio::Assembly::Scaffold; | |
| 148 use Bio::Assembly::Contig; | |
| 149 use Bio::LocatableSeq; | |
| 150 use Bio::Seq; | |
| 151 use Bio::SeqFeature::Generic; | |
| 152 | |
| 153 @ISA = qw(Bio::Assembly::IO); | |
| 154 | |
| 155 =head2 next_assembly | |
| 156 | |
| 157 Title : next_assembly | |
| 158 Usage : $unigene = $stream->next_assembly() | |
| 159 Function: returns the next assembly in the stream | |
| 160 Returns : Bio::Assembly::Scaffold object | |
| 161 Args : NONE | |
| 162 | |
| 163 =cut | |
| 164 | |
| 165 sub next_assembly { | |
| 166 my $self = shift; # Package reference | |
| 167 | |
| 168 # Resetting assembly data structure | |
| 169 my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); | |
| 170 | |
| 171 # Looping over all phrap out file lines | |
| 172 my ($contigOBJ); | |
| 173 while ($_ = $self->_readline) { | |
| 174 chomp; | |
| 175 | |
| 176 # Loading exact dupicated reads list | |
| 177 # /Exact duplicate reads:/ && do { | |
| 178 # my @exact_dupl; | |
| 179 # while (<FILE>) { | |
| 180 # last if (/^\s*$/); | |
| 181 # /(\S+)\s+(\S+)/ && do { | |
| 182 # push(@exact_dupl,[$1,$2]); | |
| 183 # }; | |
| 184 # $self->{'assembly'}{'exact_dupl_reads'} = | |
| 185 # new Data::Table(\@exact_dupl,['included','excluded'],0); | |
| 186 # } | |
| 187 # }; | |
| 188 | |
| 189 # Loading singlets reads data | |
| 190 /^(\d+) isolated singletons/ && do { | |
| 191 while ($_ = $self->_readline) { | |
| 192 chomp; | |
| 193 last if (/^$/); | |
| 194 if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) { | |
| 195 my $seqID = $1; my $length = $2; | |
| 196 my $nof_trimmed_nonX = $3; | |
| 197 my $seq = new Bio::Seq(-strand=>1, | |
| 198 -primary_id=>$seqID); | |
| 199 my $f = Bio::SeqFeature::Generic->new | |
| 200 (-start=>1, -end=>$seq->length(), | |
| 201 -primary=>$seq->primary_id(), | |
| 202 -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX } | |
| 203 ); | |
| 204 $seq->add_SeqFeature($f); | |
| 205 $Assembly->add_singlet($seq); | |
| 206 } | |
| 207 } | |
| 208 }; | |
| 209 | |
| 210 # Loading contig information | |
| 211 /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do { | |
| 212 my $nof_reads = $2; my $length = $3; my $trimmed_length = $4; | |
| 213 $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap'); | |
| 214 my $feat = Bio::SeqFeature::Generic->new(-start=>1, | |
| 215 -end=>$length, | |
| 216 -primary=>"_main_contig_feature:".$contigOBJ->id(), | |
| 217 -tag=>{ '_trimmed_length' => $trimmed_length } | |
| 218 ); | |
| 219 $contigOBJ->add_features([ $feat ],1); | |
| 220 $Assembly->add_contig($contigOBJ); | |
| 221 }; | |
| 222 | |
| 223 # Loading read information | |
| 224 /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do { | |
| 225 my $strand = ($1 eq 'C' ? -1 : 1); | |
| 226 my $readID = $4; my $start = $2; my $end = $3; | |
| 227 my $primary_score = $5; my $secondary_score = $6; | |
| 228 my $substitutions = $7; my $deletions = $8; my $insertions = $9; | |
| 229 my $seq = Bio::LocatableSeq->new(-start=>$start, | |
| 230 -end=>$end, | |
| 231 -strand=>$strand, | |
| 232 -id=>$readID, | |
| 233 -primary_id=>$readID, | |
| 234 -alphabet=>'dna'); | |
| 235 my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start, | |
| 236 -end=>$end, | |
| 237 -primary=>"_unalign_coord:$readID", | |
| 238 -tag=>{'_primary_score'=>$primary_score, | |
| 239 '_secondary_score'=>$secondary_score, | |
| 240 '_substitutions'=>$substitutions, | |
| 241 '_insertions'=>,$insertions, | |
| 242 '_deletions'=>$deletions } | |
| 243 ); | |
| 244 $unalign_coord->attach_seq($seq); | |
| 245 $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]); | |
| 246 }; | |
| 247 | |
| 248 # Loading INTERNAL clones description | |
| 249 /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do { | |
| 250 my $contigID = $1; | |
| 251 my $contig = $Assembly->get_contig_by_id($contigID); | |
| 252 while ($_ = $self->_readline) { | |
| 253 my (@data,$rejected,$c1_strand,$c2_strand); | |
| 254 | |
| 255 (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do { | |
| 256 if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 } | |
| 257 $c1_strand = ($data[1] eq 'C' ? -1 : 1); | |
| 258 $c2_strand = ($data[3] eq 'C' ? -1 : 1); | |
| 259 (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/; | |
| 260 my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6], | |
| 261 -end=>$data[7], | |
| 262 -strand=>0, | |
| 263 -primary=>"_internal_clone:$clone_name", | |
| 264 -tag=>{'_1st_strand'=>,$c1_strand, | |
| 265 '_2nd_strand'=>,$c2_strand, | |
| 266 '_1st_name'=>$data[2], | |
| 267 '_2nd_name'=>$data[4], | |
| 268 '_length'=>$data[5], | |
| 269 '_rejected'=>$rejected | |
| 270 } | |
| 271 ); | |
| 272 $contig->add_features([ $clone ]); | |
| 273 }; | |
| 274 | |
| 275 /Covered regions:/ && do { | |
| 276 my %coord = /(\d+)/g; my $i = 0; | |
| 277 foreach my $start (sort { $a <=> $b } keys %coord) { | |
| 278 my $cov = Bio::SeqFeature::Generic->new(-start=>$start, | |
| 279 -end=>$coord{$start}, | |
| 280 -primary=>'_covered_region:'.++$i | |
| 281 ); | |
| 282 # 1: attach feature to contig consensus, if any | |
| 283 $contig->add_features([ $cov ],1); | |
| 284 } | |
| 285 last; # exit while loop | |
| 286 }; # /Covered regions:/ | |
| 287 | |
| 288 } # while ($_ = $self->_readline) | |
| 289 }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ | |
| 290 | |
| 291 } # while ($_ = $self->_readline) | |
| 292 return $Assembly; | |
| 293 } | |
| 294 | |
| 295 =head2 write_assembly | |
| 296 | |
| 297 Title : write_assembly | |
| 298 Usage : $ass_io->write_assembly($assembly) | |
| 299 Function: Write the assembly object in Phrap compatible ACE format | |
| 300 Returns : 1 on success, 0 for error | |
| 301 Args : A Bio::Assembly::Scaffold object | |
| 302 | |
| 303 =cut | |
| 304 | |
| 305 sub write_assemebly { | |
| 306 my $self = shift; | |
| 307 | |
| 308 $self->throw("Writing phrap.out files is not implemented yet! Sorry..."); | |
| 309 } | |
| 310 | |
| 311 1; | |
| 312 | |
| 313 __END__ |
