Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Assembly/IO/ace.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ | |
| 2 # | |
| 3 ## BioPerl module for Bio::Assembly::IO::ace | |
| 4 # | |
| 5 # Copyright by Robson F. de Souza | |
| 6 # | |
| 7 # You may distribute this module under the same terms as perl itself | |
| 8 | |
| 9 # POD documentation - main docs before the code | |
| 10 | |
| 11 =head1 NAME | |
| 12 | |
| 13 Bio::Assembly::IO::ace - module to load phrap ACE files. | |
| 14 | |
| 15 =head1 SYNOPSYS | |
| 16 | |
| 17 # Building an input stream | |
| 18 use Bio::Assembly::IO; | |
| 19 | |
| 20 # Assembly loading methods | |
| 21 $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1", | |
| 22 -format=>"ace"); | |
| 23 | |
| 24 $assembly = $io->next_assembly; | |
| 25 | |
| 26 =head1 DESCRIPTION | |
| 27 | |
| 28 This package loads the ACE files from the (phred/phrap/consed) package | |
| 29 by Phill Green. It was written to be used as a driver module for | |
| 30 Bio::Assembly::IO input/output. | |
| 31 | |
| 32 =head2 Implemention | |
| 33 | |
| 34 Assemblies are loaded into Bio::Assembly::Scaffold objects composed by | |
| 35 Bio::Assembly::Contig objects. In addition to default | |
| 36 "_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig | |
| 37 objects loaded by this module will have the following special feature | |
| 38 classes in their feature collection: | |
| 39 | |
| 40 "_align_clipping:$seqID" : location of subsequence in sequence $seqID | |
| 41 which is aligned to the contig | |
| 42 | |
| 43 "_quality_clipping:$seqID" : location of good quality subsequence for | |
| 44 sequence $seqID | |
| 45 | |
| 46 "consensus tags", as they are called in Consed's documentation, is | |
| 47 equivalent to a bioperl sequence feature and, therefore, are added to | |
| 48 the feature collection using their type field (see Consed's README.txt | |
| 49 file) as primary tag. | |
| 50 | |
| 51 "read tags" are also sequence features and are stored as | |
| 52 sub_SeqFeatures of the sequence's coordinate feature (the | |
| 53 corresponding "_aligned_coord:$seqID" feature, easily accessed through | |
| 54 get_seq_coord() method). | |
| 55 | |
| 56 "whole assembly tags" have no start and end, as they are not | |
| 57 associated to any particular sequence in the assembly, and are added | |
| 58 to the assembly's annotation collection using phrap as tag. | |
| 59 | |
| 60 =head1 FEEDBACK | |
| 61 | |
| 62 =head2 Mailing Lists | |
| 63 | |
| 64 User feedback is an integral part of the evolution of this and other | |
| 65 Bioperl modules. Send your comments and suggestions preferably to the | |
| 66 Bioperl mailing lists Your participation is much appreciated. | |
| 67 | |
| 68 bioperl-l@bioperl.org - General discussion | |
| 69 http://bio.perl.org/MailList.html - About the mailing lists | |
| 70 | |
| 71 =head2 Reporting Bugs | |
| 72 | |
| 73 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 74 the bugs and their resolution. Bug reports can be submitted via email | |
| 75 or the web: | |
| 76 | |
| 77 bioperl-bugs@bio.perl.org | |
| 78 http://bugzilla.bioperl.org/ | |
| 79 | |
| 80 =head1 AUTHOR - Robson Francisco de Souza | |
| 81 | |
| 82 Email rfsouza@citri.iq.usp.br | |
| 83 | |
| 84 =head1 APPENDIX | |
| 85 | |
| 86 The rest of the documentation details each of the object | |
| 87 methods. Internal methods are usually preceded with a _ | |
| 88 | |
| 89 =cut | |
| 90 | |
| 91 package Bio::Assembly::IO::ace; | |
| 92 | |
| 93 use strict; | |
| 94 use vars qw(@ISA); | |
| 95 | |
| 96 use Bio::Assembly::IO; | |
| 97 use Bio::Assembly::Scaffold; | |
| 98 use Bio::Assembly::Contig; | |
| 99 use Bio::LocatableSeq; | |
| 100 use Bio::Annotation::SimpleValue; | |
| 101 use Bio::Seq::PrimaryQual; | |
| 102 use Bio::SeqFeature::Generic; | |
| 103 | |
| 104 @ISA = qw(Bio::Assembly::IO); | |
| 105 | |
| 106 =head1 Parser methods | |
| 107 | |
| 108 =head2 next_assembly | |
| 109 | |
| 110 Title : next_assembly | |
| 111 Usage : $unigene = $stream->next_assembly() | |
| 112 Function: returns the next assembly in the stream | |
| 113 Returns : Bio::Assembly::Scaffold object | |
| 114 Args : NONE | |
| 115 | |
| 116 =cut | |
| 117 | |
| 118 sub next_assembly { | |
| 119 my $self = shift; # Object reference | |
| 120 local $/="\n"; | |
| 121 | |
| 122 # Resetting assembly data structure | |
| 123 my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); | |
| 124 | |
| 125 # Looping over all ACE file lines | |
| 126 my ($contigOBJ,$read_name); | |
| 127 my $read_data = {}; # Temporary holder for read data | |
| 128 while ($_ = $self->_readline) { # By now, ACE files hold a single assembly | |
| 129 chomp; | |
| 130 | |
| 131 # Loading assembly information (ASsembly field) | |
| 132 # (/^AS (\d+) (\d+)/) && do { | |
| 133 # $assembly->_set_nof_contigs($1); | |
| 134 # $assembly->_set_nof_sequences_in_contigs($2); | |
| 135 # }; | |
| 136 | |
| 137 # Loading contig sequence (COntig sequence field) | |
| 138 (/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found! | |
| 139 my $contigID = $1; | |
| 140 $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID); | |
| 141 # $contigOBJ->set_nof_bases($2); # Contig length in base pairs | |
| 142 # $contigOBJ->set_nof_reads($3); # Number of reads in this contig | |
| 143 # $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly | |
| 144 my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default) | |
| 145 $contigOBJ->strand($ori); | |
| 146 my $consensus_sequence = undef; | |
| 147 while ($_ = $self->_readline) { # Looping over contig lines | |
| 148 chomp; # Drop <ENTER> (\n) on current line | |
| 149 last if (/^$/); # Stop if empty line (contig end) is found | |
| 150 s/\*/-/g; # Forcing '-' as gap symbol | |
| 151 $consensus_sequence .= $_; | |
| 152 } | |
| 153 | |
| 154 my $consensus_length = length($consensus_sequence); | |
| 155 $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence, | |
| 156 -start=>1, | |
| 157 -end=>$consensus_length, | |
| 158 -id=>$contigID); | |
| 159 $contigOBJ->set_consensus_sequence($consensus_sequence); | |
| 160 $assembly->add_contig($contigOBJ); | |
| 161 }; | |
| 162 | |
| 163 # Loading contig qualities... (Base Quality field) | |
| 164 /^BQ/ && do { | |
| 165 my $consensus = $contigOBJ->get_consensus_sequence()->seq(); | |
| 166 my ($i,$j,@tmp); | |
| 167 my @quality = (); | |
| 168 $j = 0; | |
| 169 while ($_ = $self->_readline) { | |
| 170 chomp; | |
| 171 last if (/^$/); | |
| 172 @tmp = grep { /^\d+$/ } split(/\s+/); | |
| 173 $i = 0; | |
| 174 my $previous = 0; | |
| 175 my $next = 0; | |
| 176 while ($i<=$#tmp) { | |
| 177 # IF base is a gap, quality is the average for neighbouring sites | |
| 178 if (substr($consensus,$j,1) eq '-') { | |
| 179 $previous = $tmp[$i-1] unless ($i == 0); | |
| 180 if ($i < $#tmp) { | |
| 181 $next = $tmp[$i+1]; | |
| 182 } else { | |
| 183 $next = 0; | |
| 184 } | |
| 185 push(@quality,int(($previous+$next)/2)); | |
| 186 } else { | |
| 187 push(@quality,$tmp[$i]); | |
| 188 $i++; | |
| 189 } | |
| 190 $j++; | |
| 191 } | |
| 192 } | |
| 193 | |
| 194 my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality), | |
| 195 -id=>$contigOBJ->id()); | |
| 196 $contigOBJ->set_consensus_quality($qual); | |
| 197 }; | |
| 198 | |
| 199 # Loading read info... (Assembled From field) | |
| 200 /^AF (\S+) (C|U) (-*\d+)/ && do { | |
| 201 $read_name = $1; my $ori = $2; | |
| 202 $read_data->{$read_name}{'padded_start'} = $3; # aligned start | |
| 203 $ori = ( $ori eq 'U' ? 1 : -1); | |
| 204 $read_data->{$read_name}{'strand'} = $ori; | |
| 205 }; | |
| 206 | |
| 207 # Loading base segments definitions (Base Segment field) | |
| 208 # /^BS (\d+) (\d+) (\S+)/ && do { | |
| 209 # if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) { | |
| 210 # $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2; | |
| 211 # } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 } | |
| 212 # }; | |
| 213 | |
| 214 # Loading reads... (ReaD sequence field | |
| 215 /^RD (\S+) (-*\d+) (\d+) (\d+)/ && do { | |
| 216 $read_name = $1; | |
| 217 $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases | |
| 218 $read_data->{$read_name}{'contig'} = $contigOBJ; | |
| 219 # $read_data->{$read_name}{'number_of_read_info_items'} = $3; | |
| 220 # $read_data->{$read_name}{'number_of_tags'} = $4; | |
| 221 my $read_sequence; | |
| 222 while ($_ = $self->_readline) { | |
| 223 chomp; | |
| 224 last if (/^$/); | |
| 225 s/\*/-/g; # Forcing '-' as gap symbol | |
| 226 $read_sequence .= $_; # aligned read sequence | |
| 227 } | |
| 228 my $read = Bio::LocatableSeq->new(-seq=>$read_sequence, | |
| 229 -start=>1, | |
| 230 -end=>$read_data->{$read_name}{'length'}, | |
| 231 -strand=>$read_data->{$read_name}{'strand'}, | |
| 232 -id=>$read_name, | |
| 233 -primary_id=>$read_name, | |
| 234 -alphabet=>'dna'); | |
| 235 | |
| 236 # Adding read location and sequence to contig ("gapped consensus" coordinates) | |
| 237 my $padded_start = $read_data->{$read_name}{'padded_start'}; | |
| 238 my $padded_end = $padded_start + $read_data->{$read_name}{'length'} - 1; | |
| 239 my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start, | |
| 240 -end=>$padded_end, | |
| 241 -strand=>$read_data->{$read_name}{'strand'}, | |
| 242 -tag => { 'contig' => $contigOBJ->id } | |
| 243 ); | |
| 244 $contigOBJ->set_seq_coord($coord,$read); | |
| 245 }; | |
| 246 | |
| 247 # Loading read trimming and alignment ranges... | |
| 248 /^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do { | |
| 249 my $qual_start = $1; my $qual_end = $2; | |
| 250 my $align_start = $3; my $align_end = $4; | |
| 251 | |
| 252 unless ($align_start == -1 && $align_end == -1) { | |
| 253 $align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start); | |
| 254 $align_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end); | |
| 255 my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start, | |
| 256 -end=>$align_end, | |
| 257 -strand=>$read_data->{$read_name}{'strand'}, | |
| 258 -primary=>"_align_clipping:$read_name"); | |
| 259 $align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); | |
| 260 $contigOBJ->add_features([ $align_feat ], 0); | |
| 261 } | |
| 262 | |
| 263 unless ($qual_start == -1 && $qual_end == -1) { | |
| 264 $qual_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start); | |
| 265 $qual_end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end); | |
| 266 my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start, | |
| 267 -end=>$qual_end, | |
| 268 -strand=>$read_data->{$read_name}{'strand'}, | |
| 269 -primary=>"_quality_clipping:$read_name"); | |
| 270 $qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) ); | |
| 271 $contigOBJ->add_features([ $qual_feat ], 0); | |
| 272 } | |
| 273 }; | |
| 274 | |
| 275 # Loading read description (DeScription fields) | |
| 276 # /^DS / && do { | |
| 277 # /CHEM: (\S+)/ && do { | |
| 278 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1; | |
| 279 # }; | |
| 280 # /CHROMAT_FILE: (\S+)/ && do { | |
| 281 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1; | |
| 282 # }; | |
| 283 # /DIRECTION: (\w+)/ && do { | |
| 284 # my $ori = $1; | |
| 285 # if ($ori eq 'rev') { $ori = 'C' } | |
| 286 # elsif ($ori eq 'fwd') { $ori = 'U' } | |
| 287 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori; | |
| 288 # }; | |
| 289 # /DYE: (\S+)/ && do { | |
| 290 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1; | |
| 291 # }; | |
| 292 # /PHD_FILE: (\S+)/ && do { | |
| 293 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1; | |
| 294 # }; | |
| 295 # /TEMPLATE: (\S+)/ && do { | |
| 296 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1; | |
| 297 # }; | |
| 298 # /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do { | |
| 299 # $self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1; | |
| 300 # }; | |
| 301 # }; | |
| 302 | |
| 303 # Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features) | |
| 304 /^CT\s*\{/ && do { | |
| 305 my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); | |
| 306 $contigID =~ s/^Contig//i; | |
| 307 my $extra_info = undef; | |
| 308 while ($_ = $self->_readline) { | |
| 309 last if (/\}/); | |
| 310 $extra_info .= $_; | |
| 311 } | |
| 312 my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start, | |
| 313 -end=>$end, | |
| 314 -primary=>$type, | |
| 315 -tag=>{ 'source' => $source, | |
| 316 'creation_date' => $date, | |
| 317 'extra_info' => $extra_info | |
| 318 }); | |
| 319 $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1); | |
| 320 }; | |
| 321 | |
| 322 # Loading read tags | |
| 323 /^RT\s*\{/ && do { | |
| 324 my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline); | |
| 325 my $extra_info = undef; | |
| 326 while ($_ = $self->_readline) { | |
| 327 last if (/\}/); | |
| 328 $extra_info .= $_; | |
| 329 } | |
| 330 $start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start); | |
| 331 $end = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end); | |
| 332 my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start, | |
| 333 -end=>$end, | |
| 334 -primary=>$type, | |
| 335 -tag=>{ 'source' => $source, | |
| 336 'creation_date' => $date, | |
| 337 'extra_info' => $extra_info | |
| 338 }); | |
| 339 my $contig = $read_data->{$readID}{'contig'}; | |
| 340 my $coord = $contig->get_seq_coord( $contig->get_seq_by_name($readID) ); | |
| 341 $coord->add_sub_SeqFeature($read_tag); | |
| 342 }; | |
| 343 | |
| 344 # Loading read tags | |
| 345 /^WA\s*\{/ && do { | |
| 346 my ($type,$source,$date) = split(' ',$self->_readline); | |
| 347 my $extra_info = undef; | |
| 348 while ($_ = $self->_readline) { | |
| 349 last if (/\}/); | |
| 350 $extra_info = $_; | |
| 351 } | |
| 352 #? push(@line,\@extra_info); | |
| 353 my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source, | |
| 354 "DATE:",$date,"DATA:",$extra_info); | |
| 355 $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag); | |
| 356 $assembly->annotation->add_Annotation('phrap',$assembly_tag); | |
| 357 }; | |
| 358 | |
| 359 } # while ($_ = $self->_readline) | |
| 360 | |
| 361 $assembly->update_seq_list(); | |
| 362 return $assembly; | |
| 363 } | |
| 364 | |
| 365 =head2 write_assembly | |
| 366 | |
| 367 Title : write_assembly | |
| 368 Usage : $ass_io->write_assembly($assembly) | |
| 369 Function: Write the assembly object in Phrap compatible ACE format | |
| 370 Returns : 1 on success, 0 for error | |
| 371 Args : A Bio::Assembly::Scaffold object | |
| 372 | |
| 373 =cut | |
| 374 | |
| 375 sub write_assemebly { | |
| 376 my $self = shift; | |
| 377 | |
| 378 $self->throw("Writing phrap ACE files is not implemented yet! Sorry..."); | |
| 379 } | |
| 380 | |
| 381 1; | |
| 382 | |
| 383 __END__ |
