Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Lucy.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::Lucy | |
| 4 # | |
| 5 # Copyright Her Majesty the Queen of England | |
| 6 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with | |
| 7 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # POD documentation - main docs before the code | |
| 11 | |
| 12 =head1 NAME | |
| 13 | |
| 14 Bio::Tools::Lucy - Object for analyzing the output from Lucy, | |
| 15 a vector and quality trimming program from TIGR | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 # Create the Lucy object from an existing Lucy output file | |
| 20 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1); | |
| 21 $lucyObj = Bio::Tools::Lucy->new(@params); | |
| 22 | |
| 23 # Get names of all sequences | |
| 24 $names = $lucyObj->get_sequence_names(); | |
| 25 | |
| 26 # Print seq and qual values for sequences >400 bp in order to run CAP3 | |
| 27 foreach $name (@$names) { | |
| 28 next unless $lucyObj->length_clear($name) > 400; | |
| 29 print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; | |
| 30 print QUAL ">$name\n", $lucyObj->quality($name), "\n"; | |
| 31 } | |
| 32 | |
| 33 # Get an array of Bio::PrimarySeq objects | |
| 34 @seqObjs = $lucyObj->get_Seq_Objs(); | |
| 35 | |
| 36 | |
| 37 =head1 DESCRIPTION | |
| 38 | |
| 39 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and | |
| 40 quality values generated by Lucy program from TIGR. | |
| 41 | |
| 42 Lucy will identify vector, poly-A/T tails, and poor quality regions in | |
| 43 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf) | |
| 44 | |
| 45 The input to Lucy can be the Phred sequence and quality files | |
| 46 generated from running Phred on a set of chromatograms. | |
| 47 | |
| 48 Lucy can be obtained (free of charge to academic users) from | |
| 49 www.tigr.org/softlab | |
| 50 | |
| 51 There are a few methods that will only be available if you make some | |
| 52 minor changes to the source for Lucy and then recompile. The changes | |
| 53 are in the 'lucy.c' file and there is a diff between the original and | |
| 54 the modified file in the Appendix | |
| 55 | |
| 56 Please contact the author of this module if you have any problems | |
| 57 making these modifications. | |
| 58 | |
| 59 You do not have to make these modifications to use this module. | |
| 60 | |
| 61 =head2 Creating a Lucy object | |
| 62 | |
| 63 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1, | |
| 64 'fwd_desig' => '_F', 'rev_desig' => '_R'); | |
| 65 $lucyObj = Bio::Tools::Lucy->new(@params); | |
| 66 | |
| 67 =head2 Using a Lucy object | |
| 68 | |
| 69 You should get an array with the sequence names in order to use | |
| 70 accessor methods. Note: The Lucy binary program will fail unless | |
| 71 the sequence names provided as input are unique. | |
| 72 | |
| 73 $names_ref = $lucyObj->get_sequence_names(); | |
| 74 | |
| 75 This code snippet will produce a Fasta format file with sequence | |
| 76 lengths and %GC in the description line. | |
| 77 | |
| 78 foreach $name (@$names) { | |
| 79 print FILE ">$name\t", | |
| 80 $lucyObj->length_clear($name), "\t", | |
| 81 $lucyObj->per_GC($name), "\n", | |
| 82 $lucyObj->sequence($name), "\n"; | |
| 83 } | |
| 84 | |
| 85 | |
| 86 Print seq and qual values for sequences >400 bp in order to assemble | |
| 87 them with CAP3 (or other assembler). | |
| 88 | |
| 89 foreach $name (@$names) { | |
| 90 next unless $lucyObj->length_clear($name) > 400; | |
| 91 print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; | |
| 92 print QUAL ">$name\n", $lucyObj->quality($name), "\n"; | |
| 93 } | |
| 94 | |
| 95 Get all the sequences as Bio::PrimarySeq objects (eg., for use with | |
| 96 Bio::Tools::Blast to perform BLAST). | |
| 97 | |
| 98 @seqObjs = $lucyObj->get_Seq_Objs(); | |
| 99 | |
| 100 Or use only those sequences that are full length and have a Poly-A | |
| 101 tail. | |
| 102 | |
| 103 foreach $name (@$names) { | |
| 104 next unless ($lucyObj->full_length($name) and $lucy->polyA($name)); | |
| 105 push @seqObjs, $lucyObj->get_Seq_Obj($name); | |
| 106 } | |
| 107 | |
| 108 | |
| 109 Get the names of those sequences that were rejected by Lucy. | |
| 110 | |
| 111 $rejects_ref = $lucyObj->get_rejects(); | |
| 112 | |
| 113 Print the names of the rejects and 1 letter code for reason they | |
| 114 were rejected. | |
| 115 | |
| 116 foreach $key (sort keys %$rejects_ref) { | |
| 117 print "$key: ", $rejects_ref->{$key}; | |
| 118 } | |
| 119 | |
| 120 There is a lot of other information available about the sequences | |
| 121 analyzed by Lucy (see APPENDIX). This module can be used with the | |
| 122 DBI module to store this sequence information in a database. | |
| 123 | |
| 124 =head1 FEEDBACK | |
| 125 | |
| 126 =head2 Mailing Lists | |
| 127 | |
| 128 User feedback is an integral part of the evolution of this and other | |
| 129 Bioperl modules. Send your comments and suggestions preferably to one | |
| 130 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 131 | |
| 132 bioperl-l@bioperl.org - General discussion | |
| 133 http://bio.perl.org/MailList.html - About the mailing lists | |
| 134 | |
| 135 =head2 Reporting Bugs | |
| 136 | |
| 137 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 138 the bugs and their resolution. Bug reports can be submitted via email | |
| 139 or the web: | |
| 140 | |
| 141 bioperl-bugs@bio.perl.org | |
| 142 http://bugzilla.bioperl.org/ | |
| 143 | |
| 144 =head1 AUTHOR | |
| 145 | |
| 146 Andrew G. Walsh paeruginosa@hotmail.com | |
| 147 | |
| 148 =head1 APPENDIX | |
| 149 | |
| 150 Methods available to Lucy objects are described below. Please note | |
| 151 that any method beginning with an underscore is considered internal | |
| 152 and should not be called directly. | |
| 153 | |
| 154 =cut | |
| 155 | |
| 156 | |
| 157 package Bio::Tools::Lucy; | |
| 158 | |
| 159 use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD); | |
| 160 use strict; | |
| 161 use Bio::PrimarySeq; | |
| 162 use Bio::Root::Root; | |
| 163 use Bio::Root::IO; | |
| 164 | |
| 165 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
| 166 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr); | |
| 167 foreach my $attr (@ATTR) { | |
| 168 $OK_FIELD{$attr}++ | |
| 169 } | |
| 170 $VERSION = "0.01"; | |
| 171 | |
| 172 sub AUTOLOAD { | |
| 173 my $self = shift; | |
| 174 my $attr = $AUTOLOAD; | |
| 175 $attr =~ s/.*:://; | |
| 176 $attr = lc $attr; | |
| 177 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; | |
| 178 $self->{$attr} = shift if @_; | |
| 179 return $self->{$attr}; | |
| 180 } | |
| 181 | |
| 182 =head2 new | |
| 183 | |
| 184 Title : new | |
| 185 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R', | |
| 186 fwd_desig => '_F') | |
| 187 Function: creates a Lucy object from Lucy analysis files | |
| 188 Returns : reference to Bio::Tools::Lucy object | |
| 189 Args : seqfile Fasta sequence file generated by Lucy | |
| 190 qualfile Quality values file generated by Lucy | |
| 191 infofile Info file created when Lucy is run with -debug 'infofile' option | |
| 192 stderrfile Standard error captured from Lucy when Lucy is run | |
| 193 with -info option and STDERR is directed to stderrfile | |
| 194 (ie. lucy ... 2> stderrfile). | |
| 195 Info in this file will include sequences dropped for low | |
| 196 quality. If you've modified Lucy source (see adv_stderr below), | |
| 197 it will also include info on which sequences were dropped because | |
| 198 they were vector, too short, had no insert, and whether a poly-A | |
| 199 tail was found (if Lucy was run with -cdna option). | |
| 200 lucy_verbose verbosity level (0-1). | |
| 201 fwd_desig The string used to determine whether sequence is a forward read. | |
| 202 The parser will assume that this match will occus at the | |
| 203 end of the sequence name string. | |
| 204 rev_desig As above, for reverse reads. | |
| 205 adv_stderr Can be set to a true value (1). Will only work if you have modified | |
| 206 the Lucy source code as outlined in DESCRIPTION and capture | |
| 207 the standard error from Lucy. | |
| 208 | |
| 209 If you don't provide filenames for qualfile, infofile or stderrfile, | |
| 210 the module will assume that .qual, .info, and .stderr are the file | |
| 211 extensions and search in the same directory as the .seq file for these | |
| 212 files. | |
| 213 | |
| 214 For example, if you create a Lucy object with $lucyObj = | |
| 215 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will | |
| 216 find lucy.qual, lucy.info and lucy.stderr. | |
| 217 | |
| 218 You can omit any or all of the quality, info or stderr files, but you | |
| 219 will not be able to use all of the object methods (see method | |
| 220 documentation below). | |
| 221 | |
| 222 =cut | |
| 223 | |
| 224 sub new { | |
| 225 my ($class,@args) = @_; | |
| 226 my $self = $class->SUPER::new(@args); | |
| 227 my ($attr, $value); | |
| 228 while (@args) { | |
| 229 $attr = shift @args; | |
| 230 $attr = lc $attr; | |
| 231 $value = shift @args; | |
| 232 $self->{$attr} = $value; | |
| 233 } | |
| 234 &_parse($self); | |
| 235 return $self; | |
| 236 } | |
| 237 | |
| 238 =head2 _parse | |
| 239 | |
| 240 Title : _parse | |
| 241 Usage : n/a (internal function) | |
| 242 Function: called by new() to parse Lucy output files | |
| 243 Returns : nothing | |
| 244 Args : none | |
| 245 | |
| 246 =cut | |
| 247 | |
| 248 sub _parse { | |
| 249 my $self = shift; | |
| 250 $self->{seqfile} =~ /^(\S+)\.\S+$/; | |
| 251 my $file = $1; | |
| 252 | |
| 253 print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose}; | |
| 254 open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}"); | |
| 255 my ($name, $line); | |
| 256 my $seq = ""; | |
| 257 my @lines = <SEQ>; | |
| 258 while ($line = pop @lines) { | |
| 259 chomp $line; | |
| 260 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) { | |
| 261 $name = $1; | |
| 262 if ($self->{fwd_desig}) { | |
| 263 $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/; | |
| 264 } | |
| 265 if ($self->{rev_desig}) { | |
| 266 $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/; | |
| 267 } | |
| 268 $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4 | |
| 269 $self->{sequences}{$name}{max_clone_len} = $3; | |
| 270 $self->{sequences}{$name}{med_clone_len} = $4; | |
| 271 $self->{sequences}{$name}{beg_clear} = $5; | |
| 272 $self->{sequences}{$name}{end_clear} = $6; | |
| 273 $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong. | |
| 274 my $beg = $5-1; # substr function begins with index 0 | |
| 275 $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg); | |
| 276 my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//; | |
| 277 my $countGC = $seq =~ tr/[GC]//; | |
| 278 $self->{sequences}{$name}{per_GC} = $countGC/$count * 100; | |
| 279 $seq = ""; | |
| 280 } | |
| 281 else { | |
| 282 $seq = $line.$seq; | |
| 283 } | |
| 284 } | |
| 285 | |
| 286 | |
| 287 # now parse quality values (check for presence of quality file first) | |
| 288 if ($self->{qualfile}) { | |
| 289 open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}"); | |
| 290 @lines = <QUAL>; | |
| 291 } | |
| 292 elsif (-e "$file.qual") { | |
| 293 print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose}; | |
| 294 $self->qualfile("$file.qual"); | |
| 295 open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual"); | |
| 296 @lines = <QUAL>; | |
| 297 } | |
| 298 else { | |
| 299 print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
| 300 @lines = (); | |
| 301 } | |
| 302 | |
| 303 my (@vals, @slice, $num, $tot, $vals); | |
| 304 my $qual = ""; | |
| 305 while ($line = pop @lines) { | |
| 306 chomp $line; | |
| 307 if ($line =~ /^>(\S+)/) { | |
| 308 $name = $1; | |
| 309 @vals = split /\s/ , $qual; | |
| 310 @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}]; | |
| 311 $vals = join "\t", @slice; | |
| 312 $self->{sequences}{$name}{quality} = $vals; | |
| 313 $qual = ""; | |
| 314 foreach $num (@slice) { | |
| 315 $tot += $num; | |
| 316 } | |
| 317 $num = @slice; | |
| 318 $self->{sequences}{$name}{avg_quality} = $tot/$num; | |
| 319 $tot = 0; | |
| 320 } | |
| 321 else { | |
| 322 $qual = $line.$qual; | |
| 323 } | |
| 324 } | |
| 325 | |
| 326 # determine whether reads are full length | |
| 327 | |
| 328 if ($self->{infofile}) { | |
| 329 open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}"); | |
| 330 @lines = <INFO>; | |
| 331 } | |
| 332 elsif (-e "$file.info") { | |
| 333 print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose}; | |
| 334 $self->infofile("$file.info"); | |
| 335 open INFO, "$file.info" or $self->throw("Could not open info file: $file.info"); | |
| 336 @lines = <INFO>; | |
| 337 } | |
| 338 else { | |
| 339 print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
| 340 @lines = (); | |
| 341 } | |
| 342 | |
| 343 foreach (@lines) { | |
| 344 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/; | |
| 345 if ($2>0 && $3>0) { | |
| 346 $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too | |
| 347 } | |
| 348 } | |
| 349 | |
| 350 | |
| 351 # parse rejects (and presence of poly-A if Lucy has been modified) | |
| 352 | |
| 353 if ($self->{stderrfile}) { | |
| 354 open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}"); | |
| 355 @lines = <STDERR_LUCY>; | |
| 356 | |
| 357 } | |
| 358 elsif (-e "$file.stderr") { | |
| 359 print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose}; | |
| 360 $self->stderrfile("$file.stderr"); | |
| 361 open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr"); | |
| 362 @lines = <STDERR_LUCY>; | |
| 363 } | |
| 364 else { | |
| 365 print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
| 366 @lines = (); | |
| 367 } | |
| 368 | |
| 369 if ($self->{adv_stderr}) { | |
| 370 foreach (@lines) { | |
| 371 $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/; | |
| 372 $self->{reject}{$1} = "V" if /Vector: (\S+)/; | |
| 373 $self->{reject}{$1} = "E" if /Empty: (\S+)/; | |
| 374 $self->{reject}{$1} = "S" if /Short: (\S+)/; | |
| 375 $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/; | |
| 376 if (/Dropped PolyA: (\S+)/) { | |
| 377 $self->{reject}{$1} = "P"; | |
| 378 delete $self->{sequences}{$1}; | |
| 379 } | |
| 380 } | |
| 381 } | |
| 382 else { | |
| 383 foreach (@lines) { | |
| 384 $self->{reject}{$1} = "R" if /dropping\s+(\S+)/; | |
| 385 } | |
| 386 } | |
| 387 | |
| 388 } | |
| 389 | |
| 390 =head2 get_Seq_Objs | |
| 391 | |
| 392 Title : get_Seq_Objs | |
| 393 Usage : $lucyObj->get_Seq_Objs() | |
| 394 Function: returns an array of references to Bio::PrimarySeq objects | |
| 395 where -id = 'sequence name' and -seq = 'sequence' | |
| 396 | |
| 397 Returns : array of Bio::PrimarySeq objects | |
| 398 Args : none | |
| 399 | |
| 400 =cut | |
| 401 | |
| 402 sub get_Seq_Objs { | |
| 403 my $self = shift; | |
| 404 my($seqobj, @seqobjs); | |
| 405 foreach my $key (sort keys %{$self->{sequences}}) { | |
| 406 $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", | |
| 407 -id => "$key"); | |
| 408 push @seqobjs, $seqobj; | |
| 409 } | |
| 410 return \@seqobjs; | |
| 411 } | |
| 412 | |
| 413 =head2 get_Seq_Obj | |
| 414 | |
| 415 Title : get_Seq_Obj | |
| 416 Usage : $lucyObj->get_Seq_Obj($seqname) | |
| 417 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name' | |
| 418 and -seq = 'sequence' | |
| 419 Returns : reference to Bio::PrimarySeq object | |
| 420 Args : name of a sequence | |
| 421 | |
| 422 =cut | |
| 423 | |
| 424 sub get_Seq_Obj { | |
| 425 my ($self, $key) = @_; | |
| 426 my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", | |
| 427 -id => "$key"); | |
| 428 return $seqobj; | |
| 429 } | |
| 430 | |
| 431 =head2 get_sequence_names | |
| 432 | |
| 433 Title : get_sequence_names | |
| 434 Usage : $lucyObj->get_sequence_names | |
| 435 Function: returns reference to an array of names of the sequences analyzed by Lucy. | |
| 436 These names are required for most of the accessor methods. | |
| 437 Note: The Lucy binary will fail unless sequence names are unique. | |
| 438 Returns : array reference | |
| 439 Args : none | |
| 440 | |
| 441 =cut | |
| 442 | |
| 443 sub get_sequence_names { | |
| 444 my $self = shift; | |
| 445 my @keys = sort keys %{$self->{sequences}}; | |
| 446 return \@keys; | |
| 447 } | |
| 448 | |
| 449 =head2 sequence | |
| 450 | |
| 451 Title : sequence | |
| 452 Usage : $lucyObj->sequence($seqname) | |
| 453 Function: returns the DNA sequence of one of the sequences analyzed by Lucy. | |
| 454 Returns : string | |
| 455 Args : name of a sequence | |
| 456 | |
| 457 =cut | |
| 458 | |
| 459 sub sequence { | |
| 460 my ($self, $key) = @_; | |
| 461 return $self->{sequences}{$key}{sequence}; | |
| 462 } | |
| 463 | |
| 464 =head2 quality | |
| 465 | |
| 466 Title : quality | |
| 467 Usage : $lucyObj->quality($seqname) | |
| 468 Function: returns the quality values of one of the sequences analyzed by Lucy. | |
| 469 This method depends on the user having provided a quality file. | |
| 470 Returns : string | |
| 471 Args : name of a sequence | |
| 472 | |
| 473 =cut | |
| 474 | |
| 475 sub quality { | |
| 476 my($self, $key) = @_; | |
| 477 return $self->{sequences}{$key}{quality}; | |
| 478 } | |
| 479 | |
| 480 =head2 avg_quality | |
| 481 | |
| 482 Title : avg_quality | |
| 483 Usage : $lucyObj->avg_quality($seqname) | |
| 484 Function: returns the average quality value for one of the sequences analyzed by Lucy. | |
| 485 Returns : float | |
| 486 Args : name of a sequence | |
| 487 | |
| 488 =cut | |
| 489 | |
| 490 sub avg_quality { | |
| 491 my($self, $key) = @_; | |
| 492 return $self->{sequences}{$key}{avg_quality}; | |
| 493 } | |
| 494 | |
| 495 =head2 direction | |
| 496 | |
| 497 Title : direction | |
| 498 Usage : $lucyObj->direction($seqname) | |
| 499 Function: returns the direction for one of the sequences analyzed by Lucy | |
| 500 providing that 'fwd_desig' or 'rev_desig' were set when the | |
| 501 Lucy object was created. | |
| 502 Strings returned are: 'F' for forward, 'R' for reverse. | |
| 503 Returns : string | |
| 504 Args : name of a sequence | |
| 505 | |
| 506 =cut | |
| 507 | |
| 508 sub direction { | |
| 509 my($self, $key) = @_; | |
| 510 return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction}; | |
| 511 return ""; | |
| 512 } | |
| 513 | |
| 514 =head2 length_raw | |
| 515 | |
| 516 Title : length_raw | |
| 517 Usage : $lucyObj->length_raw($seqname) | |
| 518 Function: returns the length of a DNA sequence prior to quality/ vector | |
| 519 trimming by Lucy. | |
| 520 Returns : integer | |
| 521 Args : name of a sequence | |
| 522 | |
| 523 =cut | |
| 524 | |
| 525 sub length_raw { | |
| 526 my($self, $key) = @_; | |
| 527 return $self->{sequences}{$key}{length_raw}; | |
| 528 } | |
| 529 | |
| 530 =head2 length_clear | |
| 531 | |
| 532 Title : length_clear | |
| 533 Usage : $lucyObj->length_clear($seqname) | |
| 534 Function: returns the length of a DNA sequence following quality/ vector | |
| 535 trimming by Lucy. | |
| 536 Returns : integer | |
| 537 Args : name of a sequence | |
| 538 | |
| 539 =cut | |
| 540 | |
| 541 sub length_clear { | |
| 542 my($self, $key) = @_; | |
| 543 return $self->{sequences}{$key}{length_clear}; | |
| 544 } | |
| 545 | |
| 546 =head2 start_clear | |
| 547 | |
| 548 Title : start_clear | |
| 549 Usage : $lucyObj->start_clear($seqname) | |
| 550 Function: returns the beginning position of good quality, vector free DNA sequence | |
| 551 determined by Lucy. | |
| 552 Returns : integer | |
| 553 Args : name of a sequence | |
| 554 | |
| 555 =cut | |
| 556 | |
| 557 sub start_clear { | |
| 558 my($self, $key) = @_; | |
| 559 return $self->{sequences}{$key}{beg_clear}; | |
| 560 } | |
| 561 | |
| 562 | |
| 563 =head2 end_clear | |
| 564 | |
| 565 Title : end_clear | |
| 566 Usage : $lucyObj->end_clear($seqname) | |
| 567 Function: returns the ending position of good quality, vector free DNA sequence | |
| 568 determined by Lucy. | |
| 569 Returns : integer | |
| 570 Args : name of a sequence | |
| 571 | |
| 572 =cut | |
| 573 | |
| 574 sub end_clear { | |
| 575 my($self, $key) = @_; | |
| 576 return $self->{sequences}{$key}{end_clear}; | |
| 577 } | |
| 578 | |
| 579 =head2 per_GC | |
| 580 | |
| 581 Title : per_GC | |
| 582 Usage : $lucyObj->per_GC($seqname) | |
| 583 Function: returns the percente of the good quality, vector free DNA sequence | |
| 584 determined by Lucy. | |
| 585 Returns : float | |
| 586 Args : name of a sequence | |
| 587 | |
| 588 =cut | |
| 589 | |
| 590 sub per_GC { | |
| 591 my($self, $key) = @_; | |
| 592 return $self->{sequences}{$key}{per_GC}; | |
| 593 } | |
| 594 | |
| 595 =head2 full_length | |
| 596 | |
| 597 Title : full_length | |
| 598 Usage : $lucyObj->full_length($seqname) | |
| 599 Function: returns the truth value for whether or not the sequence read was | |
| 600 full length (ie. vector present on both ends of read). This method | |
| 601 depends on the user having provided the 'info' file (Lucy must be | |
| 602 run with the -debug 'info_filename' option to get this file). | |
| 603 Returns : boolean | |
| 604 Args : name of a sequence | |
| 605 | |
| 606 =cut | |
| 607 | |
| 608 sub full_length { | |
| 609 my($self, $key) = @_; | |
| 610 return 1 if $self->{sequences}{$key}{full_length}; | |
| 611 return 0; | |
| 612 } | |
| 613 | |
| 614 =head2 polyA | |
| 615 | |
| 616 Title : polyA | |
| 617 Usage : $lucyObj->polyA($seqname) | |
| 618 Function: returns the truth value for whether or not a poly-A tail was detected | |
| 619 and clipped by Lucy. This method depends on the user having modified | |
| 620 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with | |
| 621 the -cdna option and saving the standard error. | |
| 622 Note, the final sequence will not show the poly-A/T region. | |
| 623 Returns : boolean | |
| 624 Args : name of a sequence | |
| 625 | |
| 626 =cut | |
| 627 | |
| 628 sub polyA { | |
| 629 my($self, $key) = @_; | |
| 630 return 1 if $self->{sequences}{$key}{polyA}; | |
| 631 return 0; | |
| 632 } | |
| 633 | |
| 634 =head2 get_rejects | |
| 635 | |
| 636 Title : get_rejects | |
| 637 Usage : $lucyObj->get_rejects() | |
| 638 Function: returns a hash containing names of rejects and a 1 letter code for the | |
| 639 reason Lucy rejected the sequence. | |
| 640 Q- rejected because of low quality values | |
| 641 S- sequence was short | |
| 642 V- sequence was vector | |
| 643 E- sequence was empty | |
| 644 P- poly-A/T trimming caused sequence to be too short | |
| 645 In order to get the rejects, you must provide a file with the standard | |
| 646 error from Lucy. You will only get the quality category rejects unless | |
| 647 you have modified the source and recompiled Lucy as outlined in DESCRIPTION. | |
| 648 Returns : hash reference | |
| 649 Args : none | |
| 650 | |
| 651 =cut | |
| 652 | |
| 653 sub get_rejects { | |
| 654 my $self = shift; | |
| 655 return $self->{reject}; | |
| 656 } | |
| 657 | |
| 658 =head2 Diff for Lucy source code | |
| 659 | |
| 660 352a353,354 | |
| 661 > /* AGW added next line */ | |
| 662 > fprintf(stderr, "Empty: %s\n", seqs[i].name); | |
| 663 639a642,643 | |
| 664 > /* AGW added next line */ | |
| 665 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name); | |
| 666 678c682,686 | |
| 667 < if (left) seqs[i].left+=left; | |
| 668 --- | |
| 669 > if (left) { | |
| 670 > seqs[i].left+=left; | |
| 671 > /* AGW added next line */ | |
| 672 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name); | |
| 673 > } | |
| 674 681c689,693 | |
| 675 < if (right) seqs[i].right-=right; | |
| 676 --- | |
| 677 > if (right) { | |
| 678 > seqs[i].right-=right; | |
| 679 > /* AGW added next line */ | |
| 680 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name); | |
| 681 > } | |
| 682 682a695,696 | |
| 683 > /* AGW added next line */ | |
| 684 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name); | |
| 685 734a749,750 | |
| 686 > /* AGW added next line */ | |
| 687 > fprintf(stderr, "Vector: %s\n", seqs[i].name); | |
| 688 | |
| 689 =cut | |
| 690 | |
| 691 1; |
