comparison variant_effect_predictor/Bio/Tools/Lucy.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $
2 #
3 # BioPerl module for Bio::Tools::Lucy
4 #
5 # Copyright Her Majesty the Queen of England
6 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with
7 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB
8 #
9 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
11
12 =head1 NAME
13
14 Bio::Tools::Lucy - Object for analyzing the output from Lucy,
15 a vector and quality trimming program from TIGR
16
17 =head1 SYNOPSIS
18
19 # Create the Lucy object from an existing Lucy output file
20 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1);
21 $lucyObj = Bio::Tools::Lucy->new(@params);
22
23 # Get names of all sequences
24 $names = $lucyObj->get_sequence_names();
25
26 # Print seq and qual values for sequences >400 bp in order to run CAP3
27 foreach $name (@$names) {
28 next unless $lucyObj->length_clear($name) > 400;
29 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
30 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
31 }
32
33 # Get an array of Bio::PrimarySeq objects
34 @seqObjs = $lucyObj->get_Seq_Objs();
35
36
37 =head1 DESCRIPTION
38
39 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and
40 quality values generated by Lucy program from TIGR.
41
42 Lucy will identify vector, poly-A/T tails, and poor quality regions in
43 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf)
44
45 The input to Lucy can be the Phred sequence and quality files
46 generated from running Phred on a set of chromatograms.
47
48 Lucy can be obtained (free of charge to academic users) from
49 www.tigr.org/softlab
50
51 There are a few methods that will only be available if you make some
52 minor changes to the source for Lucy and then recompile. The changes
53 are in the 'lucy.c' file and there is a diff between the original and
54 the modified file in the Appendix
55
56 Please contact the author of this module if you have any problems
57 making these modifications.
58
59 You do not have to make these modifications to use this module.
60
61 =head2 Creating a Lucy object
62
63 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1,
64 'fwd_desig' => '_F', 'rev_desig' => '_R');
65 $lucyObj = Bio::Tools::Lucy->new(@params);
66
67 =head2 Using a Lucy object
68
69 You should get an array with the sequence names in order to use
70 accessor methods. Note: The Lucy binary program will fail unless
71 the sequence names provided as input are unique.
72
73 $names_ref = $lucyObj->get_sequence_names();
74
75 This code snippet will produce a Fasta format file with sequence
76 lengths and %GC in the description line.
77
78 foreach $name (@$names) {
79 print FILE ">$name\t",
80 $lucyObj->length_clear($name), "\t",
81 $lucyObj->per_GC($name), "\n",
82 $lucyObj->sequence($name), "\n";
83 }
84
85
86 Print seq and qual values for sequences >400 bp in order to assemble
87 them with CAP3 (or other assembler).
88
89 foreach $name (@$names) {
90 next unless $lucyObj->length_clear($name) > 400;
91 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
92 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
93 }
94
95 Get all the sequences as Bio::PrimarySeq objects (eg., for use with
96 Bio::Tools::Blast to perform BLAST).
97
98 @seqObjs = $lucyObj->get_Seq_Objs();
99
100 Or use only those sequences that are full length and have a Poly-A
101 tail.
102
103 foreach $name (@$names) {
104 next unless ($lucyObj->full_length($name) and $lucy->polyA($name));
105 push @seqObjs, $lucyObj->get_Seq_Obj($name);
106 }
107
108
109 Get the names of those sequences that were rejected by Lucy.
110
111 $rejects_ref = $lucyObj->get_rejects();
112
113 Print the names of the rejects and 1 letter code for reason they
114 were rejected.
115
116 foreach $key (sort keys %$rejects_ref) {
117 print "$key: ", $rejects_ref->{$key};
118 }
119
120 There is a lot of other information available about the sequences
121 analyzed by Lucy (see APPENDIX). This module can be used with the
122 DBI module to store this sequence information in a database.
123
124 =head1 FEEDBACK
125
126 =head2 Mailing Lists
127
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to one
130 of the Bioperl mailing lists. Your participation is much appreciated.
131
132 bioperl-l@bioperl.org - General discussion
133 http://bio.perl.org/MailList.html - About the mailing lists
134
135 =head2 Reporting Bugs
136
137 Report bugs to the Bioperl bug tracking system to help us keep track
138 the bugs and their resolution. Bug reports can be submitted via email
139 or the web:
140
141 bioperl-bugs@bio.perl.org
142 http://bugzilla.bioperl.org/
143
144 =head1 AUTHOR
145
146 Andrew G. Walsh paeruginosa@hotmail.com
147
148 =head1 APPENDIX
149
150 Methods available to Lucy objects are described below. Please note
151 that any method beginning with an underscore is considered internal
152 and should not be called directly.
153
154 =cut
155
156
157 package Bio::Tools::Lucy;
158
159 use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD);
160 use strict;
161 use Bio::PrimarySeq;
162 use Bio::Root::Root;
163 use Bio::Root::IO;
164
165 @ISA = qw(Bio::Root::Root Bio::Root::IO);
166 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr);
167 foreach my $attr (@ATTR) {
168 $OK_FIELD{$attr}++
169 }
170 $VERSION = "0.01";
171
172 sub AUTOLOAD {
173 my $self = shift;
174 my $attr = $AUTOLOAD;
175 $attr =~ s/.*:://;
176 $attr = lc $attr;
177 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
178 $self->{$attr} = shift if @_;
179 return $self->{$attr};
180 }
181
182 =head2 new
183
184 Title : new
185 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R',
186 fwd_desig => '_F')
187 Function: creates a Lucy object from Lucy analysis files
188 Returns : reference to Bio::Tools::Lucy object
189 Args : seqfile Fasta sequence file generated by Lucy
190 qualfile Quality values file generated by Lucy
191 infofile Info file created when Lucy is run with -debug 'infofile' option
192 stderrfile Standard error captured from Lucy when Lucy is run
193 with -info option and STDERR is directed to stderrfile
194 (ie. lucy ... 2> stderrfile).
195 Info in this file will include sequences dropped for low
196 quality. If you've modified Lucy source (see adv_stderr below),
197 it will also include info on which sequences were dropped because
198 they were vector, too short, had no insert, and whether a poly-A
199 tail was found (if Lucy was run with -cdna option).
200 lucy_verbose verbosity level (0-1).
201 fwd_desig The string used to determine whether sequence is a forward read.
202 The parser will assume that this match will occus at the
203 end of the sequence name string.
204 rev_desig As above, for reverse reads.
205 adv_stderr Can be set to a true value (1). Will only work if you have modified
206 the Lucy source code as outlined in DESCRIPTION and capture
207 the standard error from Lucy.
208
209 If you don't provide filenames for qualfile, infofile or stderrfile,
210 the module will assume that .qual, .info, and .stderr are the file
211 extensions and search in the same directory as the .seq file for these
212 files.
213
214 For example, if you create a Lucy object with $lucyObj =
215 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will
216 find lucy.qual, lucy.info and lucy.stderr.
217
218 You can omit any or all of the quality, info or stderr files, but you
219 will not be able to use all of the object methods (see method
220 documentation below).
221
222 =cut
223
224 sub new {
225 my ($class,@args) = @_;
226 my $self = $class->SUPER::new(@args);
227 my ($attr, $value);
228 while (@args) {
229 $attr = shift @args;
230 $attr = lc $attr;
231 $value = shift @args;
232 $self->{$attr} = $value;
233 }
234 &_parse($self);
235 return $self;
236 }
237
238 =head2 _parse
239
240 Title : _parse
241 Usage : n/a (internal function)
242 Function: called by new() to parse Lucy output files
243 Returns : nothing
244 Args : none
245
246 =cut
247
248 sub _parse {
249 my $self = shift;
250 $self->{seqfile} =~ /^(\S+)\.\S+$/;
251 my $file = $1;
252
253 print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose};
254 open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}");
255 my ($name, $line);
256 my $seq = "";
257 my @lines = <SEQ>;
258 while ($line = pop @lines) {
259 chomp $line;
260 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) {
261 $name = $1;
262 if ($self->{fwd_desig}) {
263 $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/;
264 }
265 if ($self->{rev_desig}) {
266 $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/;
267 }
268 $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4
269 $self->{sequences}{$name}{max_clone_len} = $3;
270 $self->{sequences}{$name}{med_clone_len} = $4;
271 $self->{sequences}{$name}{beg_clear} = $5;
272 $self->{sequences}{$name}{end_clear} = $6;
273 $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong.
274 my $beg = $5-1; # substr function begins with index 0
275 $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg);
276 my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//;
277 my $countGC = $seq =~ tr/[GC]//;
278 $self->{sequences}{$name}{per_GC} = $countGC/$count * 100;
279 $seq = "";
280 }
281 else {
282 $seq = $line.$seq;
283 }
284 }
285
286
287 # now parse quality values (check for presence of quality file first)
288 if ($self->{qualfile}) {
289 open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}");
290 @lines = <QUAL>;
291 }
292 elsif (-e "$file.qual") {
293 print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose};
294 $self->qualfile("$file.qual");
295 open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual");
296 @lines = <QUAL>;
297 }
298 else {
299 print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
300 @lines = ();
301 }
302
303 my (@vals, @slice, $num, $tot, $vals);
304 my $qual = "";
305 while ($line = pop @lines) {
306 chomp $line;
307 if ($line =~ /^>(\S+)/) {
308 $name = $1;
309 @vals = split /\s/ , $qual;
310 @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}];
311 $vals = join "\t", @slice;
312 $self->{sequences}{$name}{quality} = $vals;
313 $qual = "";
314 foreach $num (@slice) {
315 $tot += $num;
316 }
317 $num = @slice;
318 $self->{sequences}{$name}{avg_quality} = $tot/$num;
319 $tot = 0;
320 }
321 else {
322 $qual = $line.$qual;
323 }
324 }
325
326 # determine whether reads are full length
327
328 if ($self->{infofile}) {
329 open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}");
330 @lines = <INFO>;
331 }
332 elsif (-e "$file.info") {
333 print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose};
334 $self->infofile("$file.info");
335 open INFO, "$file.info" or $self->throw("Could not open info file: $file.info");
336 @lines = <INFO>;
337 }
338 else {
339 print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
340 @lines = ();
341 }
342
343 foreach (@lines) {
344 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/;
345 if ($2>0 && $3>0) {
346 $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too
347 }
348 }
349
350
351 # parse rejects (and presence of poly-A if Lucy has been modified)
352
353 if ($self->{stderrfile}) {
354 open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}");
355 @lines = <STDERR_LUCY>;
356
357 }
358 elsif (-e "$file.stderr") {
359 print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose};
360 $self->stderrfile("$file.stderr");
361 open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr");
362 @lines = <STDERR_LUCY>;
363 }
364 else {
365 print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
366 @lines = ();
367 }
368
369 if ($self->{adv_stderr}) {
370 foreach (@lines) {
371 $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/;
372 $self->{reject}{$1} = "V" if /Vector: (\S+)/;
373 $self->{reject}{$1} = "E" if /Empty: (\S+)/;
374 $self->{reject}{$1} = "S" if /Short: (\S+)/;
375 $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/;
376 if (/Dropped PolyA: (\S+)/) {
377 $self->{reject}{$1} = "P";
378 delete $self->{sequences}{$1};
379 }
380 }
381 }
382 else {
383 foreach (@lines) {
384 $self->{reject}{$1} = "R" if /dropping\s+(\S+)/;
385 }
386 }
387
388 }
389
390 =head2 get_Seq_Objs
391
392 Title : get_Seq_Objs
393 Usage : $lucyObj->get_Seq_Objs()
394 Function: returns an array of references to Bio::PrimarySeq objects
395 where -id = 'sequence name' and -seq = 'sequence'
396
397 Returns : array of Bio::PrimarySeq objects
398 Args : none
399
400 =cut
401
402 sub get_Seq_Objs {
403 my $self = shift;
404 my($seqobj, @seqobjs);
405 foreach my $key (sort keys %{$self->{sequences}}) {
406 $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
407 -id => "$key");
408 push @seqobjs, $seqobj;
409 }
410 return \@seqobjs;
411 }
412
413 =head2 get_Seq_Obj
414
415 Title : get_Seq_Obj
416 Usage : $lucyObj->get_Seq_Obj($seqname)
417 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name'
418 and -seq = 'sequence'
419 Returns : reference to Bio::PrimarySeq object
420 Args : name of a sequence
421
422 =cut
423
424 sub get_Seq_Obj {
425 my ($self, $key) = @_;
426 my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
427 -id => "$key");
428 return $seqobj;
429 }
430
431 =head2 get_sequence_names
432
433 Title : get_sequence_names
434 Usage : $lucyObj->get_sequence_names
435 Function: returns reference to an array of names of the sequences analyzed by Lucy.
436 These names are required for most of the accessor methods.
437 Note: The Lucy binary will fail unless sequence names are unique.
438 Returns : array reference
439 Args : none
440
441 =cut
442
443 sub get_sequence_names {
444 my $self = shift;
445 my @keys = sort keys %{$self->{sequences}};
446 return \@keys;
447 }
448
449 =head2 sequence
450
451 Title : sequence
452 Usage : $lucyObj->sequence($seqname)
453 Function: returns the DNA sequence of one of the sequences analyzed by Lucy.
454 Returns : string
455 Args : name of a sequence
456
457 =cut
458
459 sub sequence {
460 my ($self, $key) = @_;
461 return $self->{sequences}{$key}{sequence};
462 }
463
464 =head2 quality
465
466 Title : quality
467 Usage : $lucyObj->quality($seqname)
468 Function: returns the quality values of one of the sequences analyzed by Lucy.
469 This method depends on the user having provided a quality file.
470 Returns : string
471 Args : name of a sequence
472
473 =cut
474
475 sub quality {
476 my($self, $key) = @_;
477 return $self->{sequences}{$key}{quality};
478 }
479
480 =head2 avg_quality
481
482 Title : avg_quality
483 Usage : $lucyObj->avg_quality($seqname)
484 Function: returns the average quality value for one of the sequences analyzed by Lucy.
485 Returns : float
486 Args : name of a sequence
487
488 =cut
489
490 sub avg_quality {
491 my($self, $key) = @_;
492 return $self->{sequences}{$key}{avg_quality};
493 }
494
495 =head2 direction
496
497 Title : direction
498 Usage : $lucyObj->direction($seqname)
499 Function: returns the direction for one of the sequences analyzed by Lucy
500 providing that 'fwd_desig' or 'rev_desig' were set when the
501 Lucy object was created.
502 Strings returned are: 'F' for forward, 'R' for reverse.
503 Returns : string
504 Args : name of a sequence
505
506 =cut
507
508 sub direction {
509 my($self, $key) = @_;
510 return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction};
511 return "";
512 }
513
514 =head2 length_raw
515
516 Title : length_raw
517 Usage : $lucyObj->length_raw($seqname)
518 Function: returns the length of a DNA sequence prior to quality/ vector
519 trimming by Lucy.
520 Returns : integer
521 Args : name of a sequence
522
523 =cut
524
525 sub length_raw {
526 my($self, $key) = @_;
527 return $self->{sequences}{$key}{length_raw};
528 }
529
530 =head2 length_clear
531
532 Title : length_clear
533 Usage : $lucyObj->length_clear($seqname)
534 Function: returns the length of a DNA sequence following quality/ vector
535 trimming by Lucy.
536 Returns : integer
537 Args : name of a sequence
538
539 =cut
540
541 sub length_clear {
542 my($self, $key) = @_;
543 return $self->{sequences}{$key}{length_clear};
544 }
545
546 =head2 start_clear
547
548 Title : start_clear
549 Usage : $lucyObj->start_clear($seqname)
550 Function: returns the beginning position of good quality, vector free DNA sequence
551 determined by Lucy.
552 Returns : integer
553 Args : name of a sequence
554
555 =cut
556
557 sub start_clear {
558 my($self, $key) = @_;
559 return $self->{sequences}{$key}{beg_clear};
560 }
561
562
563 =head2 end_clear
564
565 Title : end_clear
566 Usage : $lucyObj->end_clear($seqname)
567 Function: returns the ending position of good quality, vector free DNA sequence
568 determined by Lucy.
569 Returns : integer
570 Args : name of a sequence
571
572 =cut
573
574 sub end_clear {
575 my($self, $key) = @_;
576 return $self->{sequences}{$key}{end_clear};
577 }
578
579 =head2 per_GC
580
581 Title : per_GC
582 Usage : $lucyObj->per_GC($seqname)
583 Function: returns the percente of the good quality, vector free DNA sequence
584 determined by Lucy.
585 Returns : float
586 Args : name of a sequence
587
588 =cut
589
590 sub per_GC {
591 my($self, $key) = @_;
592 return $self->{sequences}{$key}{per_GC};
593 }
594
595 =head2 full_length
596
597 Title : full_length
598 Usage : $lucyObj->full_length($seqname)
599 Function: returns the truth value for whether or not the sequence read was
600 full length (ie. vector present on both ends of read). This method
601 depends on the user having provided the 'info' file (Lucy must be
602 run with the -debug 'info_filename' option to get this file).
603 Returns : boolean
604 Args : name of a sequence
605
606 =cut
607
608 sub full_length {
609 my($self, $key) = @_;
610 return 1 if $self->{sequences}{$key}{full_length};
611 return 0;
612 }
613
614 =head2 polyA
615
616 Title : polyA
617 Usage : $lucyObj->polyA($seqname)
618 Function: returns the truth value for whether or not a poly-A tail was detected
619 and clipped by Lucy. This method depends on the user having modified
620 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with
621 the -cdna option and saving the standard error.
622 Note, the final sequence will not show the poly-A/T region.
623 Returns : boolean
624 Args : name of a sequence
625
626 =cut
627
628 sub polyA {
629 my($self, $key) = @_;
630 return 1 if $self->{sequences}{$key}{polyA};
631 return 0;
632 }
633
634 =head2 get_rejects
635
636 Title : get_rejects
637 Usage : $lucyObj->get_rejects()
638 Function: returns a hash containing names of rejects and a 1 letter code for the
639 reason Lucy rejected the sequence.
640 Q- rejected because of low quality values
641 S- sequence was short
642 V- sequence was vector
643 E- sequence was empty
644 P- poly-A/T trimming caused sequence to be too short
645 In order to get the rejects, you must provide a file with the standard
646 error from Lucy. You will only get the quality category rejects unless
647 you have modified the source and recompiled Lucy as outlined in DESCRIPTION.
648 Returns : hash reference
649 Args : none
650
651 =cut
652
653 sub get_rejects {
654 my $self = shift;
655 return $self->{reject};
656 }
657
658 =head2 Diff for Lucy source code
659
660 352a353,354
661 > /* AGW added next line */
662 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
663 639a642,643
664 > /* AGW added next line */
665 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
666 678c682,686
667 < if (left) seqs[i].left+=left;
668 ---
669 > if (left) {
670 > seqs[i].left+=left;
671 > /* AGW added next line */
672 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
673 > }
674 681c689,693
675 < if (right) seqs[i].right-=right;
676 ---
677 > if (right) {
678 > seqs[i].right-=right;
679 > /* AGW added next line */
680 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
681 > }
682 682a695,696
683 > /* AGW added next line */
684 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
685 734a749,750
686 > /* AGW added next line */
687 > fprintf(stderr, "Vector: %s\n", seqs[i].name);
688
689 =cut
690
691 1;