Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/Lucy.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Lucy.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,691 @@ +# $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $ +# +# BioPerl module for Bio::Tools::Lucy +# +# Copyright Her Majesty the Queen of England +# written by Andrew Walsh (paeruginosa@hotmail.com) during employment with +# Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB +# +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Lucy - Object for analyzing the output from Lucy, + a vector and quality trimming program from TIGR + +=head1 SYNOPSIS + + # Create the Lucy object from an existing Lucy output file + @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1); + $lucyObj = Bio::Tools::Lucy->new(@params); + + # Get names of all sequences + $names = $lucyObj->get_sequence_names(); + + # Print seq and qual values for sequences >400 bp in order to run CAP3 + foreach $name (@$names) { + next unless $lucyObj->length_clear($name) > 400; + print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; + print QUAL ">$name\n", $lucyObj->quality($name), "\n"; + } + + # Get an array of Bio::PrimarySeq objects + @seqObjs = $lucyObj->get_Seq_Objs(); + + +=head1 DESCRIPTION + +Bio::Tools::Lucy.pm provides methods for analyzing the sequence and +quality values generated by Lucy program from TIGR. + +Lucy will identify vector, poly-A/T tails, and poor quality regions in +a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf) + +The input to Lucy can be the Phred sequence and quality files +generated from running Phred on a set of chromatograms. + +Lucy can be obtained (free of charge to academic users) from +www.tigr.org/softlab + +There are a few methods that will only be available if you make some +minor changes to the source for Lucy and then recompile. The changes +are in the 'lucy.c' file and there is a diff between the original and +the modified file in the Appendix + +Please contact the author of this module if you have any problems +making these modifications. + +You do not have to make these modifications to use this module. + +=head2 Creating a Lucy object + + @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1, + 'fwd_desig' => '_F', 'rev_desig' => '_R'); + $lucyObj = Bio::Tools::Lucy->new(@params); + +=head2 Using a Lucy object + + You should get an array with the sequence names in order to use + accessor methods. Note: The Lucy binary program will fail unless + the sequence names provided as input are unique. + + $names_ref = $lucyObj->get_sequence_names(); + + This code snippet will produce a Fasta format file with sequence + lengths and %GC in the description line. + + foreach $name (@$names) { + print FILE ">$name\t", + $lucyObj->length_clear($name), "\t", + $lucyObj->per_GC($name), "\n", + $lucyObj->sequence($name), "\n"; + } + + + Print seq and qual values for sequences >400 bp in order to assemble + them with CAP3 (or other assembler). + + foreach $name (@$names) { + next unless $lucyObj->length_clear($name) > 400; + print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; + print QUAL ">$name\n", $lucyObj->quality($name), "\n"; + } + + Get all the sequences as Bio::PrimarySeq objects (eg., for use with + Bio::Tools::Blast to perform BLAST). + + @seqObjs = $lucyObj->get_Seq_Objs(); + + Or use only those sequences that are full length and have a Poly-A + tail. + + foreach $name (@$names) { + next unless ($lucyObj->full_length($name) and $lucy->polyA($name)); + push @seqObjs, $lucyObj->get_Seq_Obj($name); + } + + + Get the names of those sequences that were rejected by Lucy. + + $rejects_ref = $lucyObj->get_rejects(); + + Print the names of the rejects and 1 letter code for reason they + were rejected. + + foreach $key (sort keys %$rejects_ref) { + print "$key: ", $rejects_ref->{$key}; + } + + There is a lot of other information available about the sequences + analyzed by Lucy (see APPENDIX). This module can be used with the + DBI module to store this sequence information in a database. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Andrew G. Walsh paeruginosa@hotmail.com + +=head1 APPENDIX + +Methods available to Lucy objects are described below. Please note +that any method beginning with an underscore is considered internal +and should not be called directly. + +=cut + + +package Bio::Tools::Lucy; + +use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD); +use strict; +use Bio::PrimarySeq; +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); +@ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr); +foreach my $attr (@ATTR) { + $OK_FIELD{$attr}++ +} +$VERSION = "0.01"; + +sub AUTOLOAD { + my $self = shift; + my $attr = $AUTOLOAD; + $attr =~ s/.*:://; + $attr = lc $attr; + $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; + $self->{$attr} = shift if @_; + return $self->{$attr}; +} + +=head2 new + + Title : new + Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R', + fwd_desig => '_F') + Function: creates a Lucy object from Lucy analysis files + Returns : reference to Bio::Tools::Lucy object + Args : seqfile Fasta sequence file generated by Lucy + qualfile Quality values file generated by Lucy + infofile Info file created when Lucy is run with -debug 'infofile' option + stderrfile Standard error captured from Lucy when Lucy is run + with -info option and STDERR is directed to stderrfile + (ie. lucy ... 2> stderrfile). + Info in this file will include sequences dropped for low + quality. If you've modified Lucy source (see adv_stderr below), + it will also include info on which sequences were dropped because + they were vector, too short, had no insert, and whether a poly-A + tail was found (if Lucy was run with -cdna option). + lucy_verbose verbosity level (0-1). + fwd_desig The string used to determine whether sequence is a forward read. + The parser will assume that this match will occus at the + end of the sequence name string. + rev_desig As above, for reverse reads. + adv_stderr Can be set to a true value (1). Will only work if you have modified + the Lucy source code as outlined in DESCRIPTION and capture + the standard error from Lucy. + +If you don't provide filenames for qualfile, infofile or stderrfile, +the module will assume that .qual, .info, and .stderr are the file +extensions and search in the same directory as the .seq file for these +files. + +For example, if you create a Lucy object with $lucyObj = +Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will +find lucy.qual, lucy.info and lucy.stderr. + +You can omit any or all of the quality, info or stderr files, but you +will not be able to use all of the object methods (see method +documentation below). + +=cut + +sub new { + my ($class,@args) = @_; + my $self = $class->SUPER::new(@args); + my ($attr, $value); + while (@args) { + $attr = shift @args; + $attr = lc $attr; + $value = shift @args; + $self->{$attr} = $value; + } + &_parse($self); + return $self; +} + +=head2 _parse + + Title : _parse + Usage : n/a (internal function) + Function: called by new() to parse Lucy output files + Returns : nothing + Args : none + +=cut + +sub _parse { + my $self = shift; + $self->{seqfile} =~ /^(\S+)\.\S+$/; + my $file = $1; + + print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose}; + open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}"); + my ($name, $line); + my $seq = ""; + my @lines = <SEQ>; + while ($line = pop @lines) { + chomp $line; + if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) { + $name = $1; + if ($self->{fwd_desig}) { + $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/; + } + if ($self->{rev_desig}) { + $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/; + } + $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4 + $self->{sequences}{$name}{max_clone_len} = $3; + $self->{sequences}{$name}{med_clone_len} = $4; + $self->{sequences}{$name}{beg_clear} = $5; + $self->{sequences}{$name}{end_clear} = $6; + $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong. + my $beg = $5-1; # substr function begins with index 0 + $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg); + my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//; + my $countGC = $seq =~ tr/[GC]//; + $self->{sequences}{$name}{per_GC} = $countGC/$count * 100; + $seq = ""; + } + else { + $seq = $line.$seq; + } + } + + +# now parse quality values (check for presence of quality file first) + if ($self->{qualfile}) { + open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}"); + @lines = <QUAL>; + } + elsif (-e "$file.qual") { + print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose}; + $self->qualfile("$file.qual"); + open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual"); + @lines = <QUAL>; + } + else { + print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + my (@vals, @slice, $num, $tot, $vals); + my $qual = ""; + while ($line = pop @lines) { + chomp $line; + if ($line =~ /^>(\S+)/) { + $name = $1; + @vals = split /\s/ , $qual; + @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}]; + $vals = join "\t", @slice; + $self->{sequences}{$name}{quality} = $vals; + $qual = ""; + foreach $num (@slice) { + $tot += $num; + } + $num = @slice; + $self->{sequences}{$name}{avg_quality} = $tot/$num; + $tot = 0; + } + else { + $qual = $line.$qual; + } + } + +# determine whether reads are full length + + if ($self->{infofile}) { + open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}"); + @lines = <INFO>; + } + elsif (-e "$file.info") { + print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose}; + $self->infofile("$file.info"); + open INFO, "$file.info" or $self->throw("Could not open info file: $file.info"); + @lines = <INFO>; + } + else { + print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + foreach (@lines) { + /^(\S+).+CLV\s+(\d+)\s+(\d+)$/; + if ($2>0 && $3>0) { + $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too + } + } + + +# parse rejects (and presence of poly-A if Lucy has been modified) + + if ($self->{stderrfile}) { + open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}"); + @lines = <STDERR_LUCY>; + + } + elsif (-e "$file.stderr") { + print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose}; + $self->stderrfile("$file.stderr"); + open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr"); + @lines = <STDERR_LUCY>; + } + else { + print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; + @lines = (); + } + + if ($self->{adv_stderr}) { + foreach (@lines) { + $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/; + $self->{reject}{$1} = "V" if /Vector: (\S+)/; + $self->{reject}{$1} = "E" if /Empty: (\S+)/; + $self->{reject}{$1} = "S" if /Short: (\S+)/; + $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/; + if (/Dropped PolyA: (\S+)/) { + $self->{reject}{$1} = "P"; + delete $self->{sequences}{$1}; + } + } + } + else { + foreach (@lines) { + $self->{reject}{$1} = "R" if /dropping\s+(\S+)/; + } + } + +} + +=head2 get_Seq_Objs + + Title : get_Seq_Objs + Usage : $lucyObj->get_Seq_Objs() + Function: returns an array of references to Bio::PrimarySeq objects + where -id = 'sequence name' and -seq = 'sequence' + + Returns : array of Bio::PrimarySeq objects + Args : none + +=cut + +sub get_Seq_Objs { + my $self = shift; + my($seqobj, @seqobjs); + foreach my $key (sort keys %{$self->{sequences}}) { + $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", + -id => "$key"); + push @seqobjs, $seqobj; + } + return \@seqobjs; +} + +=head2 get_Seq_Obj + + Title : get_Seq_Obj + Usage : $lucyObj->get_Seq_Obj($seqname) + Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name' + and -seq = 'sequence' + Returns : reference to Bio::PrimarySeq object + Args : name of a sequence + +=cut + +sub get_Seq_Obj { + my ($self, $key) = @_; + my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", + -id => "$key"); + return $seqobj; +} + +=head2 get_sequence_names + + Title : get_sequence_names + Usage : $lucyObj->get_sequence_names + Function: returns reference to an array of names of the sequences analyzed by Lucy. + These names are required for most of the accessor methods. + Note: The Lucy binary will fail unless sequence names are unique. + Returns : array reference + Args : none + +=cut + +sub get_sequence_names { + my $self = shift; + my @keys = sort keys %{$self->{sequences}}; + return \@keys; +} + +=head2 sequence + + Title : sequence + Usage : $lucyObj->sequence($seqname) + Function: returns the DNA sequence of one of the sequences analyzed by Lucy. + Returns : string + Args : name of a sequence + +=cut + +sub sequence { + my ($self, $key) = @_; + return $self->{sequences}{$key}{sequence}; +} + +=head2 quality + + Title : quality + Usage : $lucyObj->quality($seqname) + Function: returns the quality values of one of the sequences analyzed by Lucy. + This method depends on the user having provided a quality file. + Returns : string + Args : name of a sequence + +=cut + +sub quality { + my($self, $key) = @_; + return $self->{sequences}{$key}{quality}; +} + +=head2 avg_quality + + Title : avg_quality + Usage : $lucyObj->avg_quality($seqname) + Function: returns the average quality value for one of the sequences analyzed by Lucy. + Returns : float + Args : name of a sequence + +=cut + +sub avg_quality { + my($self, $key) = @_; + return $self->{sequences}{$key}{avg_quality}; +} + +=head2 direction + + Title : direction + Usage : $lucyObj->direction($seqname) + Function: returns the direction for one of the sequences analyzed by Lucy + providing that 'fwd_desig' or 'rev_desig' were set when the + Lucy object was created. + Strings returned are: 'F' for forward, 'R' for reverse. + Returns : string + Args : name of a sequence + +=cut + +sub direction { + my($self, $key) = @_; + return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction}; + return ""; +} + +=head2 length_raw + + Title : length_raw + Usage : $lucyObj->length_raw($seqname) + Function: returns the length of a DNA sequence prior to quality/ vector + trimming by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub length_raw { + my($self, $key) = @_; + return $self->{sequences}{$key}{length_raw}; +} + +=head2 length_clear + + Title : length_clear + Usage : $lucyObj->length_clear($seqname) + Function: returns the length of a DNA sequence following quality/ vector + trimming by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub length_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{length_clear}; +} + +=head2 start_clear + + Title : start_clear + Usage : $lucyObj->start_clear($seqname) + Function: returns the beginning position of good quality, vector free DNA sequence + determined by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub start_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{beg_clear}; +} + + +=head2 end_clear + + Title : end_clear + Usage : $lucyObj->end_clear($seqname) + Function: returns the ending position of good quality, vector free DNA sequence + determined by Lucy. + Returns : integer + Args : name of a sequence + +=cut + +sub end_clear { + my($self, $key) = @_; + return $self->{sequences}{$key}{end_clear}; +} + +=head2 per_GC + + Title : per_GC + Usage : $lucyObj->per_GC($seqname) + Function: returns the percente of the good quality, vector free DNA sequence + determined by Lucy. + Returns : float + Args : name of a sequence + +=cut + +sub per_GC { + my($self, $key) = @_; + return $self->{sequences}{$key}{per_GC}; +} + +=head2 full_length + + Title : full_length + Usage : $lucyObj->full_length($seqname) + Function: returns the truth value for whether or not the sequence read was + full length (ie. vector present on both ends of read). This method + depends on the user having provided the 'info' file (Lucy must be + run with the -debug 'info_filename' option to get this file). + Returns : boolean + Args : name of a sequence + +=cut + +sub full_length { + my($self, $key) = @_; + return 1 if $self->{sequences}{$key}{full_length}; + return 0; +} + +=head2 polyA + + Title : polyA + Usage : $lucyObj->polyA($seqname) + Function: returns the truth value for whether or not a poly-A tail was detected + and clipped by Lucy. This method depends on the user having modified + the source for Lucy as outlined in DESCRIPTION and invoking Lucy with + the -cdna option and saving the standard error. + Note, the final sequence will not show the poly-A/T region. + Returns : boolean + Args : name of a sequence + +=cut + +sub polyA { + my($self, $key) = @_; + return 1 if $self->{sequences}{$key}{polyA}; + return 0; +} + +=head2 get_rejects + + Title : get_rejects + Usage : $lucyObj->get_rejects() + Function: returns a hash containing names of rejects and a 1 letter code for the + reason Lucy rejected the sequence. + Q- rejected because of low quality values + S- sequence was short + V- sequence was vector + E- sequence was empty + P- poly-A/T trimming caused sequence to be too short + In order to get the rejects, you must provide a file with the standard + error from Lucy. You will only get the quality category rejects unless + you have modified the source and recompiled Lucy as outlined in DESCRIPTION. + Returns : hash reference + Args : none + +=cut + +sub get_rejects { + my $self = shift; + return $self->{reject}; +} + +=head2 Diff for Lucy source code + + 352a353,354 + > /* AGW added next line */ + > fprintf(stderr, "Empty: %s\n", seqs[i].name); + 639a642,643 + > /* AGW added next line */ + > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name); + 678c682,686 + < if (left) seqs[i].left+=left; + --- + > if (left) { + > seqs[i].left+=left; + > /* AGW added next line */ + > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name); + > } + 681c689,693 + < if (right) seqs[i].right-=right; + --- + > if (right) { + > seqs[i].right-=right; + > /* AGW added next line */ + > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name); + > } + 682a695,696 + > /* AGW added next line */ + > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name); + 734a749,750 + > /* AGW added next line */ + > fprintf(stderr, "Vector: %s\n", seqs[i].name); + +=cut + +1;