Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/Alignment/Consed.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Alignment/Consed.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1808 @@ +# Bio::Tools::Alignment::Consed.pm +# +# Cared for by Chad Matsalla +# +# Copyright Chad Matsalla +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Alignment::Consed - A module to work with objects from consed .ace files + +=head1 SYNOPSIS + + # a report for sequencing stuff + my $o_consed = new Bio::Tools::Alignment::Consed( + -acefile => "/path/to/an/acefile.ace.1", + -verbose => 1); + my $foo = $o_consed->set_reverse_designator("r"); + my $bar = $o_consed->set_forward_designator("f"); + + # get the contig numbers + my @keys = $o_consed->get_contigs(); + + # construct the doublets + my $setter_doublets = $o_consed->choose_doublets(); + + # get the doublets + my @doublets = $o_consed->get_doublets(); + +=head1 DESCRIPTION + +Bio::Tools::Alignment::Consed provides methods and objects to deal +with the output from the Consed package of programs. Specifically, +Bio::Tools::Alignment::Consed takes in the name of in .ace file and +provides objects for the results. + +A word about doublets: This module was written to accomodate a large +EST sequencing operation. In this case, EST's were sequenced from the +3' and from the 5' end of the EST. The objective was to find a +consensus sequence for these two reads. Thus, a contig of two is what +we wanted, and this contig should consist of the forward and reverse +reads of a getn clone. For example, for a forward designator of "F" +and a reverse designator of "R", if the two reads chad1F and chad1R +were in a single contig (for example Contig 5) it will be determined +that the consensus sequence for Contig 5 will be the sequence for +clone chad1. + +Doublets are good! + +This module parses .ace and related files. A detailed list of methods +can be found at the end of this document. + +I wrote a detailed rationale for design that may explain the reasons +why some things were done the way they were done. That document is +beyond the scope of this pod and can probably be found in the +directory from which this module was 'made' or at +http://www.dieselwurks.com/bioinformatics/consedpm_documentation.pdf + +Note that the pod in that document might be old but the original +rationale still stands. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Chad Matsalla + +chad@dieselwurks.com + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' + +package Bio::Tools::Alignment::Consed; + +use strict; +use vars qw($VERSION @ISA $Contigs %DEFAULTS); +use FileHandle; +use Dumpvalue qw(dumpValue); +use Bio::Tools::Alignment::Trim; +use Bio::Root::Root; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +$VERSION = '0.60'; + +BEGIN { + %DEFAULTS = ( 'f_designator' => 'f', + 'r_designator' => 'r'); +} + +=head2 new() + + Title : new(-acefile => $path_to_some_acefile, -verbose => "1") + Usage : $o_consed = Bio::Tools::Alignment::Consed-> + new(-acefile => $path_to_some_acefile, -verbose => "1"); + Function: Construct the Bio::Tools::Alignment::Consed object. Sets + verbosity for the following procedures, if necessary: + 1. Construct a new Bio::Tools::Alignment::Trim object, to + handle quality trimming 2. Read in the acefile and parse it + + Returns : A reference to a Bio::Tools::Alignment::Consed object. + Args : A hash. (-acefile) is the filename of an acefile. If a full path + is not specified "./" is prepended to the filename and used from + instantiation until destruction. If you want + Bio::Tools::Alignment::Consed to be noisy during parsing of + the acefile, specify some value for (-verbose). + +=cut + +sub new { + my ($class,%args) = @_; + my $self = $class->SUPER::new(%args); + + $self->{'filename'} = $args{'-acefile'}; + # this is special to UNIX and should probably use catfile + if (!($self->{'filename'} =~ /\//)) { + $self->{'filename'} = "./".$self->{'filename'}; + } + $self->{'filename'} =~ m/(.*\/)(.*)ace.*$/; + $self->{'path'} = $1; + $self->_initialize_io('-file'=>$self->{'filename'}); + $self->{'o_trim'} = new Bio::Tools::Alignment::Trim(-verbose => $self->verbose()); + $self->set_forward_designator($DEFAULTS{'f_designator'}); + $self->set_reverse_designator($DEFAULTS{'r_designator'}); + + $self->_read_file(); + return $self; +} + +=head2 verbose() + + Title : verbose() + Usage : $o_consed->verbose(1); + Function: Set the verbosity level for debugging messages. On instantiation + of the Bio::Tools::Alignment::Consed object the verbosity level + is set to 0 (quiet). + Returns : 1 or 0. + Args : The verbosity levels are: + 0 - quiet + 1 - noisy + 2 - noisier + 3 - annoyingly noisy + Notes : This method for setting verbosity has largely been superseeded by + a sub-by-sub way, where for every sub you can provide a (-verbose) + switch. I am doing converting this bit-by-bit so do not be surprised + if some subs do not honour this. + +=cut + +# from RootI + +# backwards compat +sub set_verbose { (shift)->verbose(@_) } + +=head2 get_filename() + + Title : get_filename() + Usage : $o_consed->get_filename(); + Function: Returns the name of the acefile being used by the + Bio::Tools::Alignment::Consed object. + Returns : A scalar containing the name of a file. + Args : None. + +=cut + + +sub get_filename { + my $self = shift; + return $self->{'filename'}; +} + +=head2 count_sequences_with_grep() + + Title : count_sequences_with_grep() + Usage : $o_consed->count_sequences_with_grep(); + Function: Use /bin/grep to scan through the files in the ace project dir + and count sequences in those files. I used this method in the + development of this module to verify that I was getting all of the + sequences. It works, but it is (I think) unix-like platform + dependent. + Returns : A scalar containing the number of sequences in the ace project + directory. + Args : None. + Notes : If you are on a non-UNIX platform, you really do not have to use + this. It is more of a debugging routine designed to address very + specific problems. + + This method was reimplemented to be platform independent with a + pure perl implementation. The above note can be ignored. + +=cut + +sub count_sequences_with_grep { + my $self = shift; + my ($working_dir,$grep_cli,@total_grep_sequences); + # this should be migrated to a pure perl implementation ala + # Tom Christiansen's 'tcgrep' + # http://www.perl.com/language/ppt/src/grep/tcgrep + + open(FILE, $self->{'filename'}) or do { $self->warn("cannot open file ".$self->{'filename'}. " for grepping"); return}; + my $counter =0; + while(<FILE>) { $counter++ if(/^AF/); } + + close FILE; + opendir(SINGLETS,$self->{'path'}); + foreach my $f ( readdir(SINGLETS) ) { + next unless ($f =~ /\.singlets$/); + open(FILE, $self->catfile($self->{'path'},$f)) or do{ $self->warn("cannot open file ".$self->catfile($self->{'path'},$f)); next }; + while(<FILE>) { $counter++ if(/^>/) } + close FILE; + } + return $counter; +# Chad's platform implementation which required grep on the system + + # I tried to cause graceful exiting if not on *ix here + # then i took platforms from Bioperl*/PLATFORMS here. Is that good? + # print("\$^O is $^O\n"); +# if (!($^O =~ /dec_osf|linux|unix|bsd|solaris|darwin/i)) { +# $self->warn("Bio::Tools::Alignment::Consed::count_sequences_with_grep: This sub uses grep which is doesn't run on this operating system, AFAIK. Sorry .".$^O); +# return 1; +# } +# $grep_cli = `which grep`; +# if (!$grep_cli) { +# $self->warn("I couldn't see to find grep on this system, or the which command is broken. Bio::Tools::Alignment::Consed::count_sequences_with_grep requires grep and which to find grep."); +# return 1; +# } +# chomp $grep_cli; +# push(@total_grep_sequences, my @foo = `$grep_cli ^AF $self->{filename}`); +# my $cli = "$grep_cli \\> $self->{'path'}*.singlets"; +# push(@total_grep_sequences, @foo = `$cli`); +# return scalar(@total_grep_sequences); +} + +=head2 get_path() + + Title : get_path() + Usage : $o_consed->get_path(); + Function: Returns the path to the acefile this object is working with. + Returns : Scalar. The path to the working acefile. + Args : None. + +=cut + +sub get_path { + my $self = shift; + return $self->{'path'}; +} + +=head2 get_contigs() + + Title : get_contigs() + Usage : $o_consed->get_contigs(); + Function: Return the keys to the Bio::Tools::Alignment::Consed object. + Returns : An array containing the keynames in the + Bio::Tools::Alignment::Consed object. + Args : None. + Notes : This would normally be used to get the keynames for some sort of + iterator. These keys are worthless in general day-to-day use because + in the Consed acefile they are simply Contig1, Contig2, ... + +=cut + +sub get_contigs { + my ($self,$contig) = @_; + my @keys = (sort keys %{$self->{'contigs'}}); + return @keys; +} + +=head2 get_class($contig_keyname) + + Title : get_class($contig_keyname) + Usage : $o_consed->get_class($contig_keyname); + Function: Return the class name for this contig + Returns : A scalar representing the class of this contig. + Args : None. + Notes : + +=cut + +sub get_class { + my ($self,$contig) = @_; + return $self->{contigs}->{$contig}->{class}; +} + +=head2 get_quality_array($contig_keyname) + + Title : get_quality_array($contig_keyname) + Usage : $o_consed->get_quality_array($contig_keyname); + Function: Returns the quality for the consensus sequence for the given + contig as an array. See get_quality_scalar to get this as a scalar. + Returns : An array containing the quality for the consensus sequence with + the given keyname. + Args : The keyname of a contig. Note: This is a keyname. The key would + normally come from get_contigs. + Notes : Returns an array, not a reference. Is this a bug? <thinking> No. + Well, maybe. + Why was this developed like this? I was using FreezeThaw for object + persistence, and when it froze out these arrays it took a long time + to thaw it. Much better as a scalar. + +See L<get_quality_scalar()|get_quality_scalar> + +=cut + +sub get_quality_array { + my ($self,$contig) = @_; + my $quality = $self->{contigs}->{$contig}->{quality}; + # chad, what is with the s/// ? + # my @qualities = split + # (' ',($self->{contigs}->{$contig}->{quality} =~ s/\s+//)); + my @qualities = split + (' ',$self->{contigs}->{$contig}->{quality}); + return @qualities; +} + +=head2 get_quality_scalar($contig_keyname)) + + Title : get_quality_scalar($contig_keyname) + Usage : $o_consed->get_quality_scalar($contig_keyname); + Function: Returns the quality for the consensus sequence for the given + contig as a scalar. See get_quality_array to get this as an array. + Returns : An scalar containing the quality for the consensus sequence with + the given keyname. + Args : The keyname of a contig. Note this is a _keyname_. The key would + normally come from get_contigs. + Notes : Why was this developed like this? I was using FreezeThaw for object + persistence, and when it froze out these arrays it took a coon's age + to thaw it. Much better as a scalar. + +See L<get_quality_array()|get_quality_array> + +=cut + +#' +sub get_quality_scalar { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'quality'}; +} + +=head2 freeze_hash() + + Title : freeze_hash() + Usage : $o_consed->freeze_hash(); + + Function: Use Ilya's FreezeThaw module to create a persistent data + object for this Bio::Tools::Alignment::Consed data + structure. In the case of AAFC, we use + Bio::Tools::Alignment::Consed to pre-process bunches of + sequences, freeze the structures, and send in a harvesting + robot later to do database stuff. + Returns : 0 or 1; + Args : None. + Notes : This procedure was removed so Consed.pm won't require + FreezeThaw. + + +=cut + +#' +sub freeze_hash { + my $self = shift; + $self->warn("This method (freeze_hash) was removed from the bioperl consed.pm. Sorry.\n"); + if (1==2) { + $self->debug("Bio::Tools::Alignment::Consed::freeze_hash: \$self->{path} is $self->{path}\n"); + my $filename = $self->{'path'}."frozen"; + my %contigs = %{$self->{'contigs'}}; + my $frozen = freeze(%contigs); + umask 0001; + open (FREEZE,">$filename") or do { + $self->warn( "Bio::Tools::Alignment::Consed could not freeze the contig hash because the file ($filename) could not be opened: $!\n"); + return 1; + }; + print FREEZE $frozen; + close FREEZE; + return 0; + } +} + +=head2 get_members($contig_keyname) + + Title : get_members($contig_keyname) + Usage : $o_consed->get_members($contig_keyname); + Function: Return the _names_ of the reads in this contig. + Returns : An array containing the names of the reads in this contig. + Args : The keyname of a contig. Note this is a keyname. The keyname + would normally come from get_contigs. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_members { + my ($self,$contig) = @_; + if (!$contig) { + $self->warn("You need to provide the name of a contig to use Bio::Tools::Alignment::Consed::get_members!\n"); + return; + } + return @{$self->{'contigs'}->{$contig}->{'member_array'}}; +} + +=head2 get_members_by_name($some_arbitrary_name) + + Title : get_members_by_name($some_arbitrary_name) + Usage : $o_consed->get_members_by_name($some_arbitrary_name); + Function: Return the names of the reads in a contig. This is the name given + to $contig{key} based on what is in the contig. This is different + from the keys retrieved through get_contigs(). + Returns : An array containing the names of the reads in the contig with this + name. + Args : The name of a contig. Not a key, but a name. + Notes : Highly inefficient. use some other method if possible. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_members_by_name { + my ($self,$name) = @_; + # build a list to try to screen for redundancy + my @contigs_with_that_name; + foreach my $currkey ( sort keys %{$self->{'contigs'}} ) { + next if (!$self->{'contigs'}->{$currkey}->{'name'}); + if ($self->{'contigs'}->{$currkey}->{'name'} eq "$name") { + push @contigs_with_that_name,$currkey; + } + } + my $count = @contigs_with_that_name; + if ($count == 1) { + my $contig_num = $contigs_with_that_name[0]; + return @{$self->{'contigs'}->{$contig_num}->{'member_array'}}; + } +} + +=head2 get_contig_number_by_name($some_arbitrary_name) + + Title : get_contig_number_by_name($some_arbitrary_name) + Usage : $o_consed->get_contig_number_by_name($some_arbitrary_name); + Function: Return the names of the reads in a contig. This is the name given + to $contig{key} based on what is in the contig. This is different + from the keys retrieved through get_contigs(). + Returns : An array containing the names of the reads in the contig with this + name. + Args : The name of a contig. Not a key, but a name. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_contig_number_by_name { + my ($self,$name) = @_; + foreach my $currkey (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$currkey}->{'name'} && + $self->{'contigs'}->{$currkey}->{'name'} eq "$name") { + return $currkey; + } + } +} + +=head2 get_sequence($contig_keyname) + + Title : get_sequence($contig_keyname) + Usage : $o_consed->get_sequence($contig_keyname); + Function: Returns the consensus sequence for a given contig. + Returns : A scalar containing a sequence. + Args : The keyname of a contig. Note this is a key. The key would + normally come from get_contigs. + +See L<get_contigs()|get_contigs> + +=cut + +sub get_sequence { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'consensus'}; +} + +=head2 set_final_sequence($some_sequence) + + Title : set_final_sequence($name,$some_sequence) + Usage : $o_consed->set_final_sequence($name,$some_sequence); + Function: Provides a manual way to set the sequence for a given key in the + contig hash. Rarely used. + Returns : 0 or 1; + Args : The name (not the keyname) of a contig and an arbitrary string. + Notes : A method with a questionable and somewhat mysterious origin. May + raise the dead or something like that. + +=cut + +sub set_final_sequence { + my ($self,$name,$sequence) = @_; + if (!$self->{'contigs'}->{$name}) { + $self->warn("You cannot set the final sequence for $name because it doesn't exist!\n"); + return 1; + } + else { + $self->{'contigs'}->{$name}->{'final_sequence'} = $sequence; + } + return 0; +} + +=head2 _read_file() + + Title : _read_file(); + Usage : _read_file(); + Function: An internal subroutine used to read in an acefile and parse it + into a Bio::Tools::Alignment::Consed object. + Returns : 0 or 1. + Args : Nothing. + Notes : This routine creates and saves the filhandle for reading the + files in {fh} + +=cut + +sub _read_file { + my ($self) = @_; + my ($line,$in_contig,$in_quality,$contig_number,$top); + # make it easier to type $fhl + while (defined($line=$self->_readline()) ) { + chomp $line; + # check if there is anything on this line + # if not, you can stop gathering consensus sequence + if (!$line) { + # if the line is blank you are no longer to gather consensus + # sequence or quality values + $in_contig = 0; + $in_quality = 0; + } + # you are currently gathering consensus sequence + elsif ($in_contig) { + if ($in_contig == 1) { + $self->debug("Adding $line to consensus of contig number $contig_number.\n"); + $self->{'contigs'}->{$contig_number}->{'consensus'} .= $line; + } + } + elsif ($in_quality) { + if (!$line) { + $in_quality = undef; + } + else { + # I wrote this in here because acefiles produced by cap3 do not have a leading space + # like the acefiles produced by phrap and there is the potential to have concatenated + # quality values like this: 2020 rather then 20 20 whre lines collide. Thanks Andrew for noticing. + if ($self->{'contigs'}->{$contig_number}->{'quality'} && !($self->{'contigs'}->{$contig_number}->{'quality'} =~ m/\ $/)) { + $self->{'contigs'}->{$contig_number}->{'quality'} .= " "; + } + $self->{'contigs'}->{$contig_number}->{'quality'} .= $line; + } + } + elsif ($line =~ /^BQ/) { + $in_quality = 1; + } + # the line /^CO/ like this: + # CO Contig1 796 1 1 U + # can be broken down as follows: + # CO - Contig! + # Contig1 - the name of this contig + # 796 - Number of bases in this contig + # 1 - Number of reads in this contig + # 1 - number of base segments in this contig + # U - Uncomplemented + elsif ($line =~ /^CO/) { + $line =~ m/^CO\ Contig(\d+)\ \d+\ \d+\ \d+\ (\w)/; + $contig_number = $1; + if ($2 eq "C") { + $self->debug("Contig $contig_number is complemented!\n"); + } + $self->{'contigs'}->{$contig_number}->{'member_array'} = []; + $self->{'contigs'}->{$contig_number}->{'contig_direction'} = "$2"; + $in_contig = 1; + } + # 000713 + # this BS is deprecated, I think. + # haha, I am really witty. <ew> + elsif ($line =~ /^BSDEPRECATED/) { + $line =~ m/^BS\s+\d+\s+\d+\s+(.+)/; + my $member = $1; + $self->{'contigs'}->{$contig_number}->{$member}++; + } + # the members of the contigs are determined by the AF line in the ace file + elsif ($line =~ /^AF/) { + $self->debug("I see an AF line here.\n"); + $line =~ /^AF\ (\S+)\ (\w)\ (\S+)/; + # push the name of the current read onto the member array for this contig + push @{$self->{'contigs'}->{$contig_number}->{'member_array'}},$1; + # the first read in the contig will be named the "top" read + if (!$top) { + $self->debug("\$top is not set.\n"); + if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { + $self->debug("Reversing the order of the reads. The bottom will be $1\n"); + # if the contig sequence is marked as the complement, the top becomes the bottom and$ + $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; + } + else { + $self->debug("NOT reversing the order of the reads. The top_name will be $1\n"); + # if the contig sequence is marked as the complement, the top becomes the bottom and$ + $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; + } + $top = 1; + } + else { + # if the contig sequence is marked as the complement, the top becomes the bottom and the bottom becomes the top + if ($self->{'contigs'}->{$contig_number}->{'contig_direction'} eq "C") { + $self->debug("Reversing the order of the reads. The top will be $1\n"); + $self->{'contigs'}->{$contig_number}->{'top_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'top_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'top_start'} = $3; + } + else { + $self->debug("NOT reversing the order of the reads. The bottom will be $1\n"); + $self->{'contigs'}->{$contig_number}->{'bottom_name'} = $1; + $self->{'contigs'}->{$contig_number}->{'bottom_complement'} = $2; + $self->{'contigs'}->{$contig_number}->{'bottom_start'} = $3; + } + $top = undef; + } + } + } + return 0; +} + +=head2 set_reverse_designator($some_string) + + Title : set_reverse_designator($some_string) + Usage : $o_consed->set_reverse_designator($some_string); + Function: Set the designator for the reverse read of contigs in this + Bio::Tools::Alignment::Consed object. Used to determine if + contigs containing two reads can be named. + Returns : The value of $o_consed->{reverse_designator} so you can check + to see that it was set properly. + Args : An arbitrary string. + Notes : May be useful only to me. <shrug> + +=cut + +sub set_reverse_designator { + my ($self,$reverse_designator) = @_; + $self->{'reverse_designator'} = $reverse_designator; + $self->{'o_trim'}->set_reverse_designator($reverse_designator); + return $self->{'reverse_designator'}; +} # end set_reverse_designator + +=head2 set_forward_designator($some_string) + + Title : set_forward_designator($some_string) + Usage : $o_consed->set_forward_designator($some_string); + Function: Set the designator for the forward read of contigs in this + Bio::Tools::Alignment::Consed object. Used to determine if + contigs containing two reads can be named. + Returns : The value of $o_consed->{forward_designator} so you can check + to see that it was set properly. + Args : An arbitrary string. + Notes : May be useful only to me. <shrug> + +=cut + +sub set_forward_designator { + my ($self,$forward_designator) = @_; + $self->{'forward_designator'} = $forward_designator; + $self->{'o_trim'}->set_forward_designator($forward_designator); + return $self->{'forward_designator'}; +} # end set_forward_designator + +=head2 set_designator_ignore_case("yes") + + Title : set_designator_ignore_case("yes") + Usage : $o_consed->set_designator_ignore_case("yes"); + Function: Deprecated. + Returns : Deprecated. + Args : Deprecated. + Notes : Deprecated. Really. Trust me. + +=cut + +sub set_designator_ignore_case { + my ($self,$ignore_case) = @_; + if ($ignore_case eq "yes") { + $self->{'designator_ignore_case'} = 1; + } + return $self->{'designator_ignore_case'}; +} # end set_designator_ignore_case + +=head2 set_trim_points_singlets_and_singletons() + + Title : set_trim_points_singlets_and_singletons() + Usage : $o_consed->set_trim_points_singlets_and_singletons(); + Function: Set the trim points for singlets and singletons based on + quality. Uses the Bio::Tools::Alignment::Trim object. Use + at your own risk because the Bio::Tools::Alignment::Trim + object was designed specifically for me and is mysterious + in its ways. Every time somebody other then me uses it a + swarm of locusts decends on a small Central American + village so do not say you weren't warned. + Returns : Nothing. + Args : None. + Notes : Working on exceptions and warnings here. + +See L<Bio::Tools::Alignment::Trim> for more information + +=cut + +#' to make my emacs happy + +sub set_trim_points_singlets_and_singletons { + my ($self) = @_; + $self->debug("Consed.pm : \$self is $self\n"); + my (@points,$trimmed_sequence); + if (!$self->{'doublets_set'}) { + $self->debug("You need to set the doublets before you use set_trim_points_singlets_and_doublets. Doing that now."); + $self->set_doublets(); + } + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{'class'} eq "singlet") { + $self->debug("Singlet $_\n"); + # this is what Warehouse wants + # my ($self,$sequence,$quality,$name) = @_; + # this is what Bio::Tools::Alignment::Trim::trim_singlet wants: + # my ($self,$sequence,$quality,$name,$class) = @_; + # the following several lines are to make the parameter passing legible. + my ($sequence,$quality,$name,$class); + $sequence = $self->{'contigs'}->{$_}->{'consensus'}; + if (!$self->{'contigs'}->{$_}->{'quality'}) { $quality = "unset"; } + else { $quality = $self->{'contigs'}->{$_}->{'quality'}; } + $name = $self->{'contigs'}->{$_}->{'name'}; + $class = $self->{'contigs'}->{$_}->{'class'}; + (@points) = @{$self->{'o_trim'}->trim_singlet($sequence,$quality,$name,$class)}; + $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; + $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; + $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); + } + } + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_singlets_and_singletons: Done setting the quality trimpoints.\n"); + return; +} # end set_trim_points_singlet + +=head2 set_trim_points_doublets() + + Title : set_trim_points_doublets() + Usage : $o_consed->set_trim_points_doublets(); + Function: Set the trim points for doublets based on quality. Uses the + Bio::Tools::Alignment::Trim object. Use at your own risk because + the Bio::Tools::Alignment::Trim object was designed specifically + for me and is mysterious in its ways. Every time somebody other + then me uses it you risk a biblical plague being loosed on your + city. + Returns : Nothing. + Args : None. + Notes : Working on exceptions here. + +See L<Bio::Tools::Alignment::Trim> for more information + +=cut + +sub set_trim_points_doublets { + my $self = shift; + my @points; + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Restoring zeros for doublets.\n"); + # &show_missing_sequence($self); + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Setting doublet trim points.\n"); + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{'class'} eq "doublet") { + # my ($self,$sequence,$quality,$name,$class) = @_; + my @quals = split(' ',$self->{'contigs'}->{$_}->{'quality'}); + + (@points) = $self->{o_trim}->trim_doublet($self->{'contigs'}->{$_}->{'consensus'},$self->{'contigs'}->{$_}->{'quality'},$self->{'contigs'}->{$_}->{name},$self->{'contigs'}->{$_}->{'class'}); + $self->{'contigs'}->{$_}->{'start_point'} = $points[0]; + $self->{'contigs'}->{$_}->{'end_point'} = $points[1]; + # now set this + $self->{'contigs'}->{$_}->{'sequence_trimmed'} = substr($self->{contigs}->{$_}->{'consensus'},$points[0],$points[1]-$points[0]); + # 010102 the deprecated way to do things: + } + } + $self->debug("Bio::Tools::Alignment::Consed::set_trim_points_doublets: Done setting doublet trim points.\n"); + return; +} # end set_trim_points_doublets + +=head2 get_trimmed_sequence_by_name($name) + + Title : get_trimmed_sequence_by_name($name) + Usage : $o_consed->get_trimmed_sequence_by_name($name); + Function: Returns the trimmed_sequence of a contig with {name} eq $name. + Returns : A scalar- the trimmed sequence. + Args : The {name} of a contig. + Notes : + +=cut + +sub get_trimmed_sequence_by_name { + my ($self,$name) = @_; + my $trimmed_sequence; + my $contigname = &get_contig_number_by_name($self,$name); + my $class = $self->{'contigs'}->{$contigname}->{'class'}; + # what is this business and who was smoking crack while writing this? + # if ($class eq "singlet") { + # send the sequence, the quality, and the name + # $trimmed_sequence = $self->{o_trim}->trim_singlet($self->{'contigs'}->{$contigname}->{consensus},$self->{'contigs'}->{$contigname}->{'quality'},$name); + # } + return $self->{'contigs'}->{$contigname}->{'sequence_trimmed'}; +} + +=head2 set_dash_present_in_sequence_name("yes") + + Title : set_dash_present_in_sequence_name("yes") + Usage : $o_consed->set_dash_present_in_sequence_name("yes"); + Function: Deprecated. Part of an uncompleted thought. ("Oooh! Shiny!") + Returns : Nothing. + Args : "yes" to set {dash_present_in_sequence_name} to 1 + Notes : + +=cut + +sub set_dash_present_in_sequence_name { + my ($self,$dash_present) = @_; + if ($dash_present eq "yes") { + $self->{'dash_present_in_sequence_name'} = 1; + } + else { + $self->{'dash_present_in_sequence_name'} = 0; + } + return $self->{'dash_present_in_sequence_name'}; +} # end set_dash_present_in_sequence_name + +=head2 set_doublets() + + Title : set_doublets() + Usage : $o_consed->set_doublets(); + Function: Find pairs that have similar names and mark them as doublets + and set the {name}. + Returns : 0 or 1. + Args : None. + Notes : A complicated subroutine that iterates over the + Bio::Tools::Alignment::Consed looking for contigs of 2. If the + forward and reverse designator are removed from each of the reads + in {'member_array'} and the remaining reads are the same, {name} + is set to that name and the contig's class is set as "doublet". + If any of those cases fail the contig is marked as a "pair". + +=cut + +#' make my emacs happy + +sub set_doublets { + my ($self) = @_; + # set the designators in the Bio::Tools::Alignment::Trim object + + $self->{'o_trim'}->set_designators($self->{'reverse_designator'}, + $self->{'forward_designator'}); + # + foreach my $key_contig (sort keys %{$self->{'contigs'}}) { + + # if there is a member array (why would there not be? This should be a die()able offence + # but for now I will leave it + if ($self->{'contigs'}->{$key_contig}->{'member_array'}) { + # if there are two reads in this contig + # i am pretty sure that this is wrong but i am keeping it for reference + # if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2 || !$self->{'contigs'}->{$key_contig}->{'class'}) { + # <seconds later> + # <nod> WRONG. Was I on crack? + if (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 2) { + $self->{'contigs'}->{$key_contig}->{'num_members'} = 2; + $self->debug("\tThere are 2 members! Looking for the contig name...\n"); + my $name = _get_contig_name($self,$self->{'contigs'}->{$key_contig}->{'member_array'}); + $self->debug("The name is $name\n") if defined $name; + if ($name) { + $self->{'contigs'}->{$key_contig}->{'name'} = $name; + $self->{'contigs'}->{$key_contig}->{'class'} = "doublet"; + } + else { + $self->debug("$key_contig is a pair.\n"); + $self->{'contigs'}->{$key_contig}->{'class'} = "pair"; + } + } + # this is all fair and good but what about singlets? + # they have one reads in the member_array but certainly are not singletons + elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} == 1) { + # set the name to be the name of the read + $self->{'contigs'}->{$key_contig}->{name} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}[0]; + # set the number of members to be one + $self->{'contigs'}->{$key_contig}->{num_members} = 1; + # if this was a singlet, it would already belong to the class "singlet" + # so leave it alone + # if it is not a singlet, it is a singleton! lablel it appropriately + unless ($self->{'contigs'}->{$key_contig}->{'class'}) { + $self->{'contigs'}->{$key_contig}->{'class'} = "singleton"; + } + } + # set the multiplet characteristics + elsif (@{$self->{'contigs'}->{$key_contig}->{'member_array'}} >= 3) { + $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; + $self->{'contigs'}->{$key_contig}->{'class'} = "multiplet"; + } + $self->{'contigs'}->{$key_contig}->{'num_members'} = @{$self->{'contigs'}->{$key_contig}->{'member_array'}}; + + } + } + $self->{'doublets_set'} = "done"; + return 0; +} # end set_doublets + +=head2 set_singlets + + Title : set_singlets + Usage : $o_consed->set_singlets(); + Function: Read in a singlets file and place them into the + Bio::Tools::Alignment::Consed object. + Returns : Nothing. + Args : A scalar to turn on verbose parsing of the singlets file. + Notes : + +=cut + +sub set_singlets { + # parse out the contents of the singlets file + my ($self) = @_; + $self->debug("Bio::Tools::Alignment::Consed Adding singlets to the contig hash...\n"); + my $full_filename = $self->{'filename'}; + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: \$full_filename is $full_filename\n"); + $full_filename =~ m/(.*\/)(.*ace.*)$/; + my ($base_path,$filename) = ($1,$2); + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: singlets filename is $filename and \$base_path is $base_path\n"); + $filename =~ m/(.*)ace.*$/; + my $singletsfile = $base_path.$1."singlets"; + $self->debug("\$singletsfile is $singletsfile\n"); + if (-f $singletsfile) { + $self->debug("$singletsfile is indeed a file. Trying to open it...\n"); + } + my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); + my ($sequence,$name,$count); + while ($_ = $singlets_fh->_readline()) { + chomp $_; + if (/\>/) { + if ($name && $sequence) { + $self->debug("Adding $name with sequence $sequence to hash...\n"); + push @{$self->{'contigs'}->{$name}->{'member_array'}},$name; + $self->{'contigs'}->{$name}->{'consensus'} = $sequence; + $self->{'contigs'}->{$name}->{'name'} = $name; + $self->{'contigs'}->{$name}->{"singlet"} = 1; + $self->{'contigs'}->{$name}->{'class'} = "singlet"; + } + $sequence = $name = undef; + $count++; + m/^\>(.*)\s\sCHROMAT/; + $name = $1; + if (!$name) { + m/\>(\S+)\s/; + $name = $1; + } + } + else { $sequence .= $_; } + } + if ($name && $sequence) { + $self->debug("Pushing the last of the singlets ($name)\n"); + @{$self->{'contigs'}->{$name}->{'member_array'}} = $name; + $self->{'contigs'}->{$name}->{'consensus'} = $sequence; + $self->{'contigs'}->{$name}->{'name'} = $name; + $self->{'contigs'}->{$name}->{"singlet"} = 1; + $self->{'contigs'}->{$name}->{'class'} = "singlet"; + } + $self->debug("Bio::Tools::Alignment::Consed::set_singlets: Done adding singlets to the singlets hash.\n"); + $self->{'singlets_set'} = "done"; + return 0; +} # end sub set_singlets + +=head2 get_singlets() + + Title : get_singlets() + Usage : $o_consed->get_singlets(); + Function: Return the keynames of the singlets. + Returns : An array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "singlet". + Args : None. + Notes : + +=cut + +sub get_singlets { + # returns an array of singlet names + # singlets have "singlet"=1 in the hash + my $self = shift; + if (!$self->{singlets_set}) { + $self->debug("You need to set the singlets before you get them. Doing that now."); + $self->set_singlets(); + } + + my (@singlets,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + # @array = @{$Consed::contigs{$key}->{'member_array'}}; + # somethimes a user will try to get a list of singlets before the classes for the rest of the + # contigs has been set (see t/test.t for how I figured this out. <bah> + # so either way, just return class=singlets + if (!$self->{'contigs'}->{$key}->{'class'}) { + # print("$key has no class. why?\n"); + } + elsif ($self->{'contigs'}->{$key}->{'class'} eq "singlet") { + push @singlets,$key; + } + } + return @singlets; +} + +=head2 set_quality_by_name($name,$quality) + + Title : set_quality_by_name($name,$quality) + Usage : $o_consed->set_quality_by_name($name,$quality); + Function: Deprecated. Make the contig with {name} have {'quality'} $quality. + Probably used for testing. + Returns : Nothing. + Args : The name of a contig and a scalar for its quality. + Notes : Deprecated. + +=cut + +sub set_quality_by_name { + # this is likely deprecated + my ($self,$name,$quality) = shift; + my $return; + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'} eq "$name" || $self->{'contigs'}->{'name'} eq "$name") { + $self->{'contigs'}->{'quality'} = $quality; + $return=1; + } + } + if ($return) { return "0"; } else { return "1"; } +} # end set quality by name + +=head2 set_singlet_quality() + + Title : set_singlet_quality() + Usage : $o_consed->set_singlet_quality(); + Function: For each singlet, go to the appropriate file in phd_dir and read + in the phred quality for that read and place it into {'quality'} + Returns : 0 or 1. + Args : None. + Notes : This is the next subroutine that will receive substantial revision + in the next little while. It really should eval the creation of + Bio::Tools::Alignment::Phred objects and go from there. + +=cut + +sub set_singlet_quality { + my $self = shift; + my $full_filename = $self->{'filename'}; + $full_filename =~ m/(.*\/)(.*)ace.*$/; + my ($base_path,$filename) = ($1,"$2"."qual"); + my $singletsfile = $base_path.$filename; + if (-f $singletsfile) { + # print("$singletsfile is indeed a file. Trying to open it...\n"); + } + else { + $self->warn("$singletsfile is not a file. Sorry.\n"); + return; + } + my $singlets_fh = Bio::Root::IO->new(-file => $singletsfile); + my ($sequence,$name,$count); + my ($identity,$line,$quality,@qline); + while ($line = $singlets_fh->_readline()) { + chomp $line; + if ($line =~ /^\>/) { + $quality = undef; + $line =~ m/\>(\S*)\s/; + $identity = $1; + } + else { + if ($self->{'contigs'}->{$identity}) { + $self->{'contigs'}->{$identity}->{'quality'} .= "$line "; + } + } + + } + return 0; +} + +=head2 set_contig_quality() + + Title : set_contig_quality() + Usage : $o_consed->set_contig_quality(); + Function: Deprecated. + Returns : Deprecated. + Args : Deprecated. + Notes : Deprecated. Really. Trust me. + +=cut + +sub set_contig_quality { + # note: contigs _include_ singletons but _not_ singlets + my ($self) = shift; + # the unexpected results I am referring to here are a doubling of quality values. + # the profanity I uttered on discovering this reminded me of the simpsons: + # Ned Flanders: "That is the loudest profanity I have ever heard!" + $self->warn("set_contig_quality is deprecated and will likely produce unexpected results"); + my $full_filename = $self->{'filename'}; + # Run_SRC3700_2000-08-01_73+74.fasta.screen.contigs.qual + # from Consed.pm + $full_filename =~ m/(.*\/)(.*)ace.*$/; + my ($base_path,$filename) = ($1,"$2"."contigs.qual"); + my $singletsfile = $base_path.$filename; + if (-f $singletsfile) { + # print("$singletsfile is indeed a file. Trying to open it...\n"); + } + else { + $self->warn("Bio::Tools::Alignment::Consed::set_contig_quality $singletsfile is not a file. Sorry.\n"); + return; + } + my $contig_quality_fh = Bio::Root::IO->new(-file => $singletsfile); + + my ($sequence,$name,$count,$identity,$line,$quality); + while ($line = $contig_quality_fh->_readline()) { + chomp $line; + if ($line =~ /^\>/) { + $quality = undef; + $line =~ m/\>.*Contig(\d+)\s/; + $identity = $1; + } + else { + if ($self->{'contigs'}->{$identity} ) { + $self->{'contigs'}->{$identity}->{'quality'} .= " $line"; + } + } + } +} # end set_contig_quality + +=head2 get_multiplets() + + Title : get_multiplets() + Usage : $o_consed->get_multiplets(); + Function: Return the keynames of the multiplets. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "multiplet". + Args : None. + Notes : + +=cut + +sub get_multiplets { + # returns an array of multiplet names + # multiplets have # members > 2 + my $self = shift; + my (@multiplets,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'class'}) { + if ($self->{'contigs'}->{$key}->{'class'} eq "multiplet") { + push @multiplets,$key; + } + } + } + return @multiplets; +} + +=head2 get_all_members() + + Title : get_all_members() + Usage : @all_members = $o_consed->get_all_members(); + Function: Return a list of all of the read names in the + Bio::Tools::Alignment::Consed object. + Returns : An array containing all of the elements in all of the + {'member_array'}s. + Args : None. + Notes : + +=cut + +sub get_all_members { + my $self = shift; + my @members; + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($key =~ /^singlet/) { + push @members,$self->{'contigs'}->{$key}->{'member_array'}[0]; + } + elsif ($self->{'contigs'}->{$key}->{'member_array'}) { + push @members,@{$self->{'contigs'}->{$key}->{'member_array'}}; + } + # else { + # print("Bio::Tools::Alignment::Consed: $key is _not_ an array. Pushing $self->{'contigs'}->{$key}->{'member_array'} onto \@members\n"); + # push @members,$self->{'contigs'}->{$key}->{'member_array'}; + # } + } + return @members; +} + +=head2 sum_lets($total_only) + + Title : sum_lets($total_only) + Usage : $statistics = $o_consed->sum_lets($total_only); + Function: Provide numbers for how many sequences were accounted for in the + Bio::Tools::Alignment::Consed object. + Returns : If a scalar is present, returns the total number of + sequences accounted for in all classes. If no scalar passed + then returns a string that looks like this: + Singt/singn/doub/pair/mult/total : 2,0,1(2),0(0),0(0),4 + This example means the following: There were 1 singlets. + There were 0 singletons. There were 1 doublets for a total + of 2 sequences in this class. There were 0 pairs for a + total of 0 sequences in this class. There were 0 + multiplets for a total of 0 sequences in this class. There + were a total of 4 sequences accounted for in the + Bio::Tools::Alignment::Consed object. + Args : A scalar is optional to change the way the numbers are returned. + Notes: + +=cut + +sub sum_lets { + my ($self,$total_only) = @_; + my ($count,$count_multiplets,$multiplet_count); + my $singlets = &get_singlets($self); $count += $singlets; + my $doublets = &get_doublets($self); $count += ($doublets * 2); + my $pairs = &get_pairs($self); $count += ($pairs * 2); + my $singletons = &get_singletons($self); $count += $singletons; + my @multiplets = &get_multiplets($self); + $count_multiplets = @multiplets; + my $return_string; + foreach (@multiplets) { + my $number_members = $self->{'contigs'}->{$_}->{num_members}; + $multiplet_count += $number_members; + } + if ($multiplet_count) { + $count += $multiplet_count; + } + foreach (qw(multiplet_count singlets doublets pairs singletons multiplets count_multiplets)) { + no strict 'refs'; # renege for the block + if (!${$_}) { + ${$_} = 0; + } + } + if (!$multiplet_count) { $multiplet_count = 0; } + if ($total_only) { + return $count; + } + $return_string = "Singt/singn/doub/pair/mult/total : $singlets,$singletons,$doublets(". + ($doublets*2)."),$pairs(".($pairs*2)."),$count_multiplets($multiplet_count),$count"; + return $return_string; +} + +=head2 write_stats() + + Title : write_stats() + Usage : $o_consed->write_stats(); + Function: Write a file called "statistics" containing numbers similar to + those provided in sum_lets(). + Returns : Nothing. Write a file in $o_consed->{path} containing something + like this: + + 0,0,50(100),0(0),0(0),100 + + Where the numbers provided are in the format described in the + documentation for sum_lets(). + Args : None. + Notes : This might break platform independence, I do not know. + +See L<sum_lets()|sum_lets> + +=cut + +sub write_stats { + # worry about platform dependence here? + # oh shucksdarn. + my $self = shift; + my $stats_filename = $self->{'path'}."statistics"; + my $statistics_raw = $self->sum_lets; + (my $statsfilecontents = $statistics_raw) =~ s/.*\ \:\ //g; + umask 0001; + my $fh = new Bio::Root::IO(-file=>"$stats_filename"); + # open(STATSFILE,">$stats_filename") or print("Could not open the statsfile: $!\n"); + $fh->_print("$statsfilecontents"); + # close STATSFILE; + $fh->close(); +} + +=head2 get_singletons() + + Title : get_singletons() + Usage : @singletons = $o_consed->get_singletons(); + Function: Return the keynames of the singletons. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "singleton". + Args : None. + Notes : + +=cut + +sub get_singletons { + # returns an array of singleton names + # singletons are contigs with one member (see consed documentation) + my $self = shift; + my (@singletons,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'class'}) { + # print ("$key class: $self->{'contigs'}->{$key}->{'class'}\n"); + } + else { + # print("$key belongs to no class. why?\n"); + } + if ($self->{'contigs'}->{$key}->{'member_array'}) { + @array = @{$self->{'contigs'}->{$key}->{'member_array'}}; + } + my $num_array_elem = @array; + if ($num_array_elem == 1 && $self->{'contigs'}->{$key}->{'class'} && $self->{'contigs'}->{$key}->{'class'} eq "singleton") { push @singletons,$key; } + } + return @singletons; +} + +=head2 get_pairs() + + Title : get_pairs() + Usage : @pairs = $o_consed->get_pairs(); + Function: Return the keynames of the pairs. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "pair". + Args : None. + Notes : + +=cut + +sub get_pairs { + # returns an array of pair contig names + # a pair is a contig of two where the names do not match + my $self = shift; + my (@pairs,@array); + foreach my $key (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$key}->{'member_array'}) { + if (@{$self->{'contigs'}->{$key}->{'member_array'}} == 2 && + $self->{'contigs'}->{$key}->{'class'} eq "pair") { + push @pairs,$key; + } + } + } + return @pairs; +} + +=head2 get_name($contig_keyname) + + Title : get_name($contig_keyname) + Usage : $name = $o_consed->get_name($contig_keyname); + Function: Return the {name} for $contig_keyname. + Returns : A string. ({name}) + Args : A contig keyname. + Notes : + +=cut + +sub get_name { + my ($self,$contig) = @_; + return $self->{'contigs'}->{$contig}->{'name'}; +} + +=head2 _get_contig_name(\@array_containing_reads) + + Title : _get_contig_name(\@array_containing_reads) + Usage : $o_consed->_get_contig_name(\@array_containing_reads); + Function: The logic for the set_doublets subroutine. + Returns : The name for this contig. + Args : A reference to an array containing read names. + Notes : Depends on reverse_designator. Be sure this is set the way you + intend. + +=cut + +sub _get_contig_name { + my ($self,$r_array) = @_; + my @contig_members = @$r_array; + my @name_nodir; + foreach (@contig_members) { + # how can I distinguish the clone name from the direction label? + # look for $Consed::reverse_designator and $Consed::forward_designator + # what if you do not find _any_ of those? + my $forward_designator = $self->{'forward_designator'} || "f"; + my $reverse_designator = $self->{'reverse_designator'} || "r"; + my $any_hits = /(.+)($forward_designator.*)/ || /(.+)($reverse_designator.*)/||/(.+)(_.+)/; + my $name = $1; + my $suffix = $2; + if ($name) { + # print("\t\$name is $name "); + } + if ($suffix) { + # print("and \$suffix is $suffix.\n"); + } + # Jee, I hope we get a naming convention soon + if ($suffix) { + if ($suffix =~ /^$forward_designator/ || $suffix =~ /^$reverse_designator/) { + push @name_nodir,$name; + } + # bugwatch here! should this be unnested? + else { + push @name_nodir,"$name$suffix"; + } + } + } + # print("\@name_nodir: @name_nodir\n"); + my $mismatch = 0; + for (my $counter=0; $counter<@name_nodir;$counter++) { + next if ($name_nodir[0] eq $name_nodir[$counter]); + $mismatch = 1; + } + if ($mismatch == 0) { + # print("\tYou have a cohesive contig named $name_nodir[0].\n\n"); + return $name_nodir[0]; + } + else { + # print("\tYou have mixed names in this contig.\n\n"); + } +} # end _get_contig_name + +=head2 get_doublets() + + Title : get_doublets() + Usage : @doublets = $o_consed->get_doublets(); + Function: Return the keynames of the doublets. + Returns : Returns an array containing the keynames of all + Bio::Tools::Alignment::Consed sequences in the class "doublet". + Args : None. + Notes : + +=cut + +sub get_doublets { + my $self = shift; + if (!$self->{doublets_set}) { + $self->warn("You need to set the doublets before you can get them. Doing that now."); + $self->set_doublets(); + } + my @doublets; + foreach (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$_}->{name} && $self->{'contigs'}->{$_}->{'class'} eq "doublet") { + push @doublets,$_; + } + } + return @doublets; +} # end get_doublets + +=head2 dump_hash() + + Title : dump_hash() + Usage : $o_consed->dump_hash(); + Function: Use dumpvar.pl to dump out the Bio::Tools::Alignment::Consed + object to STDOUT. + Returns : Nothing. + Args : None. + Notes : I used this a lot in debugging. + +=cut + +sub dump_hash { + my $self = shift; + my $dumper = new Dumpvalue; + $self->debug( "Bio::Tools::Alignment::Consed::dump_hash - ". + "The following is the contents of the contig hash...\n"); + $dumper->dumpValue($self->{'contigs'}); +} + +=head2 dump_hash_compact() + + Title : dump_hash_compact() + Usage : $o_consed->dump_hash_compact(); + Function: Dump out the Bio::Tools::Alignment::Consed object in a compact way. + Returns : Nothing. + Args : Nothing. + Notes : Cleaner then dumpValue(), dumpHash(). I used this a lot in + debugging. + +=cut + +sub dump_hash_compact { + no strict 'refs'; # renege for the block + my ($self,$sequence) = @_; + # get the classes + my @singlets = $self->get_singlets(); + my @singletons = $self->get_singletons(); + my @doublets = $self->get_doublets(); + my @pairs = $self->get_pairs(); + my @multiplets = $self->get_multiplets(); + print("Name\tClass\tMembers\tQuality?\n"); + foreach (@singlets) { + my @members = $self->get_members($_); + print($self->get_name($_)."\tsinglets\t".(join',',@members)."\t"); + if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + + } + foreach (@singletons) { + my @members = $self->get_members($_); + print($self->get_name($_)."\tsingletons\t".(join',',@members)."\t"); + if ($self->{'contigs'}->{$_}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + } + foreach my $pair (@pairs) { + my @members = $self->get_members($pair); + my $name; + if (!$self->get_name($pair)) { + $name = "BLANK"; + } + else { $name = $self->get_name($pair); } + print("$name\tpairs\t".(join',',@members)."\n"); + } + foreach (@doublets) { + my @members = $self->get_members_by_name($_); + print("$_\tdoublets\t".(join',',@members)."\t"); + my $contig_number = &get_contig_number_by_name($self,$_); + if ($self->{'contigs'}->{$contig_number}->{'quality'}) { print("qualities found here\n"); } + else { print("no qualities found here\n"); } + # print($_."\tdoublets\t".(join',',@members)."\n"); + } + foreach (@multiplets) { + my @members = $self->get_members($_); + print("Contig $_"."\tmultiplets\t".(join',',@members)."\n"); + } +} # end dump_hash_compact + +=head2 get_phreds() + + Title : get_phreds() + Usage : @phreds = $o_consed->get_phreds(); + Function: For each doublet in the Bio::Tools::Alignment::Consed hash, go + and get the phreds for the top and bottom reads. Place them into + {top_phreds} and {bottom_phreds}. + Returns : Nothing. + Args : Nothing. + Notes : Requires parse_phd() and reverse_and_complement(). I realize that + it would be much more elegant to pull qualities as required but there + were certain "features" in the acefile that required a bit more + detailed work be done to get the qualities for certain parts of the + consensus sequence. In order to make _sure_ that this was done + properly I wrote things to do all steps and then I used dump_hash() + and checked each one to ensure expected bahavior. I have never changed + this, so there you are. + +=cut + +sub get_phreds { + # this subroutine is the target of a rewrite to use the Bio::Tools::Alignment::Phred object. + my $self = shift; + my $current_contig; + foreach $current_contig (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { + $self->debug("$current_contig is a doublet. Going to parse_phd for top($self->{'contigs'}->{$current_contig}->{'top_name'}) and bottom($self->{'contigs'}->{$current_contig}->{'bottom_name'})\n"); + my $r_phreds_top = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'top_name'}); + my $r_phreds_bottom = &parse_phd($self,$self->{'contigs'}->{$current_contig}->{'bottom_name'}); + if ($self->{'contigs'}->{$current_contig}->{'top_complement'} eq "C") { + # print("Reversing and complementing...\n"); + $r_phreds_top = &reverse_and_complement($r_phreds_top); + } + if ($self->{'contigs'}->{$current_contig}->{'bottom_complement'} eq "C") { + $r_phreds_bottom = &reverse_and_complement($r_phreds_bottom); + } + $self->{'contigs'}->{$current_contig}->{'top_phreds'} = $r_phreds_top; + $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = $r_phreds_bottom; + } + } +} + +=head2 parse_phd($read_name) + + Title : parse_phd($read_name) + Usage : $o_consed->parse_phd($read_name); + Function: Suck in the contents of a .phd file. + Returns : A reference to an array containing the quality values for the read. + Args : The name of a read. + Notes : This is a significantly weak subroutine because it was always + intended that these functions, along with the functions provided by + get_phreds() be put into the Bio::SeqIO:phd module. This is done + now but the Bio::Tools::Alignment::Consed module has not be + rewritten to reflect this change. + +See L<Bio::SeqIO::phd> for more information. + +=cut + +sub parse_phd { + my ($self,$sequence_name) = @_; + $self->debug("Parsing phd for $sequence_name\n"); + my $in_dna = 0; + my $base_number = 0; + my (@bases,@current_line); + # print("parse_phd: $sequence_name\n"); + my $fh = new Bio::Root::IO(-file=>"$self->{path}/../phd_dir/$sequence_name.phd.1"); + # open(PHD,"<$self->{path}/../phd_dir/$sequence_name.phd.1") or + # die "Couldn't open the phred for $sequence_name\n"; + while ($fh->_readline()) { + # print("Reading a line from a phredfile!\n"); + chomp; + if (/^BEGIN_DNA/) { $in_dna = 1; next} + if (/^END_DNA/) { last; } + if (!$in_dna) { next; } + push(@bases,$_); + } + return \@bases; +} + +=head2 reverse_and_complement(\@source) + + Title : reverse_and_complement(\@source) + Usage : $reference_to_array = $o_consed->reverse_and_complement(\@source); + Function: A stub for the recursive routine reverse_recurse(). + Returns : A reference to a reversed and complemented array of phred data. + Args : A reference to an array of phred data. + Notes : + +=cut + +sub reverse_and_complement { + my $r_source = shift; + my $r_destination; + $r_destination = &reverse_recurse($r_source,$r_destination); + return $r_destination; +} + +=head2 reverse_recurse($r_source,$r_destination) + + Title : reverse_recurse(\@source,\@destination) + Usage : $o_consed->reverse_recurse(\@source,\@destination); + Function: A recursive routine to reverse and complement an array of phred + data. + Returns : A reference to an array containing reversed phred data. + Args : A reference to a source array and a reverence to a destination + array. + Notes : Recursion is kewl, but this sub should likely be _reverse_recurse. + +=cut + +my $current_base; +sub reverse_recurse($$) { + (my $r_source,my $r_destination) = @_; + if (!@$r_source) { return $r_destination; } + $_=pop(@$r_source); + (s/c/g/||s/g/c/||s/a/t/||s/t/a/); + push(@$r_destination,$_); + &reverse_recurse($r_source,$r_destination); +} + +=head2 show_missing_sequence() + + Title : show_missing_sequence(); + Usage : $o_consed->show_missing_sequence(); + Function: Used by set_trim_points_doublets() to fill in quality values where + consed (phrap?) set them to 0 at the beginning and/or end of the + consensus sequences. + Returns : Nothing. + Args : None. + Notes : Acts on doublets only. Really very somewhat quite ugly. A + disgusting kludge. <insert pride here> It was written stepwise with + no real plan because it was not really evident why consed (phrap?) + was doing this. + +=cut + +sub show_missing_sequence() { + # decide which sequence should not have been clipped at consensus position = 0 + my $self = shift; + &get_phreds($self); + my ($current_contig,@qualities); + foreach $current_contig (sort keys %{$self->{'contigs'}}) { + if ($self->{'contigs'}->{$current_contig}->{'class'} eq "doublet") { + my $number_leading_xs = 0; + my $number_trailing_xs = 0; + my $measurer = $self->{'contigs'}->{$current_contig}->{'quality'}; + while ($measurer =~ s/^\ 0\ /\ /) { + $number_leading_xs++; + } + while ($measurer =~ s/\ 0(\s*)$/$1/) { + $number_trailing_xs++; + } + @qualities = split(' ',$self->{'contigs'}->{$current_contig}->{'quality'}); + my $in_initial_zeros = 0; + for (my $count=0;$count<scalar(@qualities); $count++) { + if ($qualities[$count] == 0) { + my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); + # print("The quality of the consensus at ".($count+1)." is zero. Retrieving the real quality value.\n"); + # how do I know which strand to get these quality values from???? + # boggle + my $top_quality_here = $self->{'contigs'}->{$current_contig}->{'top_phreds'}->[0-$self->{'contigs'}->{$current_contig}->{'top_start'}+$count+1]; + my $bottom_quality_here = $self->{'contigs'}->{$current_contig}->{'bottom_phreds'}->[1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count]; + if (!$bottom_quality_here || (1-$self->{'contigs'}->{$current_contig}->{'bottom_start'}+$count)<0) { + $bottom_quality_here = "not found"; + } + if (!$top_quality_here) { + $top_quality_here = "not found"; + } + # print("Looking for quals at position $count of $current_contig: top position ".(0-$self->{'contigs'}->{$current_contig}->{top_start}+$count)." ($self->{'contigs'}->{$current_contig}->{top_name}) $top_quality_here , bottom position ".(1-$self->{'contigs'}->{$current_contig}->{bottom_start}+$count)." ($self->{'contigs'}->{$current_contig}->{bottom_name}) $bottom_quality_here\n"); + if ($count<$number_leading_xs) { + # print("$count is less then $number_leading_xs so I will get the quality from the top strand\n"); + # print("retrieved quality is ".$self->{'contigs'}->{$current_contig}->{top_phreds}[0-$self->{'contigs'}->{$current_contig}->{top_start}+$count+1]."\n"); + my $quality = $top_quality_here; + $quality =~ /\S+\s(\d+)\s+/; + $quality = $1; + # print("retrieved quality for leading zero $count is $quality\n"); + # t 9 9226 + $qualities[$count] = $quality; + } + else { + # this part is tricky + # if the contig is like this + # cccccccccccccccc + # ffffffffffffffffff + # rrrrrrrrrrrrrrrrr + # then take the quality value for the trailing zeros in the cons. seq from the r + # + # but if the contig is like this + # cccccccccccccccccc + # ffffffffffffffffffffffffffffffff + # rrrrrrrrrrrrrrrrrrrrrrrxxxxxxxxr + # ^^^ + # then any zeros that fall in the positions (^) must be decided whether the quality + # is the qual from the f or r strand. I will use the greater number + # does a similar situation exist for the leading zeros? i dunno + # + # print("$count is greater then $number_leading_xs so I will get the quality from the bottom strand\n"); + # print("retrieved quality is ".$contigs->{$current_contig}->{top_phreds}[0-$contigs->{$current_contig}->{top_start}+$count+1]."\n"); + # my ($quality,$top_phred_position,$bottom_phred_position,$top_phred_data,$bottom_phred_data); + if ($bottom_quality_here eq "not found") { + # $top_phred_position = 1-$contigs->{$current_contig}->{bottom_start}+$count; + # print("Going to get quality from here: $top_phred_position of the top.\n"); + # my $temp_quality - $contigs->{$current_contig}->{top_phreds} + # $quality = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; + $top_quality_here =~ /\w+\s(\d+)\s/; + $quality = $1; + } + elsif ($top_quality_here eq "not found") { + # $bottom_phred_position = 1+$contigs->{$current_contig}->{bottom_start}+$count; + # print("Going to get quality from here: $bottom_phred_position of the bottom.\n"); + # $quality = $contigs->{$current_contig}->{bottom_phreds}[$bottom_phred_position]; + # print("Additional: no top quality but bottom is $quality\n"); + $bottom_quality_here =~ /\w+\s(\d+)\s/; + $quality = $1; + } + else { + # print("Oh jeepers, there are 2 qualities to choose from at this position.\n"); + # print("Going to compare these phred qualities: top: #$top_quality_here# bottom: #$bottom_quality_here#\n"); + # now you have to compare them + # my $top_quality_phred = $contigs->{$current_contig}->{top_phreds}[$top_phred_position]; + # #t 40 875# + # print("regexing #$top_quality_here#... "); + $top_quality_here =~ /\w\ (\d+)\s/; + my $top_quality = $1; + # print("$top_quality\nregexing #$bottom_quality_here#... "); + $bottom_quality_here =~ /\w\ (\d+)\s/; + my $bottom_quality = $1; + # print("$bottom_quality\n"); + # print("top_quality: $top_quality bottom quality: $bottom_quality\n"); + if ($bottom_quality > $top_quality) { + # print("Chose to take the bottom quality: $bottom_quality\n"); + $quality = $bottom_quality; + } else { + # print("Chose to take the top quality: $top_quality\n"); + $quality = $top_quality; + } + } + if (!$quality) { + # print("Warning: no quality value for $current_contig, position $count!\n"); + # print("Additional data: top quality phred: $top_quality_here\n"); + # print("Additional data: bottom quality phred: $bottom_quality_here\n"); + } else { + $qualities[$count] = $quality; + } + } + } + + } + unless (!@qualities) { + $self->{'contigs'}->{$current_contig}->{'quality'} = join(" ",@qualities); + } + $self->{'contigs'}->{$current_contig}->{'bottom_phreds'} = undef; + $self->{'contigs'}->{$current_contig}->{'top_phreds'} = undef; + my $count = 1; + } # end foreach key + } +} + + +# Autoload methods go after =cut, and are processed by the autosplit program. + +1; +__END__ + +=head1 SEE ALSO + +perl(1). + +=cut