Mercurial > repos > fastaptamer > fastaptamer_cluster

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_cluster	Wed Jan 14 19:04:47 2015 -0500
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+
+# Last Modified January 11th, 2015 13:27 CST
+
+use Getopt::Long;   ## Core Perl module for command line options/arguments
+
+my $start = time;           ## Starts program execution timer
+my $infile;                 ## Variable for input file
+my $outfile;                ## Variable for output file
+my $defined_edit_distance;  ## Variable (integer) for user-defined edit distance
+my $help;                   ## true/false variable for help screen
+my $quiet;                  ## true/false to suppress standard output
+my $filter = 0;				## Variable to define a read filter for entries
+my $version;                ## true/false variable for version screen
+
+                                           ## Take command line arguments for...
+GetOptions (    "infile=s" => \$infile,    ## ... input file
+                "outfile=s" => \$outfile,  ## ... output file
+                "distance=i" => \$defined_edit_distance,  ## ... edit distance
+                "filter=f" => \$filter,    ## ... read filter
+                "help" => \$help,          ## ... help screen
+                "version" => \$version,    ## ... version screen
+                "quiet" => \$quiet);       ## ... supress standard output
+
+if ($help){         ## Prints help screen if $help returns as true
+print <<"HELP";
+
+--------------------------------------------------------------------------------
+                              FASTAptamer-Cluster
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_cluster [-h] [-i INFILE] [-o OUTFILE] [-d] [-f] [-q] [-v]
+
+    [-h]            = Help screen.
+    [-i INFILE]     = Input file from fastaptamer_count; required.
+    [-o OUTFILE]    = Output file, FASTA format.
+    [-d]            = Edit distance for clustering sequences.
+    [-f]            = Read filter. Only sequences with total reads greater than
+                      the value supplied will be clustered.
+    [-q]            = Quiet mode.  Suppresses standard output of file I/O, numb-
+                      er of clusters, cluster size and execution time.
+    [-v]            = Display version.
+
+FASTAptamer-Cluster uses the Levenshtein algorithm to cluster together sequences
+based on a user-defined edit distance.  The most abundant and unclustered seque-
+nce is used as the "seed sequence" for which edit distance is calculated from.
+Output is FASTA with the following information on the identifier line:
+
+    >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed
+    SEQUENCE
+
+To prevent clustering of sequences not highly sampled (and improve execution ti-
+me), invoke the read filter and enter a number.  Only sequences with total reads
+greater than the number entered will be clustered.
+
+Input for FASTAptamer-Cluster MUST come from FASTAptamer-Count.
+
+PLEASE NOTE: This is a computationally intense program that can take multiple h-
+ours to finish, depending on the size and complexity of your population. Utilize
+the read filter [-f] to improve execution time.
+
+HELP
+exit;
+}
+
+if (defined $version){                           ## Print version screen if -v is true
+    print <<"VERSION";
+
+FASTAptamer v1.0
+
+VERSION
+exit;
+}
+
+#########################################
+# Open input file or exit with warning  #
+#########################################
+
+open (INPUT, '<', $infile) or die
+"No input file specified.\nSee help [-h] or README for correct usage.\n";
+
+#########################################
+# Open output file or exit with warning #
+#########################################
+
+open (OUTPUT, '>', $outfile) or die
+"No output file specified.\nSee help [-h] or README for correct usage.\n";
+
+##############################################
+# Exit with warning if no distance specified #
+##############################################
+
+unless ($defined_edit_distance) {
+die "No edit distance specified.\nSee help [-h] or README for correct usage.\n"; }
+
+my @sequence_and_info;   ## Array to contain each FASTA entry individually
+my $entries;             ## Variable to keep track of the number of entries
+
+$/= ">";   ## Change default input record separator to read FASTA formatted file
+
+while (<INPUT>){                             ## Reads through entire input file
+    if ($_ =~ /(\d+-\d+-\d+.*\d*\n\w+)/){    ## Regex for FASTAptamer-Count format
+        my @read_count_test = split /-/, $1; ## Splits entry by dashes, loads array
+        if ($read_count_test[1] > $filter){     ## Tests for read count filter
+            push @sequence_and_info, $1;         ## Add each matched entry to array
+            $entries++;                          ## Increase entry count by 1
+        }
+    }
+}
+
+close INPUT;  ## Closes input file
+
+unless ($quiet){  ## Unless -q option is invoked, print this warning and summary
+	print "\n**************************************************";
+    print "\nClustering is a computationally intense process.\n";
+    print "Please be patient as clustering can take several\n";
+    print "hours for a single file.  For faster clustering\n";
+    print "use the read filter [-f] to avoid clustering of low-read.\n";
+    print "sequences.\n";
+    print "**************************************************";
+    print "\n\nTotal number of sequences to cluster: $entries.\n";
+    print "\nCluster\tUnique Sequences\tReads\tRPM\n";
+}
+
+my $current_cluster = 1;  ## This variable defines what cluster we're working on
+
+iterate(@sequence_and_info);  ## Sends array with sequence entries to subroutine
+
+##############################################
+#   SUBROUTINE THAT BEGINS TO PROCESS FILE   #
+# One iteration generates a single cluster   #
+# and then repeats w/  unclustered sequences #
+# for next cluster                           #
+##############################################
+
+sub iterate {
+    my ( $top_entry, @entries ) = @_; ## Takes "seed sequence" and remaining entries
+    my @keepers; ## Array to store unclustered entries
+    my @current_seq_info = split /\n/, $top_entry;
+    ## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence
+    my $cluster_rank = 1; ## Defines that seed sequence is ranked #1
+
+    print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\n$current_seq_info[1]\n";
+    ## Prints current seed sequence in FASTAptamer-Cluster format
+
+	my @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line
+	my $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size
+	my $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM
+
+    for (@entries) { ## for each entry left in array send to calculate distance
+    	my @comparison_seq_info = split /\n/, $_; ## Split entry into metrics and sequence
+    	my @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line
+    	my $comparison_seq_reads = $comparison_seq_metrics[1];  ## Take reads to tally cluster size
+    	my $comparison_seq_rpm = $comparison_seq_metrics[2];  ## Take RPM to tally cluster size
+        my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] );
+        ## sends comparison sequence to compare against current seed sequence, returns distances
+        if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined
+            push @keepers, $_;  ## Add to array for comparison in next iteration
+        }
+        elsif ( $distance <= $defined_edit_distance ){ ## If distance is less than or equal to defined
+        	$cluster_rank++;  ## Increment the cluster rank
+        	$current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally
+        	$current_cluster_rpm += $comparison_seq_rpm;  ## Add RPM to cluster RPM tally
+        	print OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\n";
+        	print OUTPUT "$comparison_seq_info[1]\n"; ## Print entry in output file
+        }
+    }
+
+    unless ($quiet) { print "$current_cluster\t$cluster_rank\t$current_cluster_reads\t$current_cluster_rpm\n"; }
+    ## Display cluster number, number of unique sequences, reads and RPM
+    $current_cluster++; ## Increment cluster number prior to next cluster
+
+    if (@keepers) { ## Subroutine within subroutine for sequences that are unclustered
+        iterate(@keepers);
+    }
+}
+
+my $duration = time - $start; ## Calculates how much time has elapsed since start
+
+unless ($quiet) { ## Displays summary report unless -q is invoked
+    print "\nInput file: \"$infile\".\n";
+    print "Output file: \"$outfile\".\n";
+    print "Execution time: $duration s.\n\n";
+}
+
+############################################################################
+# The subroutines below calculates the Levenshtein edit distance for the   #
+# current "seed" sequence and the comparison sequence that are sent to it  #
+############################################################################
+
+
+sub levenshtein {
+	my ($s1, $s2) = @_;
+	my ($len1, $len2) = (length $s1, length $s2);
+
+	return $len2 if ($len1 == 0);
+	return $len1 if ($len2 == 0);
+
+	my %mat;
+
+	for (my $i = 0; $i <= $len1; ++$i){
+		for (my $j = 0; $j <= $len2; ++$j){
+			$mat{$i}{$j} = 0;
+			$mat{0}{$j} = $j;
+		}
+		$mat{$i}{0} = $i;
+	}
+
+	my @ar1 = split(//, $s1);
+	my @ar2 = split(//, $s2);
+
+	for (my $i = 1; $i <= $len1; ++$i){
+		for (my $j = 1; $j <= $len2; ++$j){
+			my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
+			$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
+			$mat{$i}{$j-1} + 1,
+			$mat{$i-1}{$j-1} + $cost]);
+		}
+	}
+    return $mat{$len1}{$len2};
+}
+
+sub min
+{
+    my @list = @{$_[0]};
+    my $min = $list[0];
+
+    foreach my $i (@list)
+    {
+        $min = $i if ($i < $min);
+    }
+
+    return $min;
+}