changeset 0:32c1b4bb8d17 draft

Uploaded
author fastaptamer
date Wed, 14 Jan 2015 19:00:40 -0500
parents
children 350a7a33158a
files fastaptamer_count
diffstat 1 files changed, 158 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_count	Wed Jan 14 19:00:40 2015 -0500
@@ -0,0 +1,158 @@
+#!/usr/bin/env perl
+
+# Last Modified December 23rd, 2014 21:32 CST
+
+use Getopt::Long;    ## Core Perl module for command line arguments/options
+
+my $start = time;    ## Starts program execution timer
+my $input_fh;        ## Input file handle
+my $output_fh;       ## Output file handle
+my $quiet;           ## true/false variable for summary report supression
+my $help;            ## true/false variable for help screen
+my $version;         ## true/false variable for version screen
+
+                                           ## Take command line arguments for...
+GetOptions (    "input=s" => \$input_fh,        ## ...input file
+                "output=s" => \$output_fh,      ## ...output file
+                "quiet" => \$quiet,             ## ...supressing std. out
+                "version" => \$version,         ## ...showing version screen
+                "help" => \$help);              ## ...showing help screen
+		
+if (defined $help){                           ## Print help screen if -h is true
+    print <<"HELP";
+	
+--------------------------------------------------------------------------------
+                               FASTAptamer-Count
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_count [-h] [-q] [-i INFILE] [-o OUTFILE] [-v]
+
+    [-h]            = Help screen.
+    [-q]            = Suppress STDOUT of run report.
+    [-i INFILE]     = FASTQ input file.
+    [-o OUTFILE]    = FASTA output file. 
+    [-v]            = Display version.
+
+FASTAptamer-Count serves as the gateway to the FASTAptamer toolkit.  For a given
+.FASTQ input file, FASTAptamer-Count will determine the number of times each se-
+quence was read, rank and sort sequences by decreasing total reads, and normali-
+ze sequence frequency to reads per million. Output is generated as a non-redund-
+ant FASTA file in the following format:
+
+        >RANK-READS-RPM
+        SEQUENCE
+		
+Summary report (total reads, unique reads, and execution time) is displayed as 
+STDOUT at program completion, unless [-q] is invoked.
+
+HELP
+exit;
+}
+
+if (defined $version){                           ## Print version screen if -v is true
+    print <<"VERSION";
+	
+FASTAptamer v1.0
+	
+VERSION
+exit;
+}
+
+#########################################	
+# Open input file or exit with warning  #
+#########################################
+
+open (INPUT, '<', $input_fh) or die		
+"\nCould not open input file or no input file was specified.\n
+See help documentation [-h] for program usage.\n";
+
+#########################################
+# Open output file or exit with warning #
+#########################################
+
+open (OUTPUT, '>', $output_fh) or die		
+"\nNo output file specified.\n
+See help documentation [-h] for program usage.\n";
+
+##############################################
+# Create a hash for sequence reads where key #
+# is the sequence and value is read count    #
+##############################################
+
+my %sequence_reads;	
+
+my $entries = 0;    ## Counts the number of entries processed
+
+$/ = "@"; ## Change default input record separator to read FASTQ formatted file	
+
+###############################################################################
+# Read input file and for each entry, add sequence or increment value in hash #
+###############################################################################
+
+while (<INPUT>){
+    if ($_ =~ /.+\n(\S+)\n.+\n.+/){
+        my $sequence = $1;
+        $sequence_reads{$sequence} += 1;
+        $entries++;
+    }
+}
+
+close INPUT;
+
+###########################################
+# Sort hash by decreasing read count by   #
+# converting to arrays of values and keys #
+###########################################
+
+my @keys = sort { $sequence_reads{$b} <=> $sequence_reads{$a} } keys(%sequence_reads);
+my @vals = @sequence_reads{@keys};
+
+#################################################################################
+# Using both arrays, print to output the FASTA formatted file containing        #
+# the sequence rank, reads, reads per million, and sequence. The extra          #
+# scalars here are used to ensure that sequences with equal read counts         #
+# receive the same rank value, and that the next untied value is properly       #
+# assigned a rank that takes into account the number of tied values before it,  #
+# also known as standard competition ranking.                                   #
+#################################################################################
+
+my $last_reads_count = $vals[0];
+my $rank = 1;
+
+for my $i (0..$#keys){
+    my $current_reads_count = $vals[$i];
+    if ($current_reads_count < $last_reads_count){
+        $rank = $i + 1; 
+        print OUTPUT ">$rank-$vals[$i]-";
+        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
+        print OUTPUT "\n";
+        print OUTPUT "$keys[$i]\n";
+		
+    }
+    elsif ($current_reads_count == $last_reads_count){
+        print OUTPUT ">$rank-$vals[$i]-";
+        printf OUTPUT "%.2f", ($current_reads_count/$entries)*(1000000);
+        print OUTPUT "\n";
+        print OUTPUT "$keys[$i]\n";
+    }
+    $last_reads_count = $vals[$i];
+	
+}
+
+close OUTPUT;
+
+#################################################################################
+# Unless the -q option is invoked, the following statistics are printed as      #
+# standard output.                                                              #
+#################################################################################
+
+unless ($quiet){
+    print "\n$entries total sequences.\n" . ($#keys + 1) . " unique sequences.\n";
+    print "\nInput file: \"$input_fh\".\n";
+    print "Output file: \"$output_fh\".\n";
+
+    my $duration = time - $start;
+    print "Execution time: $duration s.\n\n";
+}
+
+exit;