changeset 0:7a29d36b8f85 draft

Uploaded
author fastaptamer
date Wed, 14 Jan 2015 19:08:15 -0500
parents
children 119f3b536f60
files fastaptamer_search
diffstat 1 files changed, 234 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastaptamer_search	Wed Jan 14 19:08:15 2015 -0500
@@ -0,0 +1,234 @@
+#!/usr/bin/env perl
+
+# Last Modified January 11th, 2015 17:09 CST
+
+use Getopt::Long;  # Core Perl module for command line arguments/options
+
+###############################################################################
+
+                ### Variables for command line arguments
+my @inputlist;    # Array of input files
+my $output;       # Output filename (optional)
+my @patternlist;  # Array of pattern(s) to search for
+my $highlight;    # If defined, highlight matches using parens
+my $help;         # If defined, show help dialogue
+my $quiet;        # If defined, suppress report in STDOUT
+my $version;      # display version
+
+### Process command line arguments
+GetOptions (    
+  "input=s"    => \@inputlist,    # input file(s)
+  "output=s"   => \$output,       # output file (optional)
+  "pattern=s"  => \@patternlist,  # pattern(s) to search for
+  "highlight"  => \$highlight,    # highlight matched portion with parens
+  "quiet"      => \$quiet,        # suppressing summary report
+  "version"    => \$version,      # display version
+  "help"       => \$help);        # displaying help information
+
+if(defined $help) {  ### Prints help screen if -help is invoked
+        print <<"HELP";
+
+--------------------------------------------------------------------------------
+                              FASTAptamer-Search
+--------------------------------------------------------------------------------
+
+Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN] 
+
+    [-help]            	= Show help screen and exit.
+    [-i FILENAME]     	= Input filename; can be used multiple times.
+    [-p PATTERN]		= Sequence pattern to search for; can be used
+                          multiple times.
+    [-o FILENAME]   	= Output file for search results. 
+                          If none given, output goes to STDOUT. 
+    [-highlight]        = "Highlight" matched portion in parentheses.
+    [-q]                = Suppress summary report.
+    [-v]                = Display version.
+
+FASTAptamer-Search allows users to search for specific patterns within one or m-
+ore sequence files.
+
+To search through more than one input file, simply use the -i flag multiple tim-
+es. All input files must use FASTA format.
+
+Similarly, to search for multiple patterns simultaneously, use the -p flag as m-
+any times as needed. When searching for multiple patterns, note that partial ma-
+tches are not returned. For example, entering the following command:
+
+fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT
+
+would search FILE1 and FILE2 for any sequences containing both ATTGCC and TGGCAT
+
+Patterns and input sequence data are case insensitive, and T/U are interchangea-
+ble. In addition to single bases, patterns can include any of the degenerate ba-
+se symbols from IUPAC nucleic acid notation:
+
+    A/T/G/C/U    single bases
+
+    R    puRines (A/G)
+    Y    pYrimidines (C/T)
+    W    Weak (A/T)
+    S    Strong (G/C)
+    M    aMino (A/C)
+    K    Keto (G/T)
+
+    B    not A
+    D    not C
+    H    not G
+    V    not T
+
+    N    aNy base (not a gap)
+
+For greater visibility, pattern matches can be highlighted by parentheses in the
+output by calling the -highlight flag.
+
+A summary report is generated after each file's search results and after search 
+completion. To suppress these reports, enable quiet mode using the [-quiet] flag
+
+HELP
+exit;
+
+}
+
+if (defined $version){                           ## Print version screen if -v is true
+    print <<"VERSION";
+	
+FASTAptamer v1.0
+	
+VERSION
+exit;
+}
+
+########################################################################################
+
+unless(@inputlist) { die "\nNo input file was specified.\nSee help documentation [-help] for program usage.\n\n"; }
+
+unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-help] for program usage.\n\n"; }
+
+my $start_time = time;		### Start timer
+
+my $inputcount = scalar(@inputlist);
+my $patterncount = scalar(@patternlist);
+
+### Convert user-entered patterns into regexes
+my @superarray;		### Array of 2-element arrays via references: ([pattern, regex], [...], etc.)
+my $regex;
+foreach my $pattern (@patternlist) {
+	$regex = $pattern;
+        $regex =~ s{[TU]}{\[TU\]}gi;	# Make T and U interchangeable
+	$regex =~ s{W}{\[ATU\]}gi;     	# W = ATU	Regex modifiers after s{}{}:
+        $regex =~ s{S}{\[CG\]}gi;      	# S = CG	g tells the script to find ALL matches rather than stop after the first
+        $regex =~ s{M}{\[AC\]}gi;      	# M = AC	i makes searching case insensitive
+        $regex =~ s{K}{\[GTU\]}gi;     	# K = GTU
+        $regex =~ s{R}{\[AG\]}gi;      	# R = AG
+        $regex =~ s{Y}{\[CTU\]}gi;     	# Y = CTU
+        $regex =~ s{B}{\[CGTU\]}gi;    	# B = CGTU
+        $regex =~ s{D}{\[AGTU\]}gi;    	# D = AGTU
+        $regex =~ s{H}{\[ACTU\]}gi;    	# H = ACTU
+        $regex =~ s{V}{\[ACG\]}gi;     	# V = ACG
+        $regex =~ s{N}{\[ACGTU\]}gi;   	# N = ACGTU
+	### Put the pattern and regex into an array and push the whole thing onto @superarray
+	push(@superarray, [$pattern, $regex]);		# Push $subarray onto the end of @superarray
+	}
+
+my $fh_out;
+if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file.\nSee help documentation [-help] for program usage.\n\n"; }
+
+my $input;
+my $current_input = 0;
+my $subarray;
+my ($line1, $line2);
+my $match_hits = 0;
+my $match_line = '';
+my $match_portion = '';
+my $filehits = 0;
+my $totalhits = 0;
+
+### Loop logic: read one input file at a time and search all patterns through each line
+
+
+
+### FASTA input
+
+foreach $input (@inputlist) {                
+    $current_input++;
+    if(defined $output) {
+        print $fh_out "\n\n--------------------------------------------------------------------------------\n";
+        print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
+        print $fh_out "--------------------------------------------------------------------------------\n\n";
+    }
+    else {
+        print "\n\n--------------------------------------------------------------------------------\n";
+        print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n";
+        print "--------------------------------------------------------------------------------\n\n";
+    }
+    open(my $fh_in, '<', $input) or die "Could not open input file $input";
+
+    while($line1 = <$fh_in>) {
+        $line2 = <$fh_in>;
+        my $not_first_regex = 0;
+        my $hit_confirmed = 0;         
+        foreach $subarray (@superarray) {
+            $regex = $subarray->[1];        ### Get the regex from subarray
+            ###            $1
+            if($line2 =~ m{($regex)}gi) {           ### Search for all matches, case insensitive
+                $match_portion = $1;            ### Portion of sequence that matched was captured in magic variable $1
+                if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; }
+                else { $match_line =~ s{$match_portion}{\($match_portion\)}g; }
+                $not_first_regex = 1;
+                $hit_confirmed++; 
+            }
+        }
+        if(defined $output and $hit_confirmed == $patterncount) {
+            $filehits++;    ### Increment FILE-SPECIFIC hit counter      
+            $totalhits++;   ### Increment OVERALL hit counter
+            if (defined $highlight) { print $fh_out "$line1$match_line"; }
+            else { print $fh_out "$line1$line2"; }
+        }
+        elsif(not defined $output and $hit_confirmed == $patterncount) {
+            $filehits++;    ### Increment FILE-SPECIFIC hit counter      
+            $totalhits++;   ### Increment OVERALL hit counter
+            if (defined $highlight) { print "$line1$match_line"; }
+            else { print "$line1$line2"; }
+        }
+    }
+                
+    if(defined $output and not defined $quiet) {            ### Print file-specific stats, unless quiet mode
+        if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; } 
+        elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; }                                   
+        elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" }
+        $filehits = 0;     ### Reset file-specific hit counter after stats are printed
+    }
+    elsif(not defined $output and not defined $quiet) {
+        if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; } 
+        elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; }                                   
+        elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" }
+        $filehits = 0;     ### Reset file-specific hit counter after stats are printed
+    }
+}
+
+
+### Print a summary after script completion, unless quiet mode
+unless(defined $quiet) {
+	my $duration = time - $start_time;
+	
+	print "\n\n--------------------------------------------------------------------------------\n";
+	print " SEARCH RESULT SUMMARY\n";
+	print "--------------------------------------------------------------------------------\n\n";
+	
+	if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; }
+	elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; }
+	foreach $subarray (@superarray) { print "$subarray->[0]\n"; }
+	
+	print "\nacross the following $inputcount input files:\n";
+	print join("\n", @inputlist);
+	
+	if($totalhits > 1) { print "\nOverall, $totalhits sequences were matched.\n"; } 
+        elsif($totalhits == 1) { print "\nOverall, 1 sequence was matched.\n"; }
+	elsif($totalhits == 0) { print "\nDid not find any matches.\n"; }
+	
+	if($duration == 1) { print "\nYour search took 1 second.\n\n"; }
+	else { print "\nYour search took $duration seconds.\n\n"; }
+	
+	print "--------------------------------------------------------------------------------\n";
+	print "--------------------------------------------------------------------------------\n\n";
+}