Mercurial > repos > fastaptamer > fastaptamer_search
changeset 0:7a29d36b8f85 draft
Uploaded
author | fastaptamer |
---|---|
date | Wed, 14 Jan 2015 19:08:15 -0500 |
parents | |
children | 119f3b536f60 |
files | fastaptamer_search |
diffstat | 1 files changed, 234 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaptamer_search Wed Jan 14 19:08:15 2015 -0500 @@ -0,0 +1,234 @@ +#!/usr/bin/env perl + +# Last Modified January 11th, 2015 17:09 CST + +use Getopt::Long; # Core Perl module for command line arguments/options + +############################################################################### + + ### Variables for command line arguments +my @inputlist; # Array of input files +my $output; # Output filename (optional) +my @patternlist; # Array of pattern(s) to search for +my $highlight; # If defined, highlight matches using parens +my $help; # If defined, show help dialogue +my $quiet; # If defined, suppress report in STDOUT +my $version; # display version + +### Process command line arguments +GetOptions ( + "input=s" => \@inputlist, # input file(s) + "output=s" => \$output, # output file (optional) + "pattern=s" => \@patternlist, # pattern(s) to search for + "highlight" => \$highlight, # highlight matched portion with parens + "quiet" => \$quiet, # suppressing summary report + "version" => \$version, # display version + "help" => \$help); # displaying help information + +if(defined $help) { ### Prints help screen if -help is invoked + print <<"HELP"; + +-------------------------------------------------------------------------------- + FASTAptamer-Search +-------------------------------------------------------------------------------- + +Usage: fastaptamer_search [-i INFILE] [-o OUTFILE] [-p PATTERN] + + [-help] = Show help screen and exit. + [-i FILENAME] = Input filename; can be used multiple times. + [-p PATTERN] = Sequence pattern to search for; can be used + multiple times. + [-o FILENAME] = Output file for search results. + If none given, output goes to STDOUT. + [-highlight] = "Highlight" matched portion in parentheses. + [-q] = Suppress summary report. + [-v] = Display version. + +FASTAptamer-Search allows users to search for specific patterns within one or m- +ore sequence files. + +To search through more than one input file, simply use the -i flag multiple tim- +es. All input files must use FASTA format. + +Similarly, to search for multiple patterns simultaneously, use the -p flag as m- +any times as needed. When searching for multiple patterns, note that partial ma- +tches are not returned. For example, entering the following command: + +fastaptamer_search -i FILE1 -i FILE2 -p ATTGCC -p TGGCAT + +would search FILE1 and FILE2 for any sequences containing both ATTGCC and TGGCAT + +Patterns and input sequence data are case insensitive, and T/U are interchangea- +ble. In addition to single bases, patterns can include any of the degenerate ba- +se symbols from IUPAC nucleic acid notation: + + A/T/G/C/U single bases + + R puRines (A/G) + Y pYrimidines (C/T) + W Weak (A/T) + S Strong (G/C) + M aMino (A/C) + K Keto (G/T) + + B not A + D not C + H not G + V not T + + N aNy base (not a gap) + +For greater visibility, pattern matches can be highlighted by parentheses in the +output by calling the -highlight flag. + +A summary report is generated after each file's search results and after search +completion. To suppress these reports, enable quiet mode using the [-quiet] flag + +HELP +exit; + +} + +if (defined $version){ ## Print version screen if -v is true + print <<"VERSION"; + +FASTAptamer v1.0 + +VERSION +exit; +} + +######################################################################################## + +unless(@inputlist) { die "\nNo input file was specified.\nSee help documentation [-help] for program usage.\n\n"; } + +unless(@patternlist) {die "\nNo search pattern was specified.\nSee help documentation [-help] for program usage.\n\n"; } + +my $start_time = time; ### Start timer + +my $inputcount = scalar(@inputlist); +my $patterncount = scalar(@patternlist); + +### Convert user-entered patterns into regexes +my @superarray; ### Array of 2-element arrays via references: ([pattern, regex], [...], etc.) +my $regex; +foreach my $pattern (@patternlist) { + $regex = $pattern; + $regex =~ s{[TU]}{\[TU\]}gi; # Make T and U interchangeable + $regex =~ s{W}{\[ATU\]}gi; # W = ATU Regex modifiers after s{}{}: + $regex =~ s{S}{\[CG\]}gi; # S = CG g tells the script to find ALL matches rather than stop after the first + $regex =~ s{M}{\[AC\]}gi; # M = AC i makes searching case insensitive + $regex =~ s{K}{\[GTU\]}gi; # K = GTU + $regex =~ s{R}{\[AG\]}gi; # R = AG + $regex =~ s{Y}{\[CTU\]}gi; # Y = CTU + $regex =~ s{B}{\[CGTU\]}gi; # B = CGTU + $regex =~ s{D}{\[AGTU\]}gi; # D = AGTU + $regex =~ s{H}{\[ACTU\]}gi; # H = ACTU + $regex =~ s{V}{\[ACG\]}gi; # V = ACG + $regex =~ s{N}{\[ACGTU\]}gi; # N = ACGTU + ### Put the pattern and regex into an array and push the whole thing onto @superarray + push(@superarray, [$pattern, $regex]); # Push $subarray onto the end of @superarray + } + +my $fh_out; +if(defined $output) { open($fh_out, '>', $output) or die "\nCould not open output file.\nSee help documentation [-help] for program usage.\n\n"; } + +my $input; +my $current_input = 0; +my $subarray; +my ($line1, $line2); +my $match_hits = 0; +my $match_line = ''; +my $match_portion = ''; +my $filehits = 0; +my $totalhits = 0; + +### Loop logic: read one input file at a time and search all patterns through each line + + + +### FASTA input + +foreach $input (@inputlist) { + $current_input++; + if(defined $output) { + print $fh_out "\n\n--------------------------------------------------------------------------------\n"; + print $fh_out "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; + print $fh_out "--------------------------------------------------------------------------------\n\n"; + } + else { + print "\n\n--------------------------------------------------------------------------------\n"; + print "SEARCH RESULTS FOR INPUT FILE $current_input: $input\n"; + print "--------------------------------------------------------------------------------\n\n"; + } + open(my $fh_in, '<', $input) or die "Could not open input file $input"; + + while($line1 = <$fh_in>) { + $line2 = <$fh_in>; + my $not_first_regex = 0; + my $hit_confirmed = 0; + foreach $subarray (@superarray) { + $regex = $subarray->[1]; ### Get the regex from subarray + ### $1 + if($line2 =~ m{($regex)}gi) { ### Search for all matches, case insensitive + $match_portion = $1; ### Portion of sequence that matched was captured in magic variable $1 + if($not_first_regex == 0) { ($match_line = $line2) =~ s{$match_portion}{\($match_portion\)}g; } + else { $match_line =~ s{$match_portion}{\($match_portion\)}g; } + $not_first_regex = 1; + $hit_confirmed++; + } + } + if(defined $output and $hit_confirmed == $patterncount) { + $filehits++; ### Increment FILE-SPECIFIC hit counter + $totalhits++; ### Increment OVERALL hit counter + if (defined $highlight) { print $fh_out "$line1$match_line"; } + else { print $fh_out "$line1$line2"; } + } + elsif(not defined $output and $hit_confirmed == $patterncount) { + $filehits++; ### Increment FILE-SPECIFIC hit counter + $totalhits++; ### Increment OVERALL hit counter + if (defined $highlight) { print "$line1$match_line"; } + else { print "$line1$line2"; } + } + } + + if(defined $output and not defined $quiet) { ### Print file-specific stats, unless quiet mode + if ($filehits > 1) { print $fh_out "\nMatched $filehits sequences in file $input.\n"; } + elsif ($filehits == 1) { print $fh_out "\nMatched 1 sequence in file $input.\n"; } + elsif ($filehits == 0) { print $fh_out "\nDid not match any sequences in file $input.\n" } + $filehits = 0; ### Reset file-specific hit counter after stats are printed + } + elsif(not defined $output and not defined $quiet) { + if ($filehits > 1) { print "\nMatched $filehits sequences in file $input.\n"; } + elsif ($filehits == 1) { print "\nMatched 1 sequence in file $input.\n"; } + elsif ($filehits == 0) { print "\nDid not match any sequences in file $input.\n" } + $filehits = 0; ### Reset file-specific hit counter after stats are printed + } +} + + +### Print a summary after script completion, unless quiet mode +unless(defined $quiet) { + my $duration = time - $start_time; + + print "\n\n--------------------------------------------------------------------------------\n"; + print " SEARCH RESULT SUMMARY\n"; + print "--------------------------------------------------------------------------------\n\n"; + + if($patterncount > 1) { print "Searched for $patterncount patterns:\n"; } + elsif($patterncount == 1) { print "Searched for 1 pattern:\n"; } + foreach $subarray (@superarray) { print "$subarray->[0]\n"; } + + print "\nacross the following $inputcount input files:\n"; + print join("\n", @inputlist); + + if($totalhits > 1) { print "\nOverall, $totalhits sequences were matched.\n"; } + elsif($totalhits == 1) { print "\nOverall, 1 sequence was matched.\n"; } + elsif($totalhits == 0) { print "\nDid not find any matches.\n"; } + + if($duration == 1) { print "\nYour search took 1 second.\n\n"; } + else { print "\nYour search took $duration seconds.\n\n"; } + + print "--------------------------------------------------------------------------------\n"; + print "--------------------------------------------------------------------------------\n\n"; +}