# HG changeset patch # User mingchen0919 # Date 1541794332 18000 # Node ID dad0996ce0b791069ae76ce0126a6a622faaaf7e # Parent 6da16f4e1e3b482aa9c4a8d3fc98b831dc3f595c planemo upload commit 0ab80de699747ead439fcaccfdd73f66066affc1-dirty diff -r 6da16f4e1e3b -r dad0996ce0b7 findSSRs_altered.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/findSSRs_altered.pl Fri Nov 09 15:12:12 2018 -0500 @@ -0,0 +1,1107 @@ +#!/usr/bin/env perl +################################################################################ +# Author: Meg Staton & Stephen Ficklin +# +# DESCRIPTION +# ----------- +# This script identifies simple sequence repeats (SSRs) and calls primers from +# the sequences in a fastq formatted file. +# +# Dependencies: +# ------------ +# Perl must have access to the packages: +# Getopt::Long +# Bio::SeqIO +# Excel::Writer::XLSX +# All are available from CPAN. +# +# Also path to the primer3 executable and primer3 config files must be specified +# in the global variables section of the script. +# +# Usage: +# ----- +# Usage: findSSRs.pl +# +# The list of arguments includes: +# +# -f|--fasta_file +# Required. The file of the sequences to be searched. +# +# -m|--masked_file +# Required. A soft-masked version of the fasta file (soft masked means low +# complexity sequences are in lower case bases.) +# +# Output: +# ------ +# .ssr.fasta +# A fasta file with sequences with a SSR. (Sequences with compound SSRs are included) +# +# .ssr_stats.txt +# A text file of statistics about the SSRs discovered. +# +# .ssr_report.txt +# A tab-delimited file with each SSR. The columns are SSR ID, +# motif, number of repeats, start position, end position. +# +# .ssr_report.xlsx +# A excel file with SSR results and stats +# +# .di_primer_report.txt +# .tri_primer_report.txt +# .tetra_primer_report.txt +# Tab-delimited files with sequences with a specified SSR motif length. Columns are +# SSR ID, motif, number of repeats, start position, end position, left primer, +# right primer, left primer Tm, right primer Tm, amplicon size +# +# Details: +# ------- +# By default the script finds: +# 2 bp motifs repeated from 8 to 200 times, +# 3 bp motifs repeated from 7 to 133 times, +# 4 bp motifs repeated from 6 to 100 times, +# +# These parameters may be changed in the "GLOBAL PARAMETERS" part of +# the script. +# +# Compound SSRs are defined as any SSRs that abut or are less than 15 bases +# apart. These are essentially compound SSRs for the purposes of mapping +# because it is unlikely that primers can be designed between the repeat +# segments. +# + + +use strict; + +#------------------------------------------------------------------------------- +# DEPENDENCIES +#------------------------------------------------------------------------------- + +use Getopt::Long; +use Bio::SeqIO; +use Excel::Writer::XLSX; + +#------------------------------------------------------------------------------- +# GLOBAL PARAMETERS +#------------------------------------------------------------------------------- + +#-------------- +# REPEAT IDENTIFICATION PARAMETERS +# Specify Motif Frequency +# Motifs that occur less frequently than indicated below will be ignored. +# A 0 indicates that this motif length will be ignored. + +our $MIN_REPS_2bp = 6; +our $MIN_REPS_3bp = 6; +our $MIN_REPS_4bp = 4; + +our $MAX_REPS_2bp = 20; +our $MAX_REPS_3bp = 20; +our $MAX_REPS_4bp = 20; + +#------------ +# PRIMER PARAMETERS + +# my $PRIMER3 = "/staton/software/primer3-2.3.6/src/primer3_core"; +# my $PRIMER3_CONFIG = "/staton/software/primer3-2.3.6/src/primer3_config/"; + +my $PRIMER_OPT_SIZE="21"; # default 20 +my $PRIMER_MIN_SIZE="18"; # default 18 +my $PRIMER_MAX_SIZE="27"; # default 27 + +my $PRIMER_NUM_NS_ACCEPTED = "0"; # default 0 + +my $PRIMER_PRODUCT_SIZE_RANGE = "100-400"; + +my $PRIMER_OPT_TM = "60.0"; +my $PRIMER_MIN_TM = "55.0"; +my $PRIMER_MAX_TM = "65.0"; + +my $PRIMER_MIN_GC = "40"; +my $PRIMER_MAX_GC = "60"; + +my $PRIMER_MAX_POLY_X = "3"; +my $PRIMER_GC_CLAMP = "2"; + +my $PRIMER_LOWERCASE_MASKING = 1; + +#------------------------------------------------------------------------------- +# GLOBAL HASHES +#------------------------------------------------------------------------------- +# This makes life much easier than passing a bunch of hash refs all over the place. +# +#------------------------------------------------------------------------------- +# Data structures: + +## CONTIG_SSR_STARTS structure: +## key: contig_name +## value: array of starts of SSRs in that contig +my %CONTIG_SSR_STARTS = (); + +## SSR_STATS structure: +## key: ssr_id +## value -> keys: MOTIF START END MOTIF_LENGTH NO_REPEATS +my %SSR_STATS = (); +my $SEQ_COUNT = 0; + +# Set up the Motif specifications, based on the chosen motif types:. +my @MOTIF_SPECS; +push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']); +push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']); +push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']); + +#------------------------------------------------------------------------------- +# Generating statistics +my $TIME = scalar localtime; # Get the current time +my $SEQ_w_SSRS = 0; +my $SSR_COUNT = 0; +my $SSR_COUNT_COMPOUND = 0; +my $SSR_COUNT_PRIMER = 0; + +my %MOTIFLEN = (2 => 0, + 3 => 0, + 4 => 0); + +my %MOTIFLEN_w_PRIMERS = (2 => 0, + 3 => 0, + 4 => 0); +my %MOTIFS = ('|AT|TA|' => 0, + '|AG|GA|CT|TC|' => 0, + '|AC|CA|TG|GT|' => 0, + '|GC|CG|' => 0, + + '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0, + '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0, + '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0, + + '|CCA|CAC|ACC|TGG|GTG|GGT|' => 0, + '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0, + '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0, + + '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0, + '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0, + '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0, + '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0); + + + +#------------------------------------------------------------------------------- +# EXECUTE +#------------------------------------------------------------------------------- +main(); +#------------------------------------------------------------------------------- +# PUBLIC SUBROUTINES +#------------------------------------------------------------------------------- +# Function Name: main() + +sub main{ + + ##--------------------------------------------------------------- + ## Get input parameters + my $fasta_file; + my $masked_file; + my $project; + + Getopt::Long::Configure ('bundling'); + GetOptions('f|fasta_file=s' => \$fasta_file, + 'm|masked_file=s' => \$masked_file, + 'p|project=s' => \$project); + + ## Check that all required parameters have been included + if(!$fasta_file){ print "A fasta file is required.\n"; _printUsage(); exit;} + if(!$masked_file){ print "A masked file is required.\n"; _printUsage(); exit;} + + ## Check that fasta file exists + if(! -e $fasta_file) { print "Fasta file $fasta_file does not exist\n"; exit; } + if(! -e $masked_file) { print "Masked file $masked_file does not exist\n"; exit; } + + ##--------------------------------------------------------------- + ## Set up output files + my $p3_input = "$fasta_file.p3in.txt"; + my $p3_output = "$fasta_file.p3out.txt"; + my $ssr_out = "$fasta_file.ssr_report.txt"; + my $fasta_out = "$fasta_file.ssr.fasta"; + my $stats_out = "$fasta_file.ssr_stats.txt"; + my $di_primer_out = "$fasta_file.di_primer_report.txt"; + my $tri_primer_out = "$fasta_file.tri_primer_report.txt"; + my $tetra_primer_out = "$fasta_file.tetra_primer_report.txt"; + my $ssr_xlsx = "$fasta_file.ssr_report.xlsx"; + + ##--------------------------------------------------------------- + print "finding SSRs...\n"; + process_file($fasta_file, $masked_file); + collapse_compound_ssrs(); + flag_multiSSRs(); + print "done.\n"; + + ##--------------------------------------------------------------- + print "running primer3...\n"; + addToPrimer3InputFile ($p3_input); + print "primer3_core < $p3_input > $p3_output\n"; + my $status = system("primer3_core < $p3_input > $p3_output"); + print "$status\n"; + parseP3_output($p3_output); + print "done.\n"; + + ##--------------------------------------------------------------- + ## Producing output - Fasta files and flat files + print "printing output files..."; + create_flat_files($ssr_out, $di_primer_out, $tri_primer_out, $tetra_primer_out); + create_fasta_file($fasta_out); + + ##--------------------------------------------------------------- + ## Producing output - statistics + + calculate_stats(); + print_stats($stats_out); + + ##--------------------------------------------------------------- + ## Producing output - Excel + create_excel_file($ssr_xlsx, $project); + print "done.\n"; + +} + +############################################################### + +sub process_file{ + my $fasta_file = $_[0]; # file name + my $masked_file = $_[1]; # file name + + my $seqio = Bio::SeqIO->new('-format' => 'fasta', -file => $fasta_file); + my $seqioM = Bio::SeqIO->new('-format' => 'fasta', -file => $masked_file); + + # Get seq obj from io stream + while(my $seqobj = $seqio->next_seq){ + my $seqobjM = $seqioM->next_seq; + + $SEQ_COUNT++; + + my $seqname = $seqobj->id; # get actual sequence as a string + my $seqnameM = $seqobjM->id; # get actual sequence as a string + + my $seqstr = $seqobj->seq(); # get actual sequence as a string + my $seqstrM = $seqobjM->seq(); # get actual sequence as a string + + if($seqname ne $seqnameM){ + die "masked sequence $seqnameM not in same order as regular sequence $seqname\n"; + } + + process_seq($seqname, $seqstr, $seqstrM); + } + +} + +############################################################### + +sub process_seq{ + my $contig_name = shift; + my $seq = shift; + my $seq_masked = shift; + + + my %seen; # used to keep track of start positions we've already seen + my $index; # used to iterate through the 2D motif specs array + + ## LOOPB + # iterate through the motif specifications + for $index (0 .. (scalar @MOTIF_SPECS - 1)) { + + # holds the motif size (1,2,3,4,5 or 6) + my $motifLength = $MOTIF_SPECS[$index][0]; + # holds the minimum number of repeats + my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1; + # holds the maximum number of repeats + my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1; + # the regular expression for looking for SSRs + my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,$max_number_of_repeats})"; + + # run through the sequence and check for this motif spec + while ($seq =~ /$regex/ig) { + # Get the ssr and motif that were found + my $ssr = $1; + my $motif = lc $2; + my $start_index = $-[0]; + my $end_index = $+[0]; + + if(quality_check_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq)){ + process_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq, $seq_masked); + + } + + } + } +} + + + +############################################################### +sub quality_check_ssr{ + my $contig_name = shift; + my $ssr = shift; + my $motif = shift; + my $start_index = shift; + my $end_index = shift; + my $seq = shift; + + ##------------------------------------- + ## CHECKS to see if this is a good ssr + my $flag_same_base = 0; + my $flag_already_seen = 0; + + ## Check #1 + ## ignore SSRs that are the same base repeated + if ($ssr !~ /^g+$/i && + $ssr !~ /^a+$/i && + $ssr !~ /^c+$/i && + $ssr !~ /^t+$/i ) { + $flag_same_base = 1; + } + + # Check #2 + # Make sure this isn't an already called SSR in disguise + # (a dinucleotide repeat posing as a tetranucleotide repeat, for instance) + if (!exists $SSR_STATS{$contig_name."_ssr".$start_index} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index-1)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index-2)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index+1)} && + !exists $SSR_STATS{$contig_name."_ssr".($start_index+2)} + ) { + $flag_already_seen = 1; + } + + if($flag_same_base && $flag_already_seen){ + return 1; + } + else{ + return 0; + } + +} + + + +###################################################### +sub process_ssr{ + my $contig_name = shift; + my $ssr = shift; + my $motif = shift; + my $start_index = shift; + my $end_index = shift; + my $seq = shift; + my $seq_masked = shift; + + ##------------------------------------- + ## generate a few more stats and variables + my $motif_len = length $motif; + my $num_of_repeats = ($end_index-$start_index)/$motif_len; + my $ssr_id = $contig_name."_ssr".$start_index; + + ##------------------------------------- + ## store in data structures + + if(exists $CONTIG_SSR_STARTS{$contig_name}){ + push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index; + } + else{ + $CONTIG_SSR_STARTS{$contig_name} = [$start_index]; + } + + $SSR_STATS{$ssr_id}{MOTIF} = $motif; + $SSR_STATS{$ssr_id}{START} = $start_index; + $SSR_STATS{$ssr_id}{END} = $end_index; + $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motif_len; + $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats; + $SSR_STATS{$ssr_id}{SEQ} = $seq; + $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; + $SSR_STATS{$ssr_id}{COMPOUND} = 0; #assume its not until proven otherwise + + +} + +####################################################### +sub collapse_compound_ssrs{ + foreach my $contig (keys %CONTIG_SSR_STARTS){ + my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; + if(@starts > 1){ + ## this contig has multiple ssrs + my $previous_start = -1; + my $previous_end = -1; + my $previous_ssr_id = ""; + + foreach my $current_start (sort {$a <=> $b} @starts){ + my $current_ssr_id = $contig."_ssr".$current_start; + my $current_end = $SSR_STATS{$current_ssr_id}{END}; + + if(too_close($previous_start, $previous_end, $current_start, $current_end)){ + collapse($contig, $previous_ssr_id, $current_ssr_id); + } + + $previous_start = $current_start; + $previous_end = $current_end; + $previous_ssr_id = $current_ssr_id; + } + } + } +} + +################################################################ +sub too_close{ + my $previous_start = shift; + my $previous_end = shift; + my $current_start = shift; + my $current_end = shift; + + # if start is a -1, then go ahead and return ok + if($previous_start == -1){ + return 0; + } + # we want to know if they overlap, abut or are less than 15 bases apart + elsif($previous_end >= $current_start || + ($current_start - $previous_end) < 15){ + #print "$previous_start - $previous_end, $current_start - $current_end\n"; + return 1; + } + else{ + return 0; + } +} +################################################################ +sub collapse{ + my $contig = shift; + my $first_ssr_id = shift; + my $second_ssr_id = shift; + my $second_ssr_start = $SSR_STATS{$second_ssr_id}{START}; + + ##fix SSR_STATS + $SSR_STATS{$first_ssr_id}{MOTIF} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{END} = $SSR_STATS{$second_ssr_id}{END}; + $SSR_STATS{$first_ssr_id}{MOTIF_LENGTH} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{NO_REPEATS} = "COMPOUND"; + $SSR_STATS{$first_ssr_id}{COMPOUND} = 1; #assume its not until proven otherwise + + delete $SSR_STATS{$second_ssr_id}{MOTIF}; + delete $SSR_STATS{$second_ssr_id}{START}; + delete $SSR_STATS{$second_ssr_id}{END}; + delete $SSR_STATS{$second_ssr_id}{MOTIF_LENGTH}; + delete $SSR_STATS{$second_ssr_id}{NO_REPEATS}; + delete $SSR_STATS{$second_ssr_id}{SEQ}; + delete $SSR_STATS{$second_ssr_id}{SEQM}; + delete $SSR_STATS{$second_ssr_id}{COMPOUND}; + undef %{$SSR_STATS{$second_ssr_id}}; + delete $SSR_STATS{$second_ssr_id}; + + print "\tdeleting $second_ssr_id, part of compound ssr\n"; + + if(exists $SSR_STATS{$second_ssr_id}){ print "\t still exists\n";} + + ##fix CONTIG_SSR_STARTS + # get rid of the start index for the second ssr (it is now part of the + # first ssr) + + my $index = 0; + $index++ until $CONTIG_SSR_STARTS{$contig}[$index] == $second_ssr_start; + splice(@{$CONTIG_SSR_STARTS{$contig}}, $index, 1); + +} + + +################################################################ +sub flag_multiSSRs{ + # adds a MULTI flag to the data hash indicating if the + # ssr is the only one in the sequence or one of many + + foreach my $contig (keys %CONTIG_SSR_STARTS){ + my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; + if(@starts == 1){ + ## this contig has only one ssr + my $start_index = $starts[0]; + my $ssr_id = $contig."_ssr".$start_index; + $SSR_STATS{$ssr_id}{MULTI} = 0; + } + else{ + ## this contig has multiple ssrs + foreach my $start_index (@starts){ + my $ssr_id = $contig."_ssr".$start_index; + $SSR_STATS{$ssr_id}{MULTI} = 1; + } + } + } + close FASTA; + +} + +################################################################ +sub addToPrimer3InputFile{ + my $p3_file = shift; + + open OUT, ">$p3_file"; + + foreach my $ssr_id (keys %SSR_STATS){ + #skip compound SSRS + if($SSR_STATS{$ssr_id}{COMPOUND} == 0){ + my $ssrStart = $SSR_STATS{$ssr_id}{START}; + my $ssrEnd = $SSR_STATS{$ssr_id}{END}; + my $seq = $SSR_STATS{$ssr_id}{SEQM}; + + # change from soft mask to hard mask + $seq =~ s/[actg]/N/g; + + my $len = $ssrEnd-$ssrStart; + + printf OUT ("SEQUENCE_ID=$ssr_id\n"); + printf OUT ("SEQUENCE_TEMPLATE=$seq\n"); + printf OUT ("SEQUENCE_TARGET=$ssrStart,$len\n"); + printf OUT ("PRIMER_TASK=generic\n"); + printf OUT ("PRIMER_PICK_LEFT_PRIMER=1\n"); + printf OUT ("PRIMER_PICK_INTERNAL_OLIGO=0\n"); + printf OUT ("PRIMER_PICK_RIGHT_PRIMER=1\n"); + printf OUT ("PRIMER_OPT_SIZE=$PRIMER_OPT_SIZE\n"); + printf OUT ("PRIMER_MIN_SIZE=$PRIMER_MIN_SIZE\n"); + printf OUT ("PRIMER_MAX_SIZE=$PRIMER_MAX_SIZE\n"); + printf OUT ("PRIMER_NUM_NS_ACCEPTED=$PRIMER_NUM_NS_ACCEPTED\n"); + printf OUT ("PRIMER_PRODUCT_SIZE_RANGE=$PRIMER_PRODUCT_SIZE_RANGE\n"); + printf OUT ("PRIMER_OPT_TM=$PRIMER_OPT_TM\n"); + printf OUT ("PRIMER_MIN_TM=$PRIMER_MIN_TM\n"); + printf OUT ("PRIMER_MAX_TM=$PRIMER_MAX_TM\n"); + printf OUT ("PRIMER_MIN_GC=$PRIMER_MIN_GC\n"); + printf OUT ("PRIMER_MAX_GC=$PRIMER_MAX_GC\n"); + printf OUT ("PRIMER_MAX_POLY_X=$PRIMER_MAX_POLY_X\n"); + printf OUT ("PRIMER_GC_CLAMP=$PRIMER_GC_CLAMP\n"); + # printf OUT ("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=$PRIMER3_CONFIG\n"); + printf OUT ("PRIMER_LOWERCASE_MASKING=$PRIMER_LOWERCASE_MASKING\n"); + printf OUT ("=\n"); + } + } + close OUT; +} +################################################################ +sub parseP3_output{ + my $p3_output = $_[0]; # file name + + # We are going to keep track of a weird phenomenon only seen in one + # project - the generation of identical forward and reverse primers. The + # sequences from this project were overlapping paired ends that were joined. + # Apparently something went wrong and weird sequences were obtained, all of + # which yield the identical primers. + # This is not reported in the final stats, just as part of the standard output. + my $identical_primer_cnt = 0; + + # The primers output file separates information about different sequences + # with an equal sign on a single line. So, we want to set the file line + # delimiter (for looping on the input file below) to a single equal sign + # followed by a line feed. This way were guranteed to have all the primer + # information together per line + local $/ = "=\n"; + + open (P3O, $p3_output) || die "could not open $_\n"; + + # Read in all of the lines of the input file + my $primer_record; + while ($primer_record = ) { + my $start = ""; + my $seq_id = ""; + my $ssr_id = ""; + my $forward = ""; + my $reverse = ""; + my $product_size = ""; + my $left_tm = ""; + my $right_tm = ""; + + if ($primer_record =~ /SEQUENCE_ID=(\S+)/) { + $ssr_id = $1; + } + # get the primary primers only + if ($primer_record =~ /PRIMER_LEFT_0_SEQUENCE=(\S+)/) { + $forward = $1; + } + if ($primer_record =~ /PRIMER_RIGHT_0_SEQUENCE=(\S+)/) { + $reverse = $1; + } + if ($primer_record =~ /PRIMER_LEFT_0_TM=(\S+)/) { + $left_tm = $1; + } + if ($primer_record =~ /PRIMER_RIGHT_0_TM=(\S+)/) { + $right_tm = $1; + } + if ($primer_record =~ /PRIMER_PAIR_0_PRODUCT_SIZE=(\S+)/) { + $product_size = $1; + } + + if(length $forward > 1){ + if($forward eq $reverse){ + print "\tFLAG: identical primer problem with $ssr_id\n"; + $identical_primer_cnt++; + } + else{ + $SSR_STATS{$ssr_id}{FORWARD} = $forward; + $SSR_STATS{$ssr_id}{REVERSE} = $reverse; + $SSR_STATS{$ssr_id}{PRODUCT_SIZE} = $product_size; + $SSR_STATS{$ssr_id}{LEFT_TM} = $left_tm; + $SSR_STATS{$ssr_id}{RIGHT_TM} = $right_tm; + + } + } + } + + print "\ttotal identical primers: $identical_primer_cnt\n"; +} + +################################################### +sub create_flat_files{ + my $ssr_out = shift; + my $di_primer_out = shift; + my $tri_primer_out = shift; + my $tetra_primer_out = shift; + + open OUTS, ">$ssr_out"; + open OUT2, ">$di_primer_out"; + open OUT3, ">$tri_primer_out"; + open OUT4, ">$tetra_primer_out"; + + my $di_fh = *OUT2; + my $tri_fh = *OUT3; + my $tetra_fh = *OUT4; + + ##printer headers + print OUTS join("\t", "SSR ID", + "motif", "number of repeats", "start position", "end position"); + print OUTS "\n"; + + print OUT2 join("\t", "SSR ID", + "motif", "number of repeats", "start position", + "end position", "forward primer", "reverse primer", + "forward Tm", "reverse Tm","product size" ); + print OUT2 "\n"; + + print OUT3 join("\t", "SSR ID", + "motif", "number of repeats", "start position", + "end position", "forward primer", "reverse primer", + "forward Tm", "reverse Tm","product size" ); + print OUT3 "\n"; + + print OUT4 join("\t", "SSR ID", + "motif", "number of repeats", "start position", + "end position", "forward primer", "reverse primer", + "forward Tm", "reverse Tm","product size" ); + print OUT4 "\n"; + + foreach my $ssr_id (keys %SSR_STATS){ + ## all ssrs including compound go in main ssr file + print OUTS join("\t", + $ssr_id, + $SSR_STATS{$ssr_id}{MOTIF}, + $SSR_STATS{$ssr_id}{NO_REPEATS}, + $SSR_STATS{$ssr_id}{START}, + $SSR_STATS{$ssr_id}{END}, + ); + print OUTS "\n"; + + # for primer flat files, only print SSRs with + # that have primers + if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && + $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ + ){ + if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ + _print_primer_flat_file_line($di_fh, $ssr_id); + } + elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ + _print_primer_flat_file_line($tri_fh, $ssr_id); + } + elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ + _print_primer_flat_file_line($tetra_fh, $ssr_id); + } + } + } + close OUTS; + close OUT2; + close OUT3; + close OUT4; + +} +################################################### +sub _print_primer_flat_file_line{ + my $fh = shift; + my $ssr_id = shift; + + print $fh join("\t", $ssr_id, + $SSR_STATS{$ssr_id}{MOTIF}, + $SSR_STATS{$ssr_id}{NO_REPEATS}, + $SSR_STATS{$ssr_id}{START}, + $SSR_STATS{$ssr_id}{END}, + $SSR_STATS{$ssr_id}{FORWARD}, + $SSR_STATS{$ssr_id}{REVERSE}, + $SSR_STATS{$ssr_id}{LEFT_TM}, + $SSR_STATS{$ssr_id}{RIGHT_TM}, + $SSR_STATS{$ssr_id}{PRODUCT_SIZE}, + ); + print $fh "\n"; + +} + +################################################### +sub create_fasta_file{ + my $fasta_out = shift; + open FASTA, ">$fasta_out"; + + foreach my $contig (keys %CONTIG_SSR_STARTS){ + my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; + my $seq = ""; + print FASTA ">$contig ("; + foreach my $start_index (sort {$a <=> $b} @starts){ + my $ssr_id = $contig."_ssr".$start_index; + + #if($SSR_STATS{$ssr_id}{START} >= 0){ + $seq = $SSR_STATS{$ssr_id}{SEQ}; + print FASTA "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} "; + if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ + print FASTA "*Compound "; + } + } + #get the first ssr index just so we can get the sequence + print FASTA ")\n"; + print FASTA "$seq\n"; + + } + close FASTA; + +} +################################################################ +sub calculate_stats{ + + $SEQ_w_SSRS = keys %CONTIG_SSR_STARTS; + + foreach my $ssr_id (keys %SSR_STATS){ + $SSR_COUNT++; + + if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ + $SSR_COUNT_COMPOUND++; + } + else{ + my $motif_len = $SSR_STATS{$ssr_id}{MOTIF_LENGTH} ; + #print "motif length is $motif_len\n"; + $MOTIFLEN{$motif_len}++; + + my $motifUC = uc($SSR_STATS{$ssr_id}{MOTIF}); + foreach my $group (keys %MOTIFS) { + if($group =~ /\|$motifUC\|/){ + #print "Incrementing $group for $motifUC\n"; + $MOTIFS{$group}++; + } + } + + if($SSR_STATS{$ssr_id}{FORWARD} =~ /\S/){ + $SSR_COUNT_PRIMER++; + $MOTIFLEN_w_PRIMERS{$motif_len}++; + } + } + } + +} +sub print_stats{ + my $stats_out = $_[0]; # file name + + open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n"; + + print OUTS 'SSR Summary Report\n'; + print OUTS "Analysis of $SEQ_COUNT sequences\n"; + print OUTS "$TIME\n"; + print OUTS "\n"; + print OUTS "Number of sequences with at least one SSR\t$SEQ_w_SSRS\n"; + print OUTS "Number of SSRs identified\t$SSR_COUNT\n"; + print OUTS "\n"; + print OUTS "Number of compound SSRs*: $SSR_COUNT_COMPOUND\n"; + print OUTS "Number of SSRs with primers**: $SSR_COUNT_PRIMER\n"; + print OUTS "\n"; + print OUTS "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"; + print OUTS "**No primers are designed for compound SSRs\n"; + print OUTS "\n"; + print OUTS "Parameters used for identifying SSRS:\n"; + print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n"; + print OUTS "--------------------------------------\n"; + print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n"; + print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n"; + print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n"; + print OUTS "\n"; + print OUTS "Chart of motif pattern frequence (compound SSRs excluded)\n"; + print OUTS "Motif Patterns\tNumber of SSRs Found\n"; + print OUTS "--------------------------------------\n"; + my $group; + foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ + $group =~ s/^|//; + $group =~ s/|$//; + print OUTS "$group\t$MOTIFS{$group}\n"; + } + print OUTS "\n"; + print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; + print OUTS "Motif Pattern Length\tNumber of SSRs\n"; + print OUTS "--------------------------------------\n"; + + foreach $group (sort keys %MOTIFLEN){ + print OUTS "$group\t$MOTIFLEN{$group}\n"; + } + + print OUTS "\n"; + print OUTS "SSRS with Primers \n"; + print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; + print OUTS "Motif Pattern Length\tNumber of SSRs\n"; + print OUTS "--------------------------------------\n"; + + foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ + print OUTS "$group\t$MOTIFLEN_w_PRIMERS{$group}\n"; + } + + + close OUTS; + +} + +################################################################ + +sub create_excel_file{ + my $ssr_xlsx = shift; + my $project = shift; + + # Create an excel workbook + my $workbook = Excel::Writer::XLSX->new("$ssr_xlsx"); + + # Setup the formats that will be necessary for the excel spreadsheet + my %header = (font => 'Calibri', + size => 12, + bold => 1, + color => 'black', + align => 'left', + text_wrap => 1); + + my %text = (font => 'Calibri', + size => 12, + color => 'black', + align => 'left', + text_wrap => 1); + + #add the formats to the workbook + my $header_format = $workbook->add_format(%header); + my $text_format = $workbook->add_format(%text); + + my $worksheet_stats = create_stats_worksheet($workbook, $header_format, $text_format, $project); + + build_data_worksheets($workbook, $header_format, $text_format); + + $worksheet_stats->activate(); + $worksheet_stats->select(); + $workbook->close(); + + +} + +sub create_stats_worksheet{ + my $workbook = shift; + my $header_format = shift; + my $text_format = shift; + my $project = shift; + + my $worksheet = $workbook->add_worksheet("Summary"); + + ## set all cells to text format + ## only cells that need the header format will need to specify the format during write + ## set column widths + $worksheet->set_column('A:A', 75, $text_format); + $worksheet->set_column('B:B', 30, $text_format); + $worksheet->set_column('C:C', 30, $text_format); + + $worksheet->write('A1', "SSR Summary Report for $project", $header_format); + $worksheet->write('A2', "Analysis of $SEQ_COUNT sequences"); + $worksheet->write('A3', "$TIME"); + + $worksheet->write('A5', "Number of sequences with at least one SSR"); + $worksheet->write('B5', "$SEQ_w_SSRS"); + + $worksheet->write('A6', "Number of SSRs identified"); + $worksheet->write('B6', "$SSR_COUNT"); + + $worksheet->write('A8', "Number of compound SSRs*:"); + $worksheet->write('B8', "$SSR_COUNT_COMPOUND"); + + $worksheet->write('A9', "Number of SSRs with primers**"); + $worksheet->write('B9', "$SSR_COUNT_PRIMER"); + + $worksheet->write('A11', "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"); + + $worksheet->write('A12', "**No primers are designed for compound SSRs\n"); + + $worksheet->write('A14', "Parameters used for identifying SSRS:\n", $header_format); + + $worksheet->write('A15','Base Pairs in Motif', $header_format); + $worksheet->write('B15','Min # Reps', $header_format); + $worksheet->write('C15','Max # Reps', $header_format); + + $worksheet->write('A16','2 (Dinucleotides)'); + $worksheet->write('B16',"$MIN_REPS_2bp"); + $worksheet->write('C16',"$MAX_REPS_2bp"); + + $worksheet->write('A17','3 (Trinucleotides)'); + $worksheet->write('B17',"$MIN_REPS_3bp"); + $worksheet->write('C17',"$MAX_REPS_3bp"); + + $worksheet->write('A18','4 (Tetranucleotides)'); + $worksheet->write('B18',"$MIN_REPS_4bp"); + $worksheet->write('C18',"$MAX_REPS_4bp"); + + ##---------------------------------------------------------- + ##Chart of motif pattern frequence (compound SSRs excluded) + + $worksheet->write('A20','Chart of motif pattern frequence (compound SSRs excluded)', $header_format); + $worksheet->write('A21','Motif Patterns', $header_format); + $worksheet->write('B21','Number of SSRs Found', $header_format); + my $group; + my $i = 21; + foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ + $group =~ s/^|//; + $group =~ s/|$//; + $i++; + $worksheet->write("A$i", $group); + $worksheet->write("B$i", $MOTIFS{$group}); + } + + ##---------------------------------------------------------- + ## Chart of motif pattern length frequence (compound SSRs excluded) + $i++; + $i++; + $worksheet->write("A$i", "Chart of motif pattern length frequence (compound SSRs excluded)", $header_format); + $i++; + $worksheet->write("A$i",'Motif Pattern Length', $header_format); + $worksheet->write("B$i",'Number of SSRs Found', $header_format); + foreach $group (sort keys %MOTIFLEN){ + $i++; + $worksheet->write("A$i", "$group bp"); + $worksheet->write("B$i", $MOTIFLEN{$group}); + } + + ##---------------------------------------------------------- + ## SSRs w primers: + ## Chart of motif pattern length frequence (compound SSRs excluded) + $i++; + $i++; + $worksheet->write("A$i",'SSRs with Primers', $header_format); + $i++; + $worksheet->write("A$i",'Chart of motif pattern length frequence (compound SSRs excluded)', $header_format); + $i++; + $worksheet->write("A$i",'Motif Pattern Length', $header_format); + $worksheet->write("B$i",'Number of SSRs Found', $header_format); + foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ + $i++; + $worksheet->write("A$i", "$group bp"); + $worksheet->write("B$i", $MOTIFLEN_w_PRIMERS{$group}); + } + + return $worksheet; +} + +############################################################## +sub build_data_worksheets{ + my $workbook = shift; + my $header_format = shift; + my $text_format = shift; + + my $di_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Dinucleotides"); + my $tri_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Trinucleotides"); + my $tetra_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Tetranucleotides"); + + my $di_index = 3; + my $tri_index = 3; + my $tetra_index = 3; + + foreach my $ssr_id (keys %SSR_STATS){ + # for excel data files, only print SSRs + # that have primers + if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && + $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ + ){ + if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ + _print_excel_file_line($di_worksheet, $di_index, $ssr_id); + $di_index++; + } + elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ + _print_excel_file_line($tri_worksheet, $tri_index, $ssr_id); + $tri_index++; + } + elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ + _print_excel_file_line($tetra_worksheet, $tetra_index, $ssr_id); + $tetra_index++; + } + } + } + + +} + +############################################################## +sub _initiate_worksheet{ + my $workbook = $_[0]; + my $header_format = $_[1]; + my $text_format = $_[2]; + my $name = $_[3]; + + my $worksheet = $workbook->add_worksheet($name); + $worksheet->set_column('A:A', 60, $text_format); + $worksheet->set_column('B:E', 10, $text_format); + $worksheet->set_column('F:G', 30, $text_format); + $worksheet->set_column('H:J', 10, $text_format); + + $worksheet->write('A1', "$name with primers", $header_format); + $worksheet->write('A2', 'SSR ID', $header_format); + $worksheet->write('B2', 'Motif', $header_format); + $worksheet->write('C2', '# Repeats', $header_format); + $worksheet->write('D2', 'Start', $header_format); + $worksheet->write('E2', 'End', $header_format); + $worksheet->write('F2', 'Forward Primer', $header_format); + $worksheet->write('G2', 'Reverse Primer', $header_format); + $worksheet->write('H2', 'Forward Tm', $header_format); + $worksheet->write('I2', 'Reverse Tm', $header_format); + $worksheet->write('J2', 'Fragment Size', $header_format); + + return $worksheet; +} + + +################################################################ +sub _print_excel_file_line{ + my $worksheet = shift; + my $index = shift; + my $ssr_id = shift; + + $worksheet->write("A$index", $ssr_id); + $worksheet->write("B$index", $SSR_STATS{$ssr_id}{MOTIF}); + $worksheet->write("C$index", $SSR_STATS{$ssr_id}{NO_REPEATS}); + $worksheet->write("D$index", $SSR_STATS{$ssr_id}{START}); + $worksheet->write("E$index", $SSR_STATS{$ssr_id}{END}); + $worksheet->write("F$index", $SSR_STATS{$ssr_id}{FORWARD}); + $worksheet->write("G$index", $SSR_STATS{$ssr_id}{REVERSE}); + $worksheet->write("H$index", $SSR_STATS{$ssr_id}{LEFT_TM}); + $worksheet->write("I$index", $SSR_STATS{$ssr_id}{RIGHT_TM}); + $worksheet->write("J$index", $SSR_STATS{$ssr_id}{PRODUCT_SIZE}); + +} + +############################################################### +sub _printUsage { + print "Usage: $0.pl "; + print qq( + The list of arguments includes: + + -f|--fasta_file + Required. The file of the sequences to be searched. + + -m|--masked_file + Required. A soft-masked version of the fasta file (soft masked means low + complexity sequences are in lower case bases.) + + -p|--project "project name" + Optional. A project name for use in the Excel output. + + ); + print "\n"; + return; +} + + +1; diff -r 6da16f4e1e3b -r dad0996ce0b7 galaxy_outputs.sh --- a/galaxy_outputs.sh Fri Nov 09 12:23:25 2018 -0500 +++ b/galaxy_outputs.sh Fri Nov 09 15:12:12 2018 -0500 @@ -11,6 +11,15 @@ fi -if [ -e `*.ssr.fasta` ]; then - cp `*.ssr.fasta` ${X_F} -fi \ No newline at end of file +if [ -e "input.ssr.fasta" ]; then + cp "input.ssr.fasta" ${X_F} +fi + +if [ -e "input.ssr_stats.txt" ]; then + cp "input.ssr_stats.txt" ${X_S} +fi + +if [ -e "input.ssr_report.txt" ]; then + cp "input.ssr_report.txt" ${X_R} +fi + diff -r 6da16f4e1e3b -r dad0996ce0b7 rmarkdown_report.Rmd --- a/rmarkdown_report.Rmd Fri Nov 09 12:23:25 2018 -0500 +++ b/rmarkdown_report.Rmd Fri Nov 09 15:12:12 2018 -0500 @@ -1,5 +1,5 @@ --- -title: 'Aurora Tool Report' +title: 'FindSSRs Report' output: html_document: highlight: pygments @@ -55,16 +55,17 @@ # Run `findSRRs_altered.pl` -* `${REPORT_FILES_PATH}`: the output directory that stores all analysis outputs. -* `${TOOL_INSTALL_DIR}`: the tool install directory where the `findSSRs_altered.pl` is located. -* `$X_f`: the path to the fasta file. -* `$X_m`: the path to the masked fasta file. - -```{bash} -cd ${REPORT_FILES_PATH} -perl ${TOOL_INSTALL_DIR}/findSSRs_altered.pl \ - -f $X_f \ - -m $X_m +```{bash echo=FALSE} +sh ${TOOL_INSTALL_DIR}/scripts_generator.sh ``` +```{r echo=FALSE,warning=FALSE,results='asis'} +# display content of the job-script.sh file. +cat('```bash\n') +cat(readLines(paste0(Sys.getenv('REPORT_FILES_PATH'), '/findSSRs.sh')), sep = '\n') +cat('\n```') +``` +* `-f`: the path to the fasta file. +* `-m`: the path to the masked fasta file. + diff -r 6da16f4e1e3b -r dad0996ce0b7 rmarkdown_report.xml --- a/rmarkdown_report.xml Fri Nov 09 12:23:25 2018 -0500 +++ b/rmarkdown_report.xml Fri Nov 09 15:12:12 2018 -0500 @@ -35,7 +35,7 @@ - +