Mercurial > repos > mingchen0919 > aurora_findssrs
view findSSRs_altered.pl @ 2:dad0996ce0b7 draft default tip
planemo upload commit 0ab80de699747ead439fcaccfdd73f66066affc1-dirty
author | mingchen0919 |
---|---|
date | Fri, 09 Nov 2018 15:12:12 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl ################################################################################ # Author: Meg Staton & Stephen Ficklin # # DESCRIPTION # ----------- # This script identifies simple sequence repeats (SSRs) and calls primers from # the sequences in a fastq formatted file. # # Dependencies: # ------------ # Perl must have access to the packages: # Getopt::Long # Bio::SeqIO # Excel::Writer::XLSX # All are available from CPAN. # # Also path to the primer3 executable and primer3 config files must be specified # in the global variables section of the script. # # Usage: # ----- # Usage: findSSRs.pl <arguments> # # The list of arguments includes: # # -f|--fasta_file <fasta_file> # Required. The file of the sequences to be searched. # # -m|--masked_file <masked_fasta_file> # Required. A soft-masked version of the fasta file (soft masked means low # complexity sequences are in lower case bases.) # # Output: # ------ # <input-file-name>.ssr.fasta # A fasta file with sequences with a SSR. (Sequences with compound SSRs are included) # # <input-file-name>.ssr_stats.txt # A text file of statistics about the SSRs discovered. # # <input-file-name>.ssr_report.txt # A tab-delimited file with each SSR. The columns are SSR ID, # motif, number of repeats, start position, end position. # # <input-file-name>.ssr_report.xlsx # A excel file with SSR results and stats # # <input-file-name>.di_primer_report.txt # <input-file-name>.tri_primer_report.txt # <input-file-name>.tetra_primer_report.txt # Tab-delimited files with sequences with a specified SSR motif length. Columns are # SSR ID, motif, number of repeats, start position, end position, left primer, # right primer, left primer Tm, right primer Tm, amplicon size # # Details: # ------- # By default the script finds: # 2 bp motifs repeated from 8 to 200 times, # 3 bp motifs repeated from 7 to 133 times, # 4 bp motifs repeated from 6 to 100 times, # # These parameters may be changed in the "GLOBAL PARAMETERS" part of # the script. # # Compound SSRs are defined as any SSRs that abut or are less than 15 bases # apart. These are essentially compound SSRs for the purposes of mapping # because it is unlikely that primers can be designed between the repeat # segments. # use strict; #------------------------------------------------------------------------------- # DEPENDENCIES #------------------------------------------------------------------------------- use Getopt::Long; use Bio::SeqIO; use Excel::Writer::XLSX; #------------------------------------------------------------------------------- # GLOBAL PARAMETERS #------------------------------------------------------------------------------- #-------------- # REPEAT IDENTIFICATION PARAMETERS # Specify Motif Frequency # Motifs that occur less frequently than indicated below will be ignored. # A 0 indicates that this motif length will be ignored. our $MIN_REPS_2bp = 6; our $MIN_REPS_3bp = 6; our $MIN_REPS_4bp = 4; our $MAX_REPS_2bp = 20; our $MAX_REPS_3bp = 20; our $MAX_REPS_4bp = 20; #------------ # PRIMER PARAMETERS # my $PRIMER3 = "/staton/software/primer3-2.3.6/src/primer3_core"; # my $PRIMER3_CONFIG = "/staton/software/primer3-2.3.6/src/primer3_config/"; my $PRIMER_OPT_SIZE="21"; # default 20 my $PRIMER_MIN_SIZE="18"; # default 18 my $PRIMER_MAX_SIZE="27"; # default 27 my $PRIMER_NUM_NS_ACCEPTED = "0"; # default 0 my $PRIMER_PRODUCT_SIZE_RANGE = "100-400"; my $PRIMER_OPT_TM = "60.0"; my $PRIMER_MIN_TM = "55.0"; my $PRIMER_MAX_TM = "65.0"; my $PRIMER_MIN_GC = "40"; my $PRIMER_MAX_GC = "60"; my $PRIMER_MAX_POLY_X = "3"; my $PRIMER_GC_CLAMP = "2"; my $PRIMER_LOWERCASE_MASKING = 1; #------------------------------------------------------------------------------- # GLOBAL HASHES #------------------------------------------------------------------------------- # This makes life much easier than passing a bunch of hash refs all over the place. # #------------------------------------------------------------------------------- # Data structures: ## CONTIG_SSR_STARTS structure: ## key: contig_name ## value: array of starts of SSRs in that contig my %CONTIG_SSR_STARTS = (); ## SSR_STATS structure: ## key: ssr_id ## value -> keys: MOTIF START END MOTIF_LENGTH NO_REPEATS my %SSR_STATS = (); my $SEQ_COUNT = 0; # Set up the Motif specifications, based on the chosen motif types:. my @MOTIF_SPECS; push(@MOTIF_SPECS,[2, $MIN_REPS_2bp, $MAX_REPS_2bp, 'dinucleotides']); push(@MOTIF_SPECS,[3, $MIN_REPS_3bp, $MAX_REPS_3bp, 'trinucleotides']); push(@MOTIF_SPECS,[4, $MIN_REPS_4bp, $MAX_REPS_4bp, 'tetranucleotides']); #------------------------------------------------------------------------------- # Generating statistics my $TIME = scalar localtime; # Get the current time my $SEQ_w_SSRS = 0; my $SSR_COUNT = 0; my $SSR_COUNT_COMPOUND = 0; my $SSR_COUNT_PRIMER = 0; my %MOTIFLEN = (2 => 0, 3 => 0, 4 => 0); my %MOTIFLEN_w_PRIMERS = (2 => 0, 3 => 0, 4 => 0); my %MOTIFS = ('|AT|TA|' => 0, '|AG|GA|CT|TC|' => 0, '|AC|CA|TG|GT|' => 0, '|GC|CG|' => 0, '|AAT|ATA|TAA|ATT|TTA|TAT|' => 0, '|AAG|AGA|GAA|CTT|TTC|TCT|' => 0, '|AAC|ACA|CAA|GTT|TTG|TGT|' => 0, '|CCA|CAC|ACC|TGG|GTG|GGT|' => 0, '|GGC|GCG|CGG|GCC|CCG|CGC|' => 0, '|AGG|GAG|GGA|CCT|CTC|TCC|' => 0, '|ATG|TGA|GAT|CAT|ATC|TCA|' => 0, '|AGT|GTA|TAG|ACT|CTA|TAC|' => 0, '|AGC|GCA|CAG|GCT|CTG|TGC|' => 0, '|ACG|CGA|GAC|CGT|GTC|TCG|' => 0); #------------------------------------------------------------------------------- # EXECUTE #------------------------------------------------------------------------------- main(); #------------------------------------------------------------------------------- # PUBLIC SUBROUTINES #------------------------------------------------------------------------------- # Function Name: main() sub main{ ##--------------------------------------------------------------- ## Get input parameters my $fasta_file; my $masked_file; my $project; Getopt::Long::Configure ('bundling'); GetOptions('f|fasta_file=s' => \$fasta_file, 'm|masked_file=s' => \$masked_file, 'p|project=s' => \$project); ## Check that all required parameters have been included if(!$fasta_file){ print "A fasta file is required.\n"; _printUsage(); exit;} if(!$masked_file){ print "A masked file is required.\n"; _printUsage(); exit;} ## Check that fasta file exists if(! -e $fasta_file) { print "Fasta file $fasta_file does not exist\n"; exit; } if(! -e $masked_file) { print "Masked file $masked_file does not exist\n"; exit; } ##--------------------------------------------------------------- ## Set up output files my $p3_input = "$fasta_file.p3in.txt"; my $p3_output = "$fasta_file.p3out.txt"; my $ssr_out = "$fasta_file.ssr_report.txt"; my $fasta_out = "$fasta_file.ssr.fasta"; my $stats_out = "$fasta_file.ssr_stats.txt"; my $di_primer_out = "$fasta_file.di_primer_report.txt"; my $tri_primer_out = "$fasta_file.tri_primer_report.txt"; my $tetra_primer_out = "$fasta_file.tetra_primer_report.txt"; my $ssr_xlsx = "$fasta_file.ssr_report.xlsx"; ##--------------------------------------------------------------- print "finding SSRs...\n"; process_file($fasta_file, $masked_file); collapse_compound_ssrs(); flag_multiSSRs(); print "done.\n"; ##--------------------------------------------------------------- print "running primer3...\n"; addToPrimer3InputFile ($p3_input); print "primer3_core < $p3_input > $p3_output\n"; my $status = system("primer3_core < $p3_input > $p3_output"); print "$status\n"; parseP3_output($p3_output); print "done.\n"; ##--------------------------------------------------------------- ## Producing output - Fasta files and flat files print "printing output files..."; create_flat_files($ssr_out, $di_primer_out, $tri_primer_out, $tetra_primer_out); create_fasta_file($fasta_out); ##--------------------------------------------------------------- ## Producing output - statistics calculate_stats(); print_stats($stats_out); ##--------------------------------------------------------------- ## Producing output - Excel create_excel_file($ssr_xlsx, $project); print "done.\n"; } ############################################################### sub process_file{ my $fasta_file = $_[0]; # file name my $masked_file = $_[1]; # file name my $seqio = Bio::SeqIO->new('-format' => 'fasta', -file => $fasta_file); my $seqioM = Bio::SeqIO->new('-format' => 'fasta', -file => $masked_file); # Get seq obj from io stream while(my $seqobj = $seqio->next_seq){ my $seqobjM = $seqioM->next_seq; $SEQ_COUNT++; my $seqname = $seqobj->id; # get actual sequence as a string my $seqnameM = $seqobjM->id; # get actual sequence as a string my $seqstr = $seqobj->seq(); # get actual sequence as a string my $seqstrM = $seqobjM->seq(); # get actual sequence as a string if($seqname ne $seqnameM){ die "masked sequence $seqnameM not in same order as regular sequence $seqname\n"; } process_seq($seqname, $seqstr, $seqstrM); } } ############################################################### sub process_seq{ my $contig_name = shift; my $seq = shift; my $seq_masked = shift; my %seen; # used to keep track of start positions we've already seen my $index; # used to iterate through the 2D motif specs array ## LOOPB # iterate through the motif specifications for $index (0 .. (scalar @MOTIF_SPECS - 1)) { # holds the motif size (1,2,3,4,5 or 6) my $motifLength = $MOTIF_SPECS[$index][0]; # holds the minimum number of repeats my $min_number_of_repeats = $MOTIF_SPECS[$index][1]-1; # holds the maximum number of repeats my $max_number_of_repeats = $MOTIF_SPECS[$index][2]-1; # the regular expression for looking for SSRs my $regex = "(([gatc]{$motifLength})\\2{$min_number_of_repeats,$max_number_of_repeats})"; # run through the sequence and check for this motif spec while ($seq =~ /$regex/ig) { # Get the ssr and motif that were found my $ssr = $1; my $motif = lc $2; my $start_index = $-[0]; my $end_index = $+[0]; if(quality_check_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq)){ process_ssr($contig_name, $ssr, $motif, $start_index, $end_index, $seq, $seq_masked); } } } } ############################################################### sub quality_check_ssr{ my $contig_name = shift; my $ssr = shift; my $motif = shift; my $start_index = shift; my $end_index = shift; my $seq = shift; ##------------------------------------- ## CHECKS to see if this is a good ssr my $flag_same_base = 0; my $flag_already_seen = 0; ## Check #1 ## ignore SSRs that are the same base repeated if ($ssr !~ /^g+$/i && $ssr !~ /^a+$/i && $ssr !~ /^c+$/i && $ssr !~ /^t+$/i ) { $flag_same_base = 1; } # Check #2 # Make sure this isn't an already called SSR in disguise # (a dinucleotide repeat posing as a tetranucleotide repeat, for instance) if (!exists $SSR_STATS{$contig_name."_ssr".$start_index} && !exists $SSR_STATS{$contig_name."_ssr".($start_index-1)} && !exists $SSR_STATS{$contig_name."_ssr".($start_index-2)} && !exists $SSR_STATS{$contig_name."_ssr".($start_index+1)} && !exists $SSR_STATS{$contig_name."_ssr".($start_index+2)} ) { $flag_already_seen = 1; } if($flag_same_base && $flag_already_seen){ return 1; } else{ return 0; } } ###################################################### sub process_ssr{ my $contig_name = shift; my $ssr = shift; my $motif = shift; my $start_index = shift; my $end_index = shift; my $seq = shift; my $seq_masked = shift; ##------------------------------------- ## generate a few more stats and variables my $motif_len = length $motif; my $num_of_repeats = ($end_index-$start_index)/$motif_len; my $ssr_id = $contig_name."_ssr".$start_index; ##------------------------------------- ## store in data structures if(exists $CONTIG_SSR_STARTS{$contig_name}){ push @{ $CONTIG_SSR_STARTS{$contig_name} }, $start_index; } else{ $CONTIG_SSR_STARTS{$contig_name} = [$start_index]; } $SSR_STATS{$ssr_id}{MOTIF} = $motif; $SSR_STATS{$ssr_id}{START} = $start_index; $SSR_STATS{$ssr_id}{END} = $end_index; $SSR_STATS{$ssr_id}{MOTIF_LENGTH} = $motif_len; $SSR_STATS{$ssr_id}{NO_REPEATS} = $num_of_repeats; $SSR_STATS{$ssr_id}{SEQ} = $seq; $SSR_STATS{$ssr_id}{SEQM} = $seq_masked; $SSR_STATS{$ssr_id}{COMPOUND} = 0; #assume its not until proven otherwise } ####################################################### sub collapse_compound_ssrs{ foreach my $contig (keys %CONTIG_SSR_STARTS){ my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; if(@starts > 1){ ## this contig has multiple ssrs my $previous_start = -1; my $previous_end = -1; my $previous_ssr_id = ""; foreach my $current_start (sort {$a <=> $b} @starts){ my $current_ssr_id = $contig."_ssr".$current_start; my $current_end = $SSR_STATS{$current_ssr_id}{END}; if(too_close($previous_start, $previous_end, $current_start, $current_end)){ collapse($contig, $previous_ssr_id, $current_ssr_id); } $previous_start = $current_start; $previous_end = $current_end; $previous_ssr_id = $current_ssr_id; } } } } ################################################################ sub too_close{ my $previous_start = shift; my $previous_end = shift; my $current_start = shift; my $current_end = shift; # if start is a -1, then go ahead and return ok if($previous_start == -1){ return 0; } # we want to know if they overlap, abut or are less than 15 bases apart elsif($previous_end >= $current_start || ($current_start - $previous_end) < 15){ #print "$previous_start - $previous_end, $current_start - $current_end\n"; return 1; } else{ return 0; } } ################################################################ sub collapse{ my $contig = shift; my $first_ssr_id = shift; my $second_ssr_id = shift; my $second_ssr_start = $SSR_STATS{$second_ssr_id}{START}; ##fix SSR_STATS $SSR_STATS{$first_ssr_id}{MOTIF} = "COMPOUND"; $SSR_STATS{$first_ssr_id}{END} = $SSR_STATS{$second_ssr_id}{END}; $SSR_STATS{$first_ssr_id}{MOTIF_LENGTH} = "COMPOUND"; $SSR_STATS{$first_ssr_id}{NO_REPEATS} = "COMPOUND"; $SSR_STATS{$first_ssr_id}{COMPOUND} = 1; #assume its not until proven otherwise delete $SSR_STATS{$second_ssr_id}{MOTIF}; delete $SSR_STATS{$second_ssr_id}{START}; delete $SSR_STATS{$second_ssr_id}{END}; delete $SSR_STATS{$second_ssr_id}{MOTIF_LENGTH}; delete $SSR_STATS{$second_ssr_id}{NO_REPEATS}; delete $SSR_STATS{$second_ssr_id}{SEQ}; delete $SSR_STATS{$second_ssr_id}{SEQM}; delete $SSR_STATS{$second_ssr_id}{COMPOUND}; undef %{$SSR_STATS{$second_ssr_id}}; delete $SSR_STATS{$second_ssr_id}; print "\tdeleting $second_ssr_id, part of compound ssr\n"; if(exists $SSR_STATS{$second_ssr_id}){ print "\t still exists\n";} ##fix CONTIG_SSR_STARTS # get rid of the start index for the second ssr (it is now part of the # first ssr) my $index = 0; $index++ until $CONTIG_SSR_STARTS{$contig}[$index] == $second_ssr_start; splice(@{$CONTIG_SSR_STARTS{$contig}}, $index, 1); } ################################################################ sub flag_multiSSRs{ # adds a MULTI flag to the data hash indicating if the # ssr is the only one in the sequence or one of many foreach my $contig (keys %CONTIG_SSR_STARTS){ my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; if(@starts == 1){ ## this contig has only one ssr my $start_index = $starts[0]; my $ssr_id = $contig."_ssr".$start_index; $SSR_STATS{$ssr_id}{MULTI} = 0; } else{ ## this contig has multiple ssrs foreach my $start_index (@starts){ my $ssr_id = $contig."_ssr".$start_index; $SSR_STATS{$ssr_id}{MULTI} = 1; } } } close FASTA; } ################################################################ sub addToPrimer3InputFile{ my $p3_file = shift; open OUT, ">$p3_file"; foreach my $ssr_id (keys %SSR_STATS){ #skip compound SSRS if($SSR_STATS{$ssr_id}{COMPOUND} == 0){ my $ssrStart = $SSR_STATS{$ssr_id}{START}; my $ssrEnd = $SSR_STATS{$ssr_id}{END}; my $seq = $SSR_STATS{$ssr_id}{SEQM}; # change from soft mask to hard mask $seq =~ s/[actg]/N/g; my $len = $ssrEnd-$ssrStart; printf OUT ("SEQUENCE_ID=$ssr_id\n"); printf OUT ("SEQUENCE_TEMPLATE=$seq\n"); printf OUT ("SEQUENCE_TARGET=$ssrStart,$len\n"); printf OUT ("PRIMER_TASK=generic\n"); printf OUT ("PRIMER_PICK_LEFT_PRIMER=1\n"); printf OUT ("PRIMER_PICK_INTERNAL_OLIGO=0\n"); printf OUT ("PRIMER_PICK_RIGHT_PRIMER=1\n"); printf OUT ("PRIMER_OPT_SIZE=$PRIMER_OPT_SIZE\n"); printf OUT ("PRIMER_MIN_SIZE=$PRIMER_MIN_SIZE\n"); printf OUT ("PRIMER_MAX_SIZE=$PRIMER_MAX_SIZE\n"); printf OUT ("PRIMER_NUM_NS_ACCEPTED=$PRIMER_NUM_NS_ACCEPTED\n"); printf OUT ("PRIMER_PRODUCT_SIZE_RANGE=$PRIMER_PRODUCT_SIZE_RANGE\n"); printf OUT ("PRIMER_OPT_TM=$PRIMER_OPT_TM\n"); printf OUT ("PRIMER_MIN_TM=$PRIMER_MIN_TM\n"); printf OUT ("PRIMER_MAX_TM=$PRIMER_MAX_TM\n"); printf OUT ("PRIMER_MIN_GC=$PRIMER_MIN_GC\n"); printf OUT ("PRIMER_MAX_GC=$PRIMER_MAX_GC\n"); printf OUT ("PRIMER_MAX_POLY_X=$PRIMER_MAX_POLY_X\n"); printf OUT ("PRIMER_GC_CLAMP=$PRIMER_GC_CLAMP\n"); # printf OUT ("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=$PRIMER3_CONFIG\n"); printf OUT ("PRIMER_LOWERCASE_MASKING=$PRIMER_LOWERCASE_MASKING\n"); printf OUT ("=\n"); } } close OUT; } ################################################################ sub parseP3_output{ my $p3_output = $_[0]; # file name # We are going to keep track of a weird phenomenon only seen in one # project - the generation of identical forward and reverse primers. The # sequences from this project were overlapping paired ends that were joined. # Apparently something went wrong and weird sequences were obtained, all of # which yield the identical primers. # This is not reported in the final stats, just as part of the standard output. my $identical_primer_cnt = 0; # The primers output file separates information about different sequences # with an equal sign on a single line. So, we want to set the file line # delimiter (for looping on the input file below) to a single equal sign # followed by a line feed. This way were guranteed to have all the primer # information together per line local $/ = "=\n"; open (P3O, $p3_output) || die "could not open $_\n"; # Read in all of the lines of the input file my $primer_record; while ($primer_record = <P3O>) { my $start = ""; my $seq_id = ""; my $ssr_id = ""; my $forward = ""; my $reverse = ""; my $product_size = ""; my $left_tm = ""; my $right_tm = ""; if ($primer_record =~ /SEQUENCE_ID=(\S+)/) { $ssr_id = $1; } # get the primary primers only if ($primer_record =~ /PRIMER_LEFT_0_SEQUENCE=(\S+)/) { $forward = $1; } if ($primer_record =~ /PRIMER_RIGHT_0_SEQUENCE=(\S+)/) { $reverse = $1; } if ($primer_record =~ /PRIMER_LEFT_0_TM=(\S+)/) { $left_tm = $1; } if ($primer_record =~ /PRIMER_RIGHT_0_TM=(\S+)/) { $right_tm = $1; } if ($primer_record =~ /PRIMER_PAIR_0_PRODUCT_SIZE=(\S+)/) { $product_size = $1; } if(length $forward > 1){ if($forward eq $reverse){ print "\tFLAG: identical primer problem with $ssr_id\n"; $identical_primer_cnt++; } else{ $SSR_STATS{$ssr_id}{FORWARD} = $forward; $SSR_STATS{$ssr_id}{REVERSE} = $reverse; $SSR_STATS{$ssr_id}{PRODUCT_SIZE} = $product_size; $SSR_STATS{$ssr_id}{LEFT_TM} = $left_tm; $SSR_STATS{$ssr_id}{RIGHT_TM} = $right_tm; } } } print "\ttotal identical primers: $identical_primer_cnt\n"; } ################################################### sub create_flat_files{ my $ssr_out = shift; my $di_primer_out = shift; my $tri_primer_out = shift; my $tetra_primer_out = shift; open OUTS, ">$ssr_out"; open OUT2, ">$di_primer_out"; open OUT3, ">$tri_primer_out"; open OUT4, ">$tetra_primer_out"; my $di_fh = *OUT2; my $tri_fh = *OUT3; my $tetra_fh = *OUT4; ##printer headers print OUTS join("\t", "SSR ID", "motif", "number of repeats", "start position", "end position"); print OUTS "\n"; print OUT2 join("\t", "SSR ID", "motif", "number of repeats", "start position", "end position", "forward primer", "reverse primer", "forward Tm", "reverse Tm","product size" ); print OUT2 "\n"; print OUT3 join("\t", "SSR ID", "motif", "number of repeats", "start position", "end position", "forward primer", "reverse primer", "forward Tm", "reverse Tm","product size" ); print OUT3 "\n"; print OUT4 join("\t", "SSR ID", "motif", "number of repeats", "start position", "end position", "forward primer", "reverse primer", "forward Tm", "reverse Tm","product size" ); print OUT4 "\n"; foreach my $ssr_id (keys %SSR_STATS){ ## all ssrs including compound go in main ssr file print OUTS join("\t", $ssr_id, $SSR_STATS{$ssr_id}{MOTIF}, $SSR_STATS{$ssr_id}{NO_REPEATS}, $SSR_STATS{$ssr_id}{START}, $SSR_STATS{$ssr_id}{END}, ); print OUTS "\n"; # for primer flat files, only print SSRs with # that have primers if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ ){ if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ _print_primer_flat_file_line($di_fh, $ssr_id); } elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ _print_primer_flat_file_line($tri_fh, $ssr_id); } elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ _print_primer_flat_file_line($tetra_fh, $ssr_id); } } } close OUTS; close OUT2; close OUT3; close OUT4; } ################################################### sub _print_primer_flat_file_line{ my $fh = shift; my $ssr_id = shift; print $fh join("\t", $ssr_id, $SSR_STATS{$ssr_id}{MOTIF}, $SSR_STATS{$ssr_id}{NO_REPEATS}, $SSR_STATS{$ssr_id}{START}, $SSR_STATS{$ssr_id}{END}, $SSR_STATS{$ssr_id}{FORWARD}, $SSR_STATS{$ssr_id}{REVERSE}, $SSR_STATS{$ssr_id}{LEFT_TM}, $SSR_STATS{$ssr_id}{RIGHT_TM}, $SSR_STATS{$ssr_id}{PRODUCT_SIZE}, ); print $fh "\n"; } ################################################### sub create_fasta_file{ my $fasta_out = shift; open FASTA, ">$fasta_out"; foreach my $contig (keys %CONTIG_SSR_STARTS){ my @starts = @{ $CONTIG_SSR_STARTS{$contig}}; my $seq = ""; print FASTA ">$contig ("; foreach my $start_index (sort {$a <=> $b} @starts){ my $ssr_id = $contig."_ssr".$start_index; #if($SSR_STATS{$ssr_id}{START} >= 0){ $seq = $SSR_STATS{$ssr_id}{SEQ}; print FASTA "$SSR_STATS{$ssr_id}{START}-$SSR_STATS{$ssr_id}{END} "; if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ print FASTA "*Compound "; } } #get the first ssr index just so we can get the sequence print FASTA ")\n"; print FASTA "$seq\n"; } close FASTA; } ################################################################ sub calculate_stats{ $SEQ_w_SSRS = keys %CONTIG_SSR_STARTS; foreach my $ssr_id (keys %SSR_STATS){ $SSR_COUNT++; if($SSR_STATS{$ssr_id}{COMPOUND} == 1){ $SSR_COUNT_COMPOUND++; } else{ my $motif_len = $SSR_STATS{$ssr_id}{MOTIF_LENGTH} ; #print "motif length is $motif_len\n"; $MOTIFLEN{$motif_len}++; my $motifUC = uc($SSR_STATS{$ssr_id}{MOTIF}); foreach my $group (keys %MOTIFS) { if($group =~ /\|$motifUC\|/){ #print "Incrementing $group for $motifUC\n"; $MOTIFS{$group}++; } } if($SSR_STATS{$ssr_id}{FORWARD} =~ /\S/){ $SSR_COUNT_PRIMER++; $MOTIFLEN_w_PRIMERS{$motif_len}++; } } } } sub print_stats{ my $stats_out = $_[0]; # file name open (OUTS, ">".$stats_out) || die "ERROR cannot open $stats_out\n"; print OUTS 'SSR Summary Report\n'; print OUTS "Analysis of $SEQ_COUNT sequences\n"; print OUTS "$TIME\n"; print OUTS "\n"; print OUTS "Number of sequences with at least one SSR\t$SEQ_w_SSRS\n"; print OUTS "Number of SSRs identified\t$SSR_COUNT\n"; print OUTS "\n"; print OUTS "Number of compound SSRs*: $SSR_COUNT_COMPOUND\n"; print OUTS "Number of SSRs with primers**: $SSR_COUNT_PRIMER\n"; print OUTS "\n"; print OUTS "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"; print OUTS "**No primers are designed for compound SSRs\n"; print OUTS "\n"; print OUTS "Parameters used for identifying SSRS:\n"; print OUTS "Base Pairs in Motif\tMin # Reps\tMax # Reps\n"; print OUTS "--------------------------------------\n"; print OUTS "2 (Dinucleotides)\t$MIN_REPS_2bp\t$MAX_REPS_2bp\n"; print OUTS "3 (Trinucleotides)\t$MIN_REPS_3bp\t$MAX_REPS_3bp\n"; print OUTS "4 (Tetranucleotides)\t$MIN_REPS_4bp\t$MAX_REPS_4bp\n"; print OUTS "\n"; print OUTS "Chart of motif pattern frequence (compound SSRs excluded)\n"; print OUTS "Motif Patterns\tNumber of SSRs Found\n"; print OUTS "--------------------------------------\n"; my $group; foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ $group =~ s/^|//; $group =~ s/|$//; print OUTS "$group\t$MOTIFS{$group}\n"; } print OUTS "\n"; print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; print OUTS "Motif Pattern Length\tNumber of SSRs\n"; print OUTS "--------------------------------------\n"; foreach $group (sort keys %MOTIFLEN){ print OUTS "$group\t$MOTIFLEN{$group}\n"; } print OUTS "\n"; print OUTS "SSRS with Primers \n"; print OUTS "Chart of motif pattern length frequence (compound SSRs excluded)\n"; print OUTS "Motif Pattern Length\tNumber of SSRs\n"; print OUTS "--------------------------------------\n"; foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ print OUTS "$group\t$MOTIFLEN_w_PRIMERS{$group}\n"; } close OUTS; } ################################################################ sub create_excel_file{ my $ssr_xlsx = shift; my $project = shift; # Create an excel workbook my $workbook = Excel::Writer::XLSX->new("$ssr_xlsx"); # Setup the formats that will be necessary for the excel spreadsheet my %header = (font => 'Calibri', size => 12, bold => 1, color => 'black', align => 'left', text_wrap => 1); my %text = (font => 'Calibri', size => 12, color => 'black', align => 'left', text_wrap => 1); #add the formats to the workbook my $header_format = $workbook->add_format(%header); my $text_format = $workbook->add_format(%text); my $worksheet_stats = create_stats_worksheet($workbook, $header_format, $text_format, $project); build_data_worksheets($workbook, $header_format, $text_format); $worksheet_stats->activate(); $worksheet_stats->select(); $workbook->close(); } sub create_stats_worksheet{ my $workbook = shift; my $header_format = shift; my $text_format = shift; my $project = shift; my $worksheet = $workbook->add_worksheet("Summary"); ## set all cells to text format ## only cells that need the header format will need to specify the format during write ## set column widths $worksheet->set_column('A:A', 75, $text_format); $worksheet->set_column('B:B', 30, $text_format); $worksheet->set_column('C:C', 30, $text_format); $worksheet->write('A1', "SSR Summary Report for $project", $header_format); $worksheet->write('A2', "Analysis of $SEQ_COUNT sequences"); $worksheet->write('A3', "$TIME"); $worksheet->write('A5', "Number of sequences with at least one SSR"); $worksheet->write('B5', "$SEQ_w_SSRS"); $worksheet->write('A6', "Number of SSRs identified"); $worksheet->write('B6', "$SSR_COUNT"); $worksheet->write('A8', "Number of compound SSRs*:"); $worksheet->write('B8', "$SSR_COUNT_COMPOUND"); $worksheet->write('A9', "Number of SSRs with primers**"); $worksheet->write('B9', "$SSR_COUNT_PRIMER"); $worksheet->write('A11', "*Compound SSRs are defined as any SSRs next to each or separated by less than 15 bases\n"); $worksheet->write('A12', "**No primers are designed for compound SSRs\n"); $worksheet->write('A14', "Parameters used for identifying SSRS:\n", $header_format); $worksheet->write('A15','Base Pairs in Motif', $header_format); $worksheet->write('B15','Min # Reps', $header_format); $worksheet->write('C15','Max # Reps', $header_format); $worksheet->write('A16','2 (Dinucleotides)'); $worksheet->write('B16',"$MIN_REPS_2bp"); $worksheet->write('C16',"$MAX_REPS_2bp"); $worksheet->write('A17','3 (Trinucleotides)'); $worksheet->write('B17',"$MIN_REPS_3bp"); $worksheet->write('C17',"$MAX_REPS_3bp"); $worksheet->write('A18','4 (Tetranucleotides)'); $worksheet->write('B18',"$MIN_REPS_4bp"); $worksheet->write('C18',"$MAX_REPS_4bp"); ##---------------------------------------------------------- ##Chart of motif pattern frequence (compound SSRs excluded) $worksheet->write('A20','Chart of motif pattern frequence (compound SSRs excluded)', $header_format); $worksheet->write('A21','Motif Patterns', $header_format); $worksheet->write('B21','Number of SSRs Found', $header_format); my $group; my $i = 21; foreach $group (sort {length $a <=> length $b} keys %MOTIFS){ $group =~ s/^|//; $group =~ s/|$//; $i++; $worksheet->write("A$i", $group); $worksheet->write("B$i", $MOTIFS{$group}); } ##---------------------------------------------------------- ## Chart of motif pattern length frequence (compound SSRs excluded) $i++; $i++; $worksheet->write("A$i", "Chart of motif pattern length frequence (compound SSRs excluded)", $header_format); $i++; $worksheet->write("A$i",'Motif Pattern Length', $header_format); $worksheet->write("B$i",'Number of SSRs Found', $header_format); foreach $group (sort keys %MOTIFLEN){ $i++; $worksheet->write("A$i", "$group bp"); $worksheet->write("B$i", $MOTIFLEN{$group}); } ##---------------------------------------------------------- ## SSRs w primers: ## Chart of motif pattern length frequence (compound SSRs excluded) $i++; $i++; $worksheet->write("A$i",'SSRs with Primers', $header_format); $i++; $worksheet->write("A$i",'Chart of motif pattern length frequence (compound SSRs excluded)', $header_format); $i++; $worksheet->write("A$i",'Motif Pattern Length', $header_format); $worksheet->write("B$i",'Number of SSRs Found', $header_format); foreach $group (sort keys %MOTIFLEN_w_PRIMERS){ $i++; $worksheet->write("A$i", "$group bp"); $worksheet->write("B$i", $MOTIFLEN_w_PRIMERS{$group}); } return $worksheet; } ############################################################## sub build_data_worksheets{ my $workbook = shift; my $header_format = shift; my $text_format = shift; my $di_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Dinucleotides"); my $tri_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Trinucleotides"); my $tetra_worksheet = _initiate_worksheet($workbook, $header_format, $text_format, "Tetranucleotides"); my $di_index = 3; my $tri_index = 3; my $tetra_index = 3; foreach my $ssr_id (keys %SSR_STATS){ # for excel data files, only print SSRs # that have primers if($SSR_STATS{$ssr_id}{COMPOUND} == 0 && $SSR_STATS{$ssr_id}{FORWARD} =~ /\S/ ){ if($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 2){ _print_excel_file_line($di_worksheet, $di_index, $ssr_id); $di_index++; } elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 3){ _print_excel_file_line($tri_worksheet, $tri_index, $ssr_id); $tri_index++; } elsif($SSR_STATS{$ssr_id}{MOTIF_LENGTH} == 4){ _print_excel_file_line($tetra_worksheet, $tetra_index, $ssr_id); $tetra_index++; } } } } ############################################################## sub _initiate_worksheet{ my $workbook = $_[0]; my $header_format = $_[1]; my $text_format = $_[2]; my $name = $_[3]; my $worksheet = $workbook->add_worksheet($name); $worksheet->set_column('A:A', 60, $text_format); $worksheet->set_column('B:E', 10, $text_format); $worksheet->set_column('F:G', 30, $text_format); $worksheet->set_column('H:J', 10, $text_format); $worksheet->write('A1', "$name with primers", $header_format); $worksheet->write('A2', 'SSR ID', $header_format); $worksheet->write('B2', 'Motif', $header_format); $worksheet->write('C2', '# Repeats', $header_format); $worksheet->write('D2', 'Start', $header_format); $worksheet->write('E2', 'End', $header_format); $worksheet->write('F2', 'Forward Primer', $header_format); $worksheet->write('G2', 'Reverse Primer', $header_format); $worksheet->write('H2', 'Forward Tm', $header_format); $worksheet->write('I2', 'Reverse Tm', $header_format); $worksheet->write('J2', 'Fragment Size', $header_format); return $worksheet; } ################################################################ sub _print_excel_file_line{ my $worksheet = shift; my $index = shift; my $ssr_id = shift; $worksheet->write("A$index", $ssr_id); $worksheet->write("B$index", $SSR_STATS{$ssr_id}{MOTIF}); $worksheet->write("C$index", $SSR_STATS{$ssr_id}{NO_REPEATS}); $worksheet->write("D$index", $SSR_STATS{$ssr_id}{START}); $worksheet->write("E$index", $SSR_STATS{$ssr_id}{END}); $worksheet->write("F$index", $SSR_STATS{$ssr_id}{FORWARD}); $worksheet->write("G$index", $SSR_STATS{$ssr_id}{REVERSE}); $worksheet->write("H$index", $SSR_STATS{$ssr_id}{LEFT_TM}); $worksheet->write("I$index", $SSR_STATS{$ssr_id}{RIGHT_TM}); $worksheet->write("J$index", $SSR_STATS{$ssr_id}{PRODUCT_SIZE}); } ############################################################### sub _printUsage { print "Usage: $0.pl <arguments>"; print qq( The list of arguments includes: -f|--fasta_file <fasta_file> Required. The file of the sequences to be searched. -m|--masked_file <masked_fasta_file> Required. A soft-masked version of the fasta file (soft masked means low complexity sequences are in lower case bases.) -p|--project "project name" Optional. A project name for use in the Excel output. ); print "\n"; return; } 1;