changeset 0:56658e35798d draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61"
author galaxyp
date Thu, 04 Nov 2021 19:37:36 +0000
parents
children
files PhosphoPeptide_Upstream_Kinase_Mapping.pl phosphopeptide_kinase_mapping.xml
diffstat 2 files changed, 997 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Thu Nov 04 19:37:36 2021 +0000
@@ -0,0 +1,957 @@
+#!/usr/local/bin/perl
+
+use Getopt::Std;
+
+###############################################################################################################################
+#	perl Kinase_enrichment_analysis_complete_v0.pl
+#
+#	Nick Graham, USC
+#	2016-02-27
+#
+#	Built from scripts written by NG at UCLA in Tom Graeber's lab:
+#		CombinePhosphoSites.pl
+#		Retrieve_p_motifs.pl
+#		NetworKIN_Motif_Finder_v7.pl
+#
+#	Given a list of phospho-peptides, find protein information and upstream kinases.
+#	Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
+#
+###############################################################################################################################
+
+my ($file_in, $average_or_sum, $file_out, $phospho_type);
+my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function);
+
+##########
+## opts ##
+##########
+## input files
+# i : path to input outputfile_STEP2.txt
+# f : path to fasta
+# n : path to NetworKIN_201612_cutoffscore2.0.txt
+# m : path to pSTY_Motifs.txt
+# p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+# r : path to 2017-03_PSP_Regulatory_sites.txt
+## options
+# P : phospho_type
+# F : function
+## output files
+# o : path to output file
+
+sub usage()
+    {
+        print STDERR <<"EOH";
+    This program given a list of phospho-peptides, finds protein information and upstream kinases.
+    usage: $0 [-hvd] [-f file]
+     -h : this (help) message
+     -i : path to input outputfile_STEP2.txt
+     -f : path to fasta
+     -n : path to NetworKIN_201612_cutoffscore2.0.txt
+     -m : path to pSTY_Motifs.txt
+     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+     -r : path to 2017-03_PSP_Regulatory_sites.txt
+     -P : phospho_type
+     -F : function
+     -o : path to output file
+    example: $0
+EOH
+        exit;
+    }
+
+my %opts;
+getopts('i:f:n:m:p:r:o:P:F:h', \%opts) ;
+
+if (exists($opts{'h'})) {
+    usage();
+}
+if (!exists($opts{'i'}) || !-e $opts{'i'}) {
+	die('Input File not found');
+} else {
+	$file_in = $opts{'i'};
+}
+if (!exists($opts{'f'}) || !-e $opts{'f'}) {
+	die('Input Fasta File not found');
+} else {
+	$fasta_in = $opts{'f'};
+}
+if (!exists($opts{'n'}) || !-e $opts{'n'}) {
+	die('Input NetworKIN File not found');
+} else {
+	$networkin_in = $opts{'n'};
+}
+if (!exists($opts{'m'}) || !-e $opts{'m'}) {
+	die('Input pSTY_Motifs File not found');
+} else {
+	$motifs_in = $opts{'m'};
+}
+if (!exists($opts{'p'}) || !-e $opts{'p'}) {
+	die('Input PSP_Kinase_Substrate_Dataset File not found');
+} else {
+	$PhosphoSite_in = $opts{'p'};
+}
+if (!exists($opts{'r'}) || !-e $opts{'r'}) {
+	die('Input PSP_Regulatory_sites File not found');
+} else {
+	$PhosphoSite_molecular_function = $opts{'r'};
+}
+if (exists($opts{'P'})) {
+        $phospho_type = $opts{'P'};
+}
+else {
+        $phospho_type = "sty";
+}
+if (exists($opts{'F'})) {
+        $average_or_sum = $opts{'F'};
+}
+else {
+        $average_or_sum = "sum";
+}
+if (exists($opts{'o'})) {
+        $file_out = $opts{'o'};
+}
+else {
+        $file_out = "output.tsv";
+}
+
+
+###############################################################################################################################
+# Print the relevant file names to the screen
+###############################################################################################################################
+# print "\nData file:  $data_in\nFASTA file:  $fasta_in\nSpecies:  $species\nOutput file:  $motifs_out\n\n";
+print "\nData file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\n\n";
+print "Motifs file:  $motifs_in\nNetworKIN file:  networkin_in\nPhosphosite kinase substrate data:  $PhosphoSite_in\nPhosphosite functional data:  $PhosphoSite_molecular_function\nFASTA file:  $fasta_in\n\n";
+
+
+print "\nPhospho-residues(s) = $phospho_type\n";
+if ($phospho_type ne 'y') {
+	if ($phospho_type ne 'sty') {
+		die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
+	}
+}
+
+###############################################################################################################################
+# read the input data file
+# average or sum identical phospho-sites, depending on the value of $average_or_sum
+###############################################################################################################################
+
+open (IN, "$file_in") or die "I couldn't find the input file:  $file_in\n";
+
+die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
+
+
+my (@samples, %data, @tmp_data, %n);
+my $line = 0;
+
+while (<IN>) {
+	chomp;
+	my @x = split(/\t/);
+	for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n]  =~ s/\n//g; $x[$n]  =~ s/\"//g;}
+	
+	# Read in the samples
+	if ($line == 0) {
+		for my $n (1 .. $#x) {
+			push (@samples, $x[$n]); 
+		}
+		$line++;
+	} else {
+		# check whether we have already seen a phospho-peptide
+		if (exists($data{$x[0]})) { 
+			if ($average_or_sum eq "sum") {		# add the data
+				# unload the data
+				@tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
+				# add the new data and repack
+				for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
+				$all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
+				
+			} elsif ($average_or_sum eq "average") {		# average the data
+				# unload the data
+				@tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); }
+				# average with the new data and repack
+				for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); }
+				$n{$x[0]}++;
+				$data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
+			}
+		} 
+		# if the phospho-sequence has not been seen, save the data
+		else {
+			for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
+			$n{$x[0]} = 1;
+		}
+	}
+}
+close(IN);
+
+
+###############################################################################################################################
+# Search the FASTA database for phospho-sites and motifs
+#
+# based on Retrieve_p_peptide_motifs_v2.pl
+###############################################################################################################################
+
+
+###############################################################################################################################
+#
+#	Read in the Data file:
+#		1) make @p_peptides array as in the original file
+#		2) make @non_p_peptides array w/o residue modifications (p, #, other)
+#
+###############################################################################################################################
+
+my (@p_peptides, @non_p_peptides);
+foreach my $peptide (keys %data) {
+	$peptide =~ s/s/pS/g;	$peptide =~ s/t/pT/g;	$peptide =~ s/y/pY/g;
+	push (@p_peptides, $peptide);
+	$peptide =~ s/p//g;
+	push(@non_p_peptides, $peptide);
+}
+
+###############################################################################################################################
+#
+#	Read in the FASTA sequence file, save them to the @sequences array
+#
+###############################################################################################################################
+
+open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
+
+my (@accessions, @names, @sequences);
+
+print "Reading FASTA file $fasta_in\n";
+while (<IN1>) {
+	chomp;
+	my (@x) = split(/\|/);
+	for my $i (0 .. $#x) { 
+		$x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g; }
+	if ($x[0] =~ /^>/) {
+		$x[0] =~ s/\>//g;
+		push (@names, $x[2]);
+		push (@accessions, $x[1]);
+	} elsif ($x[0] =~ /^\w/) {
+		$sequences[$#accessions] = $sequences[$#accessions].$x[0];
+	}
+}
+close IN1;
+print "Done Reading FASTA file $fasta_in\n\n";
+
+
+###############################################################################################################################
+#
+#	Match the non_p_peptides to the @sequences array:
+#		1) Format the motifs +/- 10 residues around the phospho-site
+#		2) Print the original data plus the phospho-motif to the output file
+#
+###############################################################################################################################
+
+print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n";
+
+my (%matched_sequences, %accessions, %names, %sites, @tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites);
+
+for my $j (0 .. $#p_peptides) {
+	@tmp_matches = (); @tmp_accessions = (); @tmp_names = (); @tmp_sites = ();
+	
+	#Find the matching protein sequence(s) for the peptide
+	my $site = -1; my $match = 0;
+	for my $k (0 .. $#sequences) {
+		$site = index($sequences[$k], $non_p_peptides[$j]);
+		if ($site != -1) {
+			push(@tmp_matches, $sequences[$k]);
+			push(@tmp_accessions, $accessions[$k]);
+			push(@tmp_names, $names[$k]);
+			push(@tmp_sites, $site);
+			$site = -1; $match++;
+		}
+	}
+	
+	if ($match == 0) {	# Check to see if no match was found.  Skip to next if no match found.
+		print "Warning:  Failed match for $p_peptides[$j]\n";
+		$matched_sequences{$p_peptides[$j]} = "Failed match";
+		next;
+	} else {
+		$matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
+		$accessions{$p_peptides[$j]} = [ @tmp_accessions ];
+		$names{$p_peptides[$j]} = [ @tmp_names ];
+		$sites{$p_peptides[$j]} = [ @tmp_sites ];
+	}
+}
+
+my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
+
+for my $peptide_to_match ( keys %matched_sequences ) {
+	next if ($peptide_to_match eq "Failed match");
+	my @matches = @{$matched_sequences{$peptide_to_match}};
+	@tmp_motifs_array = ();
+	for my $i (0 .. $#matches) {
+		
+		# Find the location of the phospo-site in the sequence(s)
+		$tmp_site = 0; my $offset = 0;
+		my $tmp_p_peptide = $peptide_to_match;
+		$tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
+		
+		# Find all phosphorylated residues in the p_peptide
+		@p_sites = ();  
+		while ($tmp_site != -1) {
+			$tmp_site = index($tmp_p_peptide, 'p', $offset);
+			if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
+			$offset = $tmp_site + 1;
+			$tmp_p_peptide =~ s/p//;
+		}
+		
+		@tmp_p_residues = ();
+		for my $l (0 .. $#p_sites) {
+			push (@tmp_p_residues, $p_sites[$l]+$sites{$peptide_to_match}[$i]);
+		
+		# Match the sequences around the phospho residues to find the motifs
+		my ($desired_residues_L, $desired_residues_R);
+		if ($tmp_p_residues[0] - 10 < 0) {	#check to see if there are fewer than 10 residues left of the first p-site
+			# eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
+			$desired_residues_L = $tmp_p_residues[0];  
+		}
+		else {
+			$desired_residues_L = 10;
+		}
+		my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
+		if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {	#check to see if there are fewer than 10 residues right of the last p-site
+			$desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);  
+			# eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
+			# print "Line 170:  seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
+		}
+		else {
+			$desired_residues_R = 10;
+		}
+		
+		my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
+		$tmp_motif = substr($matched_sequences{$peptide_to_match}[$i], $tmp_p_residues[0] - $desired_residues_L, $total_length);
+		
+		# Put the "p" back in front of the appropriate phospho-residue(s).
+		my (@tmp_residues, $tmp_position);
+		for my $m (0 .. $#p_sites) {
+			# print "Line 183: $p_sites[$m]\n";
+			if ($m == 0) {$tmp_position = $desired_residues_L;}
+			else {$tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];}
+			# print "Line 187: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\n";
+			push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
+			if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
+			if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
+			if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
+		}
+		
+		$tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
+		
+		# Comment out on 8.10.13 to remove the numbers from motifs
+		my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
+		my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
+		$tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
+		push(@tmp_motifs_array, $tmp_motif);
+		$residues{$peptide_to_match}{$i} = [ @tmp_residues ];
+		$p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
+		}
+	$p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
+	}		### this bracket could be in the wrong place
+}
+
+
+###############################################################################################################################
+#
+#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
+#
+###############################################################################################################################
+
+
+
+###############################################################################################################################
+#
+#	Read the NetworKIN_predictions file:
+#		1) make a "kinases_observed" array
+#		2) annotate the phospho-substrates with the appropriate kinase
+#
+###############################################################################################################################
+
+my (@kinases_observed, $kinases);
+my ($p_sequence_kinase, $p_sequence, $kinase);
+
+open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
+print "\nReading the NetworKIN data:  $networkin_in\n";
+while (<IN1>) {
+	chomp;
+	my (@x) = split(/\t/);
+	for my $i (0 .. $#x) {
+		$x[$i] =~ s/\r//g; 	$x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+	}
+	next if ($x[0] eq "#substrate");
+	if (exists ($kinases -> {$x[2]})) {
+		#do nothing
+	}
+	else {
+		$kinases -> {$x[2]} = $x[2];
+		push (@kinases_observed, $x[2]);
+	}
+	my $tmp = $x[10]."_".$x[2];	#eg, REEILsEMKKV_PKCalpha
+	if (exists($p_sequence_kinase -> {$tmp})) {
+		#do nothing
+	}
+	else {
+		$p_sequence_kinase -> {$tmp} = $tmp;
+	}
+}
+close IN1;
+
+###############################################################################################################################
+#
+#	Read the Kinase motifs file:
+#		1) make a "motif_sequence" array
+#
+###############################################################################################################################
+
+my (@motif_sequence, %motif_type, %motif_count);
+
+open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
+print "Reading the Motifs file:  $motifs_in\n";
+
+while (<IN2>) {
+	chomp;
+	my (@x) = split(/\t/);
+	for my $i (0 .. 2) {
+		$x[$i] =~ s/\r//g;
+		$x[$i]  =~ s/\n//g;
+		$x[$i]  =~ s/\"//g;
+		}
+	if (exists ($motif_type{$x[1]})) {
+		$motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
+	} else {
+		$motif_type{$x[1]} = $x[2];
+		$motif_count{$x[1]} = 0;
+		push (@motif_sequence, $x[1]);
+	}
+}
+close (IN2);
+
+
+###############################################################################################################################
+#  6.28.2011
+#	Read PhosphoSite data:
+#		1) make a "kinases_PhosphoSite" array
+#		2) annotate the phospho-substrates with the appropriate kinase
+#
+###############################################################################################################################
+
+
+my (@kinases_PhosphoSite, $kinases_PhosphoSite);
+my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
+
+my $line = 0;
+
+open (IN3, "$PhosphoSite_in") or die "I couldn't find $PhosphoSite_in\n";
+print "Reading the PhosphoSite data:  $PhosphoSite_in\n";
+
+while (<IN3>) {
+	chomp;
+	my (@x) = split(/\t/);
+	for my $i (0 .. $#x) {
+		$x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+		}
+	if ($line != 0) {
+		if (exists ($kinases_PhosphoSite -> {$x[0]})) {
+			#do nothing
+		}
+		else {
+			$kinases_PhosphoSite -> {$x[0]} = $x[0];
+			push (@kinases_PhosphoSite, $x[0]);
+		}
+		my $offset = 0;
+		# Replace the superfluous lower case s, t and y
+		my @lowercase = ('s','t','y');
+		my @uppercase = ('S','T','Y');
+		for my $k (0 .. 2) {
+			my $site = 0;
+			while ($site != -1) {
+				$site = index($x[11],$lowercase[$k], $offset);
+				if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
+				$offset = $site + 1;
+			}
+		}
+		my $tmp = $x[11]."_".$x[0];		#eg, RTPGRPLsSYGMDSR_PAK2
+		if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+			#do nothing
+		}
+		else {
+			$p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
+		}
+	}
+	$line++;
+}
+close IN3;
+
+
+###############################################################################################################################
+#  Read PhosphoSite regulatory site data:
+#		1) make a "regulatory_sites_PhosphoSite" hash
+#
+###############################################################################################################################
+
+
+my (%regulatory_sites_PhosphoSite);
+my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes);
+
+my $line = 0;
+
+open (IN4, "$PhosphoSite_molecular_function") or die "I couldn't find $PhosphoSite_molecular_function\n";
+print "Reading the PhosphoSite regulatory site data:  $PhosphoSite_molecular_function\n";
+
+while (<IN4>) {
+	chomp;
+	my (@x) = split(/\t/);
+	for my $i (0 .. $#x) {
+		$x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+	}
+	if ($line != 0) {
+		if (!exists($regulatory_sites_PhosphoSite{$x[9]})) {
+			$regulatory_sites_PhosphoSite{$x[9]} = $x[9];
+			$domain{$x[9]} = $x[10];
+			$ON_FUNCTION{$x[9]} = $x[11];
+			$ON_PROCESS{$x[9]} = $x[12];
+			$ON_PROT_INTERACT{$x[9]} = $x[13];
+			$ON_OTHER_INTERACT{$x[9]} = $x[14];
+			$notes{$x[9]} = $x[19];
+		}
+		else {
+			# $domain
+			if ($domain{$x[9]} eq "") {
+				$domain{$x[9]} = $domain{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$domain{$x[9]} = $domain{$x[9]}." / ".$x[10];
+			}
+			
+			# $ON_FUNCTION
+			if ($ON_FUNCTION{$x[9]} eq "") {
+				$ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10];
+			}
+			
+			# $ON_PROCESS
+			if ($ON_PROCESS{$x[9]} eq "") {
+				$ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10];
+			}
+			
+			# $ON_PROT_INTERACT
+			if ($ON_PROT_INTERACT{$x[9]} eq "") {
+				$ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
+			}
+			
+			# $ON_OTHER_INTERACT
+			if ($ON_OTHER_INTERACT{$x[9]} eq "") {
+				$ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
+			}
+			
+			# $notes
+			if ($notes{$x[9]} eq "") {
+				$notes{$x[9]} = $notes{$x[10]};
+			} elsif ($x[10] eq "") {
+				# do nothing
+			}
+			else {
+				$notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
+			}
+			
+		}
+	}
+$line++;
+}
+close IN4;
+
+###############################################################################################################################
+#
+#	Read the data file:
+#		1) find sequences that match the NetworKIN predictions
+#		2) find motifs that match the observed sequences
+#
+###############################################################################################################################
+
+my ($formatted_sequence, %unique_motifs);
+my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);
+my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2);
+
+foreach my $peptide (keys %data) {
+	# find the unique phospho-motifs for this $peptide
+	my @all_motifs = ();
+	for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+		my $tmp_motif = $p_motifs{$peptide}[$i];
+		push(@all_motifs, $tmp_motif);
+	}
+	for my $j (0 .. $#all_motifs) {
+		$all_motifs[$j] =~ s/\d+-\[\s//;  $all_motifs[$j] =~ s/\s\]\-\d+//;
+	}
+	
+	my %seen = ();
+	foreach my $a (@all_motifs) {
+		if (exists($seen{$a})) { next; } else {
+			push(@{$unique_motifs{$peptide}}, $a);
+			$seen{$a} = 1;
+		}
+	}
+	
+	# count the number of phospo-sites in the motif
+	my $number_pY = 0;
+	my $number_pSTY = 0;
+	if ($phospho_type eq 'y') {while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {$number_pY++;}}
+	if ($phospho_type eq 'sty') {while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {$number_pSTY++;}}
+	
+	# search each of the unique motifs for matches
+	for my $i (0 .. $#{$unique_motifs{$peptide}}) {
+		my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
+		if (($number_pY == 1) || ($number_pSTY == 1)) {
+			my $seq_plus5aa = 0;
+			my $seq_plus7aa = 0;
+			$formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
+			if ($phospho_type eq 'y') {
+				$seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
+				$seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
+			}
+			elsif ($phospho_type eq "sty") {
+				$seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
+				$seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
+			}
+			for my $i (0 .. $#kinases_observed) {
+				my $tmp = $seq_plus5aa."_".$kinases_observed[$i];	#eg, should be PGRPLsSYGMD_PKCalpha
+				if (exists($p_sequence_kinase -> {$tmp})) {
+					$kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
+				}
+			}
+			for my $i (0 .. $#motif_sequence) {
+				if ($peptide =~ /$motif_sequence[$i]/) {
+					$kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+				}
+			}
+			for my $i (0 .. $#kinases_PhosphoSite) {
+				my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];	#eg, should be RTPGRPLsSYGMDSR_PAK2
+				if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+					$kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+				}
+			}
+			if (exists($regulatory_sites_PhosphoSite{$seq_plus7aa})) {
+				$domain_2{$peptide} = $domain{$seq_plus7aa};
+				$ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+				$ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+				$ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+				$ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+				$notes_2{$peptide} = $notes{$seq_plus7aa};
+			}
+		}
+		elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
+			$formatted_sequence = $tmp_motif;
+			#Create the sequences with only one phosphorylation site
+			#eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes  1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329  and  1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
+
+			my (@sites, $offset, $next_p_site);
+			$sites[0] = index($tmp_motif, "p"); 
+			$offset = $sites[0] + 1;
+			while ($next_p_site != -1) {
+				$next_p_site = index($tmp_motif, "p", $offset);
+				if ($next_p_site != -1) {
+					push (@sites, $next_p_site);
+				}
+				$offset = $next_p_site+1;
+			}
+			
+			my @pSTY_sequences;
+			for my $n (0 .. $#sites) {
+				$pSTY_sequences[$n] = $tmp_motif;
+				for (my $m = $#sites; $m >= 0; $m--) {
+					if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
+				}
+			}
+			
+			my @formatted_sequences;
+			for my $k (0 .. $#sites) {
+				$formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
+			}
+
+			for my $k (0 .. $#formatted_sequences) {
+				if ($phospho_type eq 'y') {
+					$seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence[$k]))[1];
+					$seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence[$k]))[1];
+				}
+				elsif ($phospho_type eq "sty") {
+					$seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence[$k]))[1];
+					$seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence[$k]))[1];
+				}
+				for my $i (0 .. $#kinases_observed) {
+					my $tmp = $seq_plus5aa."_".$kinases_observed[$i];	#eg, should look like REEILsEMKKV_PKCalpha
+					if (exists($p_sequence_kinase -> {$tmp})) {
+						$kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
+					}
+				}
+				for my $i (0 .. $#motif_sequence) {
+					if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
+						$kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+					}
+				}
+				for my $i (0 .. $#kinases_PhosphoSite) {
+					my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];	#eg, should be RTPGRPLsSYGMDSR_PAK2
+					if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+						$kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+					}
+				}
+				if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
+					# $domain
+					if ($domain_2{$peptide} eq "") {
+						$domain_2{$peptide} = $domain{$seq_plus7aa};
+					}
+					elsif ($domain{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; 
+					}
+					
+					# $ON_FUNCTION_2
+					if ($ON_FUNCTION_2{$peptide} eq "") {
+						$ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+					}
+					elsif ($ON_FUNCTION{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; 
+					}
+					
+					# $ON_PROCESS_2
+					if ($ON_PROCESS_2{$peptide} eq "") {
+						$ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+					}
+					elsif ($ON_PROCESS{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; 
+					}
+					
+					# $ON_PROT_INTERACT_2
+					if ($ON_PROT_INTERACT_2{$peptide} eq "") {
+						$ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+					}
+					elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; 
+					}
+					
+					# $ON_OTHER_INTERACT_2
+					if ($ON_OTHER_INTERACT_2{$peptide} eq "") {
+						$ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+					}
+					elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; 
+					}
+					
+					# $notes_2
+					if ($notes_2{$peptide} eq "") {
+						$notes_2{$peptide} = $notes{$seq_plus7aa};
+					}
+					elsif ($notes{$seq_plus7aa} eq "") {
+						# do nothing
+					}
+					else {
+						$notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; 
+					}
+				}
+			}
+		}
+	}
+}
+
+
+###############################################################################################################################
+#
+# Print to the output file
+#
+###############################################################################################################################
+open (OUT, ">$file_out") || die "could not open the fileout: $file_out";
+
+# print the header info
+print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
+
+# print the PhosphoSite regulatory data
+print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
+
+# print the sample names
+for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
+
+# print the kinases and groups
+for my $i (0 .. $#kinases_observed) {
+	my $temp = $kinases_observed[$i]."_NetworKIN";
+	print OUT "$temp\t";
+}
+for my $i (0 .. $#motif_sequence) {
+	print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t";
+}
+for my $i (0 .. $#kinases_PhosphoSite) {
+	my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite";
+	if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
+	if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
+}
+
+
+foreach my $peptide (keys %data) {
+	# Print the peptide itself
+	print OUT "$peptide\t";
+	
+	# skip over failed matches
+	if ($matched_sequences{$peptide} eq "Failed match") {
+		print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
+	} else {
+		# Print just the protein description
+		my @description = ();
+		for $i (0 .. $#{$names{$peptide}}) {
+			my $long_name = $names{$peptide}[$i];
+			my @naming_parts = split(/\sOS/, $long_name);
+			my @front_half = split(/\s/, $naming_parts[0]);
+			push(@description, join(" ", @front_half[1..($#front_half)]));
+		}
+		print OUT join(" /// ", @description), "\t";
+		
+		# Print just the gene name
+		my @gene = ();
+		my %seen = ();
+		for $i (0 .. $#{$names{$peptide}}) {
+			my $tmp_gene = $names{$peptide}[$i];
+			$tmp_gene =~ s/^.*GN=//;
+			$tmp_gene =~ s/\s.*//;
+			if (!exists($seen{$tmp_gene})) {
+				push(@gene, $tmp_gene);
+				$seen{$tmp_gene} = $tmp_gene;
+			}
+		}
+		print OUT join(" /// ", @gene), "\t";
+		
+		# print the entire names
+		print OUT join(" /// ", @{$names{$peptide}}), "\t";
+		
+		# Print the phospho-residues
+		for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+			if ($i < $#{ $matched_sequences{$peptide} }) {
+				@tmp_p_residues = @{$p_residues{$peptide}{$i}};
+				for my $j (0 .. $#tmp_p_residues) {
+					if ($j < $#tmp_p_residues) {
+						my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;		# added 12.05.2012 for Justin's data
+						print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+					}
+					elsif ($j == $#tmp_p_residues) {
+						my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;		# added 12.05.2012 for Justin's data
+						print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+					}
+				}
+			}
+			elsif ($i == $#{ $matched_sequences{$peptide} }) {
+				@tmp_p_residues = @{$p_residues{$peptide}{$i}};
+				for my $j (0 .. $#tmp_p_residues) {
+					if ($j < $#tmp_p_residues) {
+						my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;		# added 12.05.2012 for Justin's data
+						print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+					}
+					elsif ($j == $#tmp_p_residues) {
+						my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;		# added 12.05.2012 for Justin's data
+						print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t";
+					}
+				}
+			}
+		}
+		
+		# Print the UNIQUE phospho-motifs
+		print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
+		
+		# Print the accessions
+		print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
+		
+		# print ALL motifs with residue numbers
+		print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
+	}
+	
+	# Print the PhosphoSite regulatory data
+	
+	if (exists($domain_2{$peptide})) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; }
+	if (exists($ON_FUNCTION_2{$peptide})) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; }
+	if (exists($ON_PROCESS_2{$peptide})) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; }
+	if (exists($ON_PROT_INTERACT_2{$peptide})) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
+	if (exists($ON_OTHER_INTERACT_2{$peptide})) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
+	if (exists($notes_2{$peptide})) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; }
+	
+	# Print the data
+	@tmp_data = (); foreach (@{$data{$peptide}}) { push(@tmp_data, $_); }
+	print OUT join("\t", @tmp_data), "\t";
+	
+	# print the kinase-substrate data
+	for my $i (0 .. $#kinases_observed) {
+		if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
+			print OUT "X\t";
+		}
+		else { print OUT "\t";}
+	}
+	for my $i (0 .. $#motif_sequence) {
+		if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
+			print OUT "X\t";
+			# print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n";			#debug
+		}
+		else { print OUT "\t";}
+	}
+	for my $i (0 .. $#kinases_PhosphoSite) {
+		if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) {
+			print OUT "X\t";
+		}
+		elsif (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i == $#kinases_PhosphoSite)) {
+			print OUT "X\n";
+		}
+		elsif (!exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) {
+			print OUT "\t";
+		}
+		else {
+			print OUT "\n";
+		}
+	}
+}
+
+close OUT;
+
+
+my @timeData = localtime(time);
+print "\nFinished $timeData[2]:$timeData[1]:$timeData[0]\n\n";
+
+###############################################################################################################################
+sub replace_pSpTpY {
+	my ($formatted_sequence, $phospho_type) = @_;
+	if ($phospho_type eq 'y') {
+		$formatted_sequence =~ s/pS/S/g;
+		$formatted_sequence =~ s/pT/T/g;
+		$formatted_sequence =~ s/pY/y/g;
+		}
+	elsif ($phospho_type eq "sty") {
+		$formatted_sequence =~ s/pS/s/g;
+		$formatted_sequence =~ s/pT/t/g;
+		$formatted_sequence =~ s/pY/y/g;
+		}
+	$formatted_sequence;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phosphopeptide_kinase_mapping.xml	Thu Nov 04 19:37:36 2021 +0000
@@ -0,0 +1,40 @@
+<tool id="phosphopeptide_kinase_mapping" name="PhosphoPeptide Upstream Kinase Mapping" version="0.1.0" python_template_version="3.5">
+    <requirements>
+        <requirement type="package" version="5.22.0.1">perl</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        perl '$__tool_directory__/PhosphoPeptide_Upstream_Kinase_Mapping.pl'
+        -i '$phophopeptide_intensities'
+        -f '$protein_fasta'
+        -n '$networkin'
+        -m '$p_sty_motifs'
+        -p '$psp_kinase_substrate'
+        -r '$psp_regulatory_sites'
+        -P $phospho_type
+        -F $merge_function
+        -o 'phophopeptides.tsv'
+    ]]></command>
+    <inputs>
+        <param name="phophopeptide_intensities" type="data" format="tabular" label="Phosphopeptide intensity file" 
+            help="generate by MaxQuant Phosphopeptide Intensity"/>
+        <param name="protein_fasta" type="data" format="fasta" label="Protein fasta database" />
+        <param name="networkin" type="data" format="tabular" label="NetworKIN file" />
+        <param name="p_sty_motifs" type="data" format="tabular" label="pSTY_Motifs file" />
+        <param name="psp_kinase_substrate" type="data" format="tabular" label="PSP_Kinase_Substrate_Dataset" />
+        <param name="psp_regulatory_sites" type="data" format="tabular" label="PSP_Regulatory_sites" />
+        <param name="phospho_type" type="select" label="phospho type">
+            <option value="sty" selected="true">sty</option>
+            <option value="y">y</option>
+        </param>
+        <param name="merge_function" type="select" label="Merge identical phospho site intensities by">
+            <option value="sum" selected="true">sum</option>
+            <option value="average">average</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular" from_work_dir="phophopeptides.tsv"/>
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>