# HG changeset patch # User galaxyp # Date 1636054656 0 # Node ID 56658e35798d3169148d2a29490d69818b29633a "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61" diff -r 000000000000 -r 56658e35798d PhosphoPeptide_Upstream_Kinase_Mapping.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Thu Nov 04 19:37:36 2021 +0000 @@ -0,0 +1,957 @@ +#!/usr/local/bin/perl + +use Getopt::Std; + +############################################################################################################################### +# perl Kinase_enrichment_analysis_complete_v0.pl +# +# Nick Graham, USC +# 2016-02-27 +# +# Built from scripts written by NG at UCLA in Tom Graeber's lab: +# CombinePhosphoSites.pl +# Retrieve_p_motifs.pl +# NetworKIN_Motif_Finder_v7.pl +# +# Given a list of phospho-peptides, find protein information and upstream kinases. +# Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl +# +############################################################################################################################### + +my ($file_in, $average_or_sum, $file_out, $phospho_type); +my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function); + +########## +## opts ## +########## +## input files +# i : path to input outputfile_STEP2.txt +# f : path to fasta +# n : path to NetworKIN_201612_cutoffscore2.0.txt +# m : path to pSTY_Motifs.txt +# p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt +# r : path to 2017-03_PSP_Regulatory_sites.txt +## options +# P : phospho_type +# F : function +## output files +# o : path to output file + +sub usage() + { + print STDERR <<"EOH"; + This program given a list of phospho-peptides, finds protein information and upstream kinases. + usage: $0 [-hvd] [-f file] + -h : this (help) message + -i : path to input outputfile_STEP2.txt + -f : path to fasta + -n : path to NetworKIN_201612_cutoffscore2.0.txt + -m : path to pSTY_Motifs.txt + -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt + -r : path to 2017-03_PSP_Regulatory_sites.txt + -P : phospho_type + -F : function + -o : path to output file + example: $0 +EOH + exit; + } + +my %opts; +getopts('i:f:n:m:p:r:o:P:F:h', \%opts) ; + +if (exists($opts{'h'})) { + usage(); +} +if (!exists($opts{'i'}) || !-e $opts{'i'}) { + die('Input File not found'); +} else { + $file_in = $opts{'i'}; +} +if (!exists($opts{'f'}) || !-e $opts{'f'}) { + die('Input Fasta File not found'); +} else { + $fasta_in = $opts{'f'}; +} +if (!exists($opts{'n'}) || !-e $opts{'n'}) { + die('Input NetworKIN File not found'); +} else { + $networkin_in = $opts{'n'}; +} +if (!exists($opts{'m'}) || !-e $opts{'m'}) { + die('Input pSTY_Motifs File not found'); +} else { + $motifs_in = $opts{'m'}; +} +if (!exists($opts{'p'}) || !-e $opts{'p'}) { + die('Input PSP_Kinase_Substrate_Dataset File not found'); +} else { + $PhosphoSite_in = $opts{'p'}; +} +if (!exists($opts{'r'}) || !-e $opts{'r'}) { + die('Input PSP_Regulatory_sites File not found'); +} else { + $PhosphoSite_molecular_function = $opts{'r'}; +} +if (exists($opts{'P'})) { + $phospho_type = $opts{'P'}; +} +else { + $phospho_type = "sty"; +} +if (exists($opts{'F'})) { + $average_or_sum = $opts{'F'}; +} +else { + $average_or_sum = "sum"; +} +if (exists($opts{'o'})) { + $file_out = $opts{'o'}; +} +else { + $file_out = "output.tsv"; +} + + +############################################################################################################################### +# Print the relevant file names to the screen +############################################################################################################################### +# print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; +print "\nData file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\n\n"; +print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PhosphoSite_in\nPhosphosite functional data: $PhosphoSite_molecular_function\nFASTA file: $fasta_in\n\n"; + + +print "\nPhospho-residues(s) = $phospho_type\n"; +if ($phospho_type ne 'y') { + if ($phospho_type ne 'sty') { + die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; + } +} + +############################################################################################################################### +# read the input data file +# average or sum identical phospho-sites, depending on the value of $average_or_sum +############################################################################################################################### + +open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; + +die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; + + +my (@samples, %data, @tmp_data, %n); +my $line = 0; + +while () { + chomp; + my @x = split(/\t/); + for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} + + # Read in the samples + if ($line == 0) { + for my $n (1 .. $#x) { + push (@samples, $x[$n]); + } + $line++; + } else { + # check whether we have already seen a phospho-peptide + if (exists($data{$x[0]})) { + if ($average_or_sum eq "sum") { # add the data + # unload the data + @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } + # add the new data and repack + for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } + $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } + + } elsif ($average_or_sum eq "average") { # average the data + # unload the data + @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } + # average with the new data and repack + for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } + $n{$x[0]}++; + $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } + } + } + # if the phospho-sequence has not been seen, save the data + else { + for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); } + $n{$x[0]} = 1; + } + } +} +close(IN); + + +############################################################################################################################### +# Search the FASTA database for phospho-sites and motifs +# +# based on Retrieve_p_peptide_motifs_v2.pl +############################################################################################################################### + + +############################################################################################################################### +# +# Read in the Data file: +# 1) make @p_peptides array as in the original file +# 2) make @non_p_peptides array w/o residue modifications (p, #, other) +# +############################################################################################################################### + +my (@p_peptides, @non_p_peptides); +foreach my $peptide (keys %data) { + $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g; + push (@p_peptides, $peptide); + $peptide =~ s/p//g; + push(@non_p_peptides, $peptide); +} + +############################################################################################################################### +# +# Read in the FASTA sequence file, save them to the @sequences array +# +############################################################################################################################### + +open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n"; + +my (@accessions, @names, @sequences); + +print "Reading FASTA file $fasta_in\n"; +while () { + chomp; + my (@x) = split(/\|/); + for my $i (0 .. $#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } + if ($x[0] =~ /^>/) { + $x[0] =~ s/\>//g; + push (@names, $x[2]); + push (@accessions, $x[1]); + } elsif ($x[0] =~ /^\w/) { + $sequences[$#accessions] = $sequences[$#accessions].$x[0]; + } +} +close IN1; +print "Done Reading FASTA file $fasta_in\n\n"; + + +############################################################################################################################### +# +# Match the non_p_peptides to the @sequences array: +# 1) Format the motifs +/- 10 residues around the phospho-site +# 2) Print the original data plus the phospho-motif to the output file +# +############################################################################################################################### + +print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n"; + +my (%matched_sequences, %accessions, %names, %sites, @tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); + +for my $j (0 .. $#p_peptides) { + @tmp_matches = (); @tmp_accessions = (); @tmp_names = (); @tmp_sites = (); + + #Find the matching protein sequence(s) for the peptide + my $site = -1; my $match = 0; + for my $k (0 .. $#sequences) { + $site = index($sequences[$k], $non_p_peptides[$j]); + if ($site != -1) { + push(@tmp_matches, $sequences[$k]); + push(@tmp_accessions, $accessions[$k]); + push(@tmp_names, $names[$k]); + push(@tmp_sites, $site); + $site = -1; $match++; + } + } + + if ($match == 0) { # Check to see if no match was found. Skip to next if no match found. + print "Warning: Failed match for $p_peptides[$j]\n"; + $matched_sequences{$p_peptides[$j]} = "Failed match"; + next; + } else { + $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ]; + $accessions{$p_peptides[$j]} = [ @tmp_accessions ]; + $names{$p_peptides[$j]} = [ @tmp_names ]; + $sites{$p_peptides[$j]} = [ @tmp_sites ]; + } +} + +my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); + +for my $peptide_to_match ( keys %matched_sequences ) { + next if ($peptide_to_match eq "Failed match"); + my @matches = @{$matched_sequences{$peptide_to_match}}; + @tmp_motifs_array = (); + for my $i (0 .. $#matches) { + + # Find the location of the phospo-site in the sequence(s) + $tmp_site = 0; my $offset = 0; + my $tmp_p_peptide = $peptide_to_match; + $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; + + # Find all phosphorylated residues in the p_peptide + @p_sites = (); + while ($tmp_site != -1) { + $tmp_site = index($tmp_p_peptide, 'p', $offset); + if ($tmp_site != -1) {push (@p_sites, $tmp_site);} + $offset = $tmp_site + 1; + $tmp_p_peptide =~ s/p//; + } + + @tmp_p_residues = (); + for my $l (0 .. $#p_sites) { + push (@tmp_p_residues, $p_sites[$l]+$sites{$peptide_to_match}[$i]); + + # Match the sequences around the phospho residues to find the motifs + my ($desired_residues_L, $desired_residues_R); + if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site + # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3 + $desired_residues_L = $tmp_p_residues[0]; + } + else { + $desired_residues_L = 10; + } + my $seq_length = length($matched_sequences{$peptide_to_match}[$i]); + if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site + $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1); + # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3 + # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n"; + } + else { + $desired_residues_R = 10; + } + + my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1; + $tmp_motif = substr($matched_sequences{$peptide_to_match}[$i], $tmp_p_residues[0] - $desired_residues_L, $total_length); + + # Put the "p" back in front of the appropriate phospho-residue(s). + my (@tmp_residues, $tmp_position); + for my $m (0 .. $#p_sites) { + # print "Line 183: $p_sites[$m]\n"; + if ($m == 0) {$tmp_position = $desired_residues_L;} + else {$tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];} + # print "Line 187: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\n"; + push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); + if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} + if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");} + if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");} + } + + $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g; + + # Comment out on 8.10.13 to remove the numbers from motifs + my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1; + my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1; + $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue; + push(@tmp_motifs_array, $tmp_motif); + $residues{$peptide_to_match}{$i} = [ @tmp_residues ]; + $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ]; + } + $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ]; + } ### this bracket could be in the wrong place +} + + +############################################################################################################################### +# +# Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs +# +############################################################################################################################### + + + +############################################################################################################################### +# +# Read the NetworKIN_predictions file: +# 1) make a "kinases_observed" array +# 2) annotate the phospho-substrates with the appropriate kinase +# +############################################################################################################################### + +my (@kinases_observed, $kinases); +my ($p_sequence_kinase, $p_sequence, $kinase); + +open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n"; +print "\nReading the NetworKIN data: $networkin_in\n"; +while () { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. $#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; + } + next if ($x[0] eq "#substrate"); + if (exists ($kinases -> {$x[2]})) { + #do nothing + } + else { + $kinases -> {$x[2]} = $x[2]; + push (@kinases_observed, $x[2]); + } + my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha + if (exists($p_sequence_kinase -> {$tmp})) { + #do nothing + } + else { + $p_sequence_kinase -> {$tmp} = $tmp; + } +} +close IN1; + +############################################################################################################################### +# +# Read the Kinase motifs file: +# 1) make a "motif_sequence" array +# +############################################################################################################################### + +my (@motif_sequence, %motif_type, %motif_count); + +open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; +print "Reading the Motifs file: $motifs_in\n"; + +while () { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. 2) { + $x[$i] =~ s/\r//g; + $x[$i] =~ s/\n//g; + $x[$i] =~ s/\"//g; + } + if (exists ($motif_type{$x[1]})) { + $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; + } else { + $motif_type{$x[1]} = $x[2]; + $motif_count{$x[1]} = 0; + push (@motif_sequence, $x[1]); + } +} +close (IN2); + + +############################################################################################################################### +# 6.28.2011 +# Read PhosphoSite data: +# 1) make a "kinases_PhosphoSite" array +# 2) annotate the phospho-substrates with the appropriate kinase +# +############################################################################################################################### + + +my (@kinases_PhosphoSite, $kinases_PhosphoSite); +my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); + +my $line = 0; + +open (IN3, "$PhosphoSite_in") or die "I couldn't find $PhosphoSite_in\n"; +print "Reading the PhosphoSite data: $PhosphoSite_in\n"; + +while () { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. $#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; + } + if ($line != 0) { + if (exists ($kinases_PhosphoSite -> {$x[0]})) { + #do nothing + } + else { + $kinases_PhosphoSite -> {$x[0]} = $x[0]; + push (@kinases_PhosphoSite, $x[0]); + } + my $offset = 0; + # Replace the superfluous lower case s, t and y + my @lowercase = ('s','t','y'); + my @uppercase = ('S','T','Y'); + for my $k (0 .. 2) { + my $site = 0; + while ($site != -1) { + $site = index($x[11],$lowercase[$k], $offset); + if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);} + $offset = $site + 1; + } + } + my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2 + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + #do nothing + } + else { + $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp; + } + } + $line++; +} +close IN3; + + +############################################################################################################################### +# Read PhosphoSite regulatory site data: +# 1) make a "regulatory_sites_PhosphoSite" hash +# +############################################################################################################################### + + +my (%regulatory_sites_PhosphoSite); +my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes); + +my $line = 0; + +open (IN4, "$PhosphoSite_molecular_function") or die "I couldn't find $PhosphoSite_molecular_function\n"; +print "Reading the PhosphoSite regulatory site data: $PhosphoSite_molecular_function\n"; + +while () { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. $#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; + } + if ($line != 0) { + if (!exists($regulatory_sites_PhosphoSite{$x[9]})) { + $regulatory_sites_PhosphoSite{$x[9]} = $x[9]; + $domain{$x[9]} = $x[10]; + $ON_FUNCTION{$x[9]} = $x[11]; + $ON_PROCESS{$x[9]} = $x[12]; + $ON_PROT_INTERACT{$x[9]} = $x[13]; + $ON_OTHER_INTERACT{$x[9]} = $x[14]; + $notes{$x[9]} = $x[19]; + } + else { + # $domain + if ($domain{$x[9]} eq "") { + $domain{$x[9]} = $domain{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $domain{$x[9]} = $domain{$x[9]}." / ".$x[10]; + } + + # $ON_FUNCTION + if ($ON_FUNCTION{$x[9]} eq "") { + $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10]; + } + + # $ON_PROCESS + if ($ON_PROCESS{$x[9]} eq "") { + $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10]; + } + + # $ON_PROT_INTERACT + if ($ON_PROT_INTERACT{$x[9]} eq "") { + $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10]; + } + + # $ON_OTHER_INTERACT + if ($ON_OTHER_INTERACT{$x[9]} eq "") { + $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10]; + } + + # $notes + if ($notes{$x[9]} eq "") { + $notes{$x[9]} = $notes{$x[10]}; + } elsif ($x[10] eq "") { + # do nothing + } + else { + $notes{$x[9]} = $notes{$x[9]}." / ".$x[10]; + } + + } + } +$line++; +} +close IN4; + +############################################################################################################################### +# +# Read the data file: +# 1) find sequences that match the NetworKIN predictions +# 2) find motifs that match the observed sequences +# +############################################################################################################################### + +my ($formatted_sequence, %unique_motifs); +my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); +my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2); + +foreach my $peptide (keys %data) { + # find the unique phospho-motifs for this $peptide + my @all_motifs = (); + for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { + my $tmp_motif = $p_motifs{$peptide}[$i]; + push(@all_motifs, $tmp_motif); + } + for my $j (0 .. $#all_motifs) { + $all_motifs[$j] =~ s/\d+-\[\s//; $all_motifs[$j] =~ s/\s\]\-\d+//; + } + + my %seen = (); + foreach my $a (@all_motifs) { + if (exists($seen{$a})) { next; } else { + push(@{$unique_motifs{$peptide}}, $a); + $seen{$a} = 1; + } + } + + # count the number of phospo-sites in the motif + my $number_pY = 0; + my $number_pSTY = 0; + if ($phospho_type eq 'y') {while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {$number_pY++;}} + if ($phospho_type eq 'sty') {while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {$number_pSTY++;}} + + # search each of the unique motifs for matches + for my $i (0 .. $#{$unique_motifs{$peptide}}) { + my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; + if (($number_pY == 1) || ($number_pSTY == 1)) { + my $seq_plus5aa = 0; + my $seq_plus7aa = 0; + $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); + if ($phospho_type eq 'y') { + $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; + $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; + } + elsif ($phospho_type eq "sty") { + $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; + $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; + } + for my $i (0 .. $#kinases_observed) { + my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha + if (exists($p_sequence_kinase -> {$tmp})) { + $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; + } + } + for my $i (0 .. $#motif_sequence) { + if ($peptide =~ /$motif_sequence[$i]/) { + $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + } + } + for my $i (0 .. $#kinases_PhosphoSite) { + my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; + } + } + if (exists($regulatory_sites_PhosphoSite{$seq_plus7aa})) { + $domain_2{$peptide} = $domain{$seq_plus7aa}; + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; + $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; + $notes_2{$peptide} = $notes{$seq_plus7aa}; + } + } + elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 + $formatted_sequence = $tmp_motif; + #Create the sequences with only one phosphorylation site + #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 + + my (@sites, $offset, $next_p_site); + $sites[0] = index($tmp_motif, "p"); + $offset = $sites[0] + 1; + while ($next_p_site != -1) { + $next_p_site = index($tmp_motif, "p", $offset); + if ($next_p_site != -1) { + push (@sites, $next_p_site); + } + $offset = $next_p_site+1; + } + + my @pSTY_sequences; + for my $n (0 .. $#sites) { + $pSTY_sequences[$n] = $tmp_motif; + for (my $m = $#sites; $m >= 0; $m--) { + if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} + } + } + + my @formatted_sequences; + for my $k (0 .. $#sites) { + $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); + } + + for my $k (0 .. $#formatted_sequences) { + if ($phospho_type eq 'y') { + $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence[$k]))[1]; + $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence[$k]))[1]; + } + elsif ($phospho_type eq "sty") { + $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence[$k]))[1]; + $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence[$k]))[1]; + } + for my $i (0 .. $#kinases_observed) { + my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha + if (exists($p_sequence_kinase -> {$tmp})) { + $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; + } + } + for my $i (0 .. $#motif_sequence) { + if ($pSTY_sequence =~ /$motif_sequence[$i]/) { + $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + } + } + for my $i (0 .. $#kinases_PhosphoSite) { + my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; + } + } + if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { + # $domain + if ($domain_2{$peptide} eq "") { + $domain_2{$peptide} = $domain{$seq_plus7aa}; + } + elsif ($domain{$seq_plus7aa} eq "") { + # do nothing + } + else { + $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; + } + + # $ON_FUNCTION_2 + if ($ON_FUNCTION_2{$peptide} eq "") { + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; + } + elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; + } + + # $ON_PROCESS_2 + if ($ON_PROCESS_2{$peptide} eq "") { + $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; + } + elsif ($ON_PROCESS{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; + } + + # $ON_PROT_INTERACT_2 + if ($ON_PROT_INTERACT_2{$peptide} eq "") { + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; + } + elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; + } + + # $ON_OTHER_INTERACT_2 + if ($ON_OTHER_INTERACT_2{$peptide} eq "") { + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; + } + elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; + } + + # $notes_2 + if ($notes_2{$peptide} eq "") { + $notes_2{$peptide} = $notes{$seq_plus7aa}; + } + elsif ($notes{$seq_plus7aa} eq "") { + # do nothing + } + else { + $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; + } + } + } + } + } +} + + +############################################################################################################################### +# +# Print to the output file +# +############################################################################################################################### +open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; + +# print the header info +print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; + +# print the PhosphoSite regulatory data +print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; + +# print the sample names +for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } + +# print the kinases and groups +for my $i (0 .. $#kinases_observed) { + my $temp = $kinases_observed[$i]."_NetworKIN"; + print OUT "$temp\t"; +} +for my $i (0 .. $#motif_sequence) { + print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; +} +for my $i (0 .. $#kinases_PhosphoSite) { + my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; + if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } + if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } +} + + +foreach my $peptide (keys %data) { + # Print the peptide itself + print OUT "$peptide\t"; + + # skip over failed matches + if ($matched_sequences{$peptide} eq "Failed match") { + print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; + } else { + # Print just the protein description + my @description = (); + for $i (0 .. $#{$names{$peptide}}) { + my $long_name = $names{$peptide}[$i]; + my @naming_parts = split(/\sOS/, $long_name); + my @front_half = split(/\s/, $naming_parts[0]); + push(@description, join(" ", @front_half[1..($#front_half)])); + } + print OUT join(" /// ", @description), "\t"; + + # Print just the gene name + my @gene = (); + my %seen = (); + for $i (0 .. $#{$names{$peptide}}) { + my $tmp_gene = $names{$peptide}[$i]; + $tmp_gene =~ s/^.*GN=//; + $tmp_gene =~ s/\s.*//; + if (!exists($seen{$tmp_gene})) { + push(@gene, $tmp_gene); + $seen{$tmp_gene} = $tmp_gene; + } + } + print OUT join(" /// ", @gene), "\t"; + + # print the entire names + print OUT join(" /// ", @{$names{$peptide}}), "\t"; + + # Print the phospho-residues + for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { + if ($i < $#{ $matched_sequences{$peptide} }) { + @tmp_p_residues = @{$p_residues{$peptide}{$i}}; + for my $j (0 .. $#tmp_p_residues) { + if ($j < $#tmp_p_residues) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; + } + elsif ($j == $#tmp_p_residues) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; + } + } + } + elsif ($i == $#{ $matched_sequences{$peptide} }) { + @tmp_p_residues = @{$p_residues{$peptide}{$i}}; + for my $j (0 .. $#tmp_p_residues) { + if ($j < $#tmp_p_residues) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; + } + elsif ($j == $#tmp_p_residues) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; + } + } + } + } + + # Print the UNIQUE phospho-motifs + print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; + + # Print the accessions + print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; + + # print ALL motifs with residue numbers + print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; + } + + # Print the PhosphoSite regulatory data + + if (exists($domain_2{$peptide})) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; } + if (exists($ON_FUNCTION_2{$peptide})) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; } + if (exists($ON_PROCESS_2{$peptide})) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; } + if (exists($ON_PROT_INTERACT_2{$peptide})) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } + if (exists($ON_OTHER_INTERACT_2{$peptide})) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } + if (exists($notes_2{$peptide})) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; } + + # Print the data + @tmp_data = (); foreach (@{$data{$peptide}}) { push(@tmp_data, $_); } + print OUT join("\t", @tmp_data), "\t"; + + # print the kinase-substrate data + for my $i (0 .. $#kinases_observed) { + if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { + print OUT "X\t"; + } + else { print OUT "\t";} + } + for my $i (0 .. $#motif_sequence) { + if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { + print OUT "X\t"; + # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug + } + else { print OUT "\t";} + } + for my $i (0 .. $#kinases_PhosphoSite) { + if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { + print OUT "X\t"; + } + elsif (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i == $#kinases_PhosphoSite)) { + print OUT "X\n"; + } + elsif (!exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { + print OUT "\t"; + } + else { + print OUT "\n"; + } + } +} + +close OUT; + + +my @timeData = localtime(time); +print "\nFinished $timeData[2]:$timeData[1]:$timeData[0]\n\n"; + +############################################################################################################################### +sub replace_pSpTpY { + my ($formatted_sequence, $phospho_type) = @_; + if ($phospho_type eq 'y') { + $formatted_sequence =~ s/pS/S/g; + $formatted_sequence =~ s/pT/T/g; + $formatted_sequence =~ s/pY/y/g; + } + elsif ($phospho_type eq "sty") { + $formatted_sequence =~ s/pS/s/g; + $formatted_sequence =~ s/pT/t/g; + $formatted_sequence =~ s/pY/y/g; + } + $formatted_sequence; +} + diff -r 000000000000 -r 56658e35798d phosphopeptide_kinase_mapping.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phosphopeptide_kinase_mapping.xml Thu Nov 04 19:37:36 2021 +0000 @@ -0,0 +1,40 @@ + + + perl + + + + + + + + + + + + + + + + + + + + + + +