Mercurial > repos > galaxyp > phosphopeptide_kinase_mapping
view PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:56658e35798d draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61"
author | galaxyp |
---|---|
date | Thu, 04 Nov 2021 19:37:36 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/local/bin/perl use Getopt::Std; ############################################################################################################################### # perl Kinase_enrichment_analysis_complete_v0.pl # # Nick Graham, USC # 2016-02-27 # # Built from scripts written by NG at UCLA in Tom Graeber's lab: # CombinePhosphoSites.pl # Retrieve_p_motifs.pl # NetworKIN_Motif_Finder_v7.pl # # Given a list of phospho-peptides, find protein information and upstream kinases. # Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl # ############################################################################################################################### my ($file_in, $average_or_sum, $file_out, $phospho_type); my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function); ########## ## opts ## ########## ## input files # i : path to input outputfile_STEP2.txt # f : path to fasta # n : path to NetworKIN_201612_cutoffscore2.0.txt # m : path to pSTY_Motifs.txt # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt # r : path to 2017-03_PSP_Regulatory_sites.txt ## options # P : phospho_type # F : function ## output files # o : path to output file sub usage() { print STDERR <<"EOH"; This program given a list of phospho-peptides, finds protein information and upstream kinases. usage: $0 [-hvd] [-f file] -h : this (help) message -i : path to input outputfile_STEP2.txt -f : path to fasta -n : path to NetworKIN_201612_cutoffscore2.0.txt -m : path to pSTY_Motifs.txt -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt -r : path to 2017-03_PSP_Regulatory_sites.txt -P : phospho_type -F : function -o : path to output file example: $0 EOH exit; } my %opts; getopts('i:f:n:m:p:r:o:P:F:h', \%opts) ; if (exists($opts{'h'})) { usage(); } if (!exists($opts{'i'}) || !-e $opts{'i'}) { die('Input File not found'); } else { $file_in = $opts{'i'}; } if (!exists($opts{'f'}) || !-e $opts{'f'}) { die('Input Fasta File not found'); } else { $fasta_in = $opts{'f'}; } if (!exists($opts{'n'}) || !-e $opts{'n'}) { die('Input NetworKIN File not found'); } else { $networkin_in = $opts{'n'}; } if (!exists($opts{'m'}) || !-e $opts{'m'}) { die('Input pSTY_Motifs File not found'); } else { $motifs_in = $opts{'m'}; } if (!exists($opts{'p'}) || !-e $opts{'p'}) { die('Input PSP_Kinase_Substrate_Dataset File not found'); } else { $PhosphoSite_in = $opts{'p'}; } if (!exists($opts{'r'}) || !-e $opts{'r'}) { die('Input PSP_Regulatory_sites File not found'); } else { $PhosphoSite_molecular_function = $opts{'r'}; } if (exists($opts{'P'})) { $phospho_type = $opts{'P'}; } else { $phospho_type = "sty"; } if (exists($opts{'F'})) { $average_or_sum = $opts{'F'}; } else { $average_or_sum = "sum"; } if (exists($opts{'o'})) { $file_out = $opts{'o'}; } else { $file_out = "output.tsv"; } ############################################################################################################################### # Print the relevant file names to the screen ############################################################################################################################### # print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; print "\nData file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\n\n"; print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PhosphoSite_in\nPhosphosite functional data: $PhosphoSite_molecular_function\nFASTA file: $fasta_in\n\n"; print "\nPhospho-residues(s) = $phospho_type\n"; if ($phospho_type ne 'y') { if ($phospho_type ne 'sty') { die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; } } ############################################################################################################################### # read the input data file # average or sum identical phospho-sites, depending on the value of $average_or_sum ############################################################################################################################### open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; my (@samples, %data, @tmp_data, %n); my $line = 0; while (<IN>) { chomp; my @x = split(/\t/); for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} # Read in the samples if ($line == 0) { for my $n (1 .. $#x) { push (@samples, $x[$n]); } $line++; } else { # check whether we have already seen a phospho-peptide if (exists($data{$x[0]})) { if ($average_or_sum eq "sum") { # add the data # unload the data @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } # add the new data and repack for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } } elsif ($average_or_sum eq "average") { # average the data # unload the data @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } # average with the new data and repack for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } $n{$x[0]}++; $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } } } # if the phospho-sequence has not been seen, save the data else { for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); } $n{$x[0]} = 1; } } } close(IN); ############################################################################################################################### # Search the FASTA database for phospho-sites and motifs # # based on Retrieve_p_peptide_motifs_v2.pl ############################################################################################################################### ############################################################################################################################### # # Read in the Data file: # 1) make @p_peptides array as in the original file # 2) make @non_p_peptides array w/o residue modifications (p, #, other) # ############################################################################################################################### my (@p_peptides, @non_p_peptides); foreach my $peptide (keys %data) { $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g; push (@p_peptides, $peptide); $peptide =~ s/p//g; push(@non_p_peptides, $peptide); } ############################################################################################################################### # # Read in the FASTA sequence file, save them to the @sequences array # ############################################################################################################################### open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n"; my (@accessions, @names, @sequences); print "Reading FASTA file $fasta_in\n"; while (<IN1>) { chomp; my (@x) = split(/\|/); for my $i (0 .. $#x) { $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } if ($x[0] =~ /^>/) { $x[0] =~ s/\>//g; push (@names, $x[2]); push (@accessions, $x[1]); } elsif ($x[0] =~ /^\w/) { $sequences[$#accessions] = $sequences[$#accessions].$x[0]; } } close IN1; print "Done Reading FASTA file $fasta_in\n\n"; ############################################################################################################################### # # Match the non_p_peptides to the @sequences array: # 1) Format the motifs +/- 10 residues around the phospho-site # 2) Print the original data plus the phospho-motif to the output file # ############################################################################################################################### print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n"; my (%matched_sequences, %accessions, %names, %sites, @tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); for my $j (0 .. $#p_peptides) { @tmp_matches = (); @tmp_accessions = (); @tmp_names = (); @tmp_sites = (); #Find the matching protein sequence(s) for the peptide my $site = -1; my $match = 0; for my $k (0 .. $#sequences) { $site = index($sequences[$k], $non_p_peptides[$j]); if ($site != -1) { push(@tmp_matches, $sequences[$k]); push(@tmp_accessions, $accessions[$k]); push(@tmp_names, $names[$k]); push(@tmp_sites, $site); $site = -1; $match++; } } if ($match == 0) { # Check to see if no match was found. Skip to next if no match found. print "Warning: Failed match for $p_peptides[$j]\n"; $matched_sequences{$p_peptides[$j]} = "Failed match"; next; } else { $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ]; $accessions{$p_peptides[$j]} = [ @tmp_accessions ]; $names{$p_peptides[$j]} = [ @tmp_names ]; $sites{$p_peptides[$j]} = [ @tmp_sites ]; } } my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); for my $peptide_to_match ( keys %matched_sequences ) { next if ($peptide_to_match eq "Failed match"); my @matches = @{$matched_sequences{$peptide_to_match}}; @tmp_motifs_array = (); for my $i (0 .. $#matches) { # Find the location of the phospo-site in the sequence(s) $tmp_site = 0; my $offset = 0; my $tmp_p_peptide = $peptide_to_match; $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; # Find all phosphorylated residues in the p_peptide @p_sites = (); while ($tmp_site != -1) { $tmp_site = index($tmp_p_peptide, 'p', $offset); if ($tmp_site != -1) {push (@p_sites, $tmp_site);} $offset = $tmp_site + 1; $tmp_p_peptide =~ s/p//; } @tmp_p_residues = (); for my $l (0 .. $#p_sites) { push (@tmp_p_residues, $p_sites[$l]+$sites{$peptide_to_match}[$i]); # Match the sequences around the phospho residues to find the motifs my ($desired_residues_L, $desired_residues_R); if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3 $desired_residues_L = $tmp_p_residues[0]; } else { $desired_residues_L = 10; } my $seq_length = length($matched_sequences{$peptide_to_match}[$i]); if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1); # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3 # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n"; } else { $desired_residues_R = 10; } my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1; $tmp_motif = substr($matched_sequences{$peptide_to_match}[$i], $tmp_p_residues[0] - $desired_residues_L, $total_length); # Put the "p" back in front of the appropriate phospho-residue(s). my (@tmp_residues, $tmp_position); for my $m (0 .. $#p_sites) { # print "Line 183: $p_sites[$m]\n"; if ($m == 0) {$tmp_position = $desired_residues_L;} else {$tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];} # print "Line 187: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\n"; push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");} if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");} } $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g; # Comment out on 8.10.13 to remove the numbers from motifs my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1; my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1; $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue; push(@tmp_motifs_array, $tmp_motif); $residues{$peptide_to_match}{$i} = [ @tmp_residues ]; $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ]; } $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ]; } ### this bracket could be in the wrong place } ############################################################################################################################### # # Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs # ############################################################################################################################### ############################################################################################################################### # # Read the NetworKIN_predictions file: # 1) make a "kinases_observed" array # 2) annotate the phospho-substrates with the appropriate kinase # ############################################################################################################################### my (@kinases_observed, $kinases); my ($p_sequence_kinase, $p_sequence, $kinase); open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n"; print "\nReading the NetworKIN data: $networkin_in\n"; while (<IN1>) { chomp; my (@x) = split(/\t/); for my $i (0 .. $#x) { $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } next if ($x[0] eq "#substrate"); if (exists ($kinases -> {$x[2]})) { #do nothing } else { $kinases -> {$x[2]} = $x[2]; push (@kinases_observed, $x[2]); } my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha if (exists($p_sequence_kinase -> {$tmp})) { #do nothing } else { $p_sequence_kinase -> {$tmp} = $tmp; } } close IN1; ############################################################################################################################### # # Read the Kinase motifs file: # 1) make a "motif_sequence" array # ############################################################################################################################### my (@motif_sequence, %motif_type, %motif_count); open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; print "Reading the Motifs file: $motifs_in\n"; while (<IN2>) { chomp; my (@x) = split(/\t/); for my $i (0 .. 2) { $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } if (exists ($motif_type{$x[1]})) { $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; } else { $motif_type{$x[1]} = $x[2]; $motif_count{$x[1]} = 0; push (@motif_sequence, $x[1]); } } close (IN2); ############################################################################################################################### # 6.28.2011 # Read PhosphoSite data: # 1) make a "kinases_PhosphoSite" array # 2) annotate the phospho-substrates with the appropriate kinase # ############################################################################################################################### my (@kinases_PhosphoSite, $kinases_PhosphoSite); my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); my $line = 0; open (IN3, "$PhosphoSite_in") or die "I couldn't find $PhosphoSite_in\n"; print "Reading the PhosphoSite data: $PhosphoSite_in\n"; while (<IN3>) { chomp; my (@x) = split(/\t/); for my $i (0 .. $#x) { $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } if ($line != 0) { if (exists ($kinases_PhosphoSite -> {$x[0]})) { #do nothing } else { $kinases_PhosphoSite -> {$x[0]} = $x[0]; push (@kinases_PhosphoSite, $x[0]); } my $offset = 0; # Replace the superfluous lower case s, t and y my @lowercase = ('s','t','y'); my @uppercase = ('S','T','Y'); for my $k (0 .. 2) { my $site = 0; while ($site != -1) { $site = index($x[11],$lowercase[$k], $offset); if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);} $offset = $site + 1; } } my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { #do nothing } else { $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp; } } $line++; } close IN3; ############################################################################################################################### # Read PhosphoSite regulatory site data: # 1) make a "regulatory_sites_PhosphoSite" hash # ############################################################################################################################### my (%regulatory_sites_PhosphoSite); my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes); my $line = 0; open (IN4, "$PhosphoSite_molecular_function") or die "I couldn't find $PhosphoSite_molecular_function\n"; print "Reading the PhosphoSite regulatory site data: $PhosphoSite_molecular_function\n"; while (<IN4>) { chomp; my (@x) = split(/\t/); for my $i (0 .. $#x) { $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } if ($line != 0) { if (!exists($regulatory_sites_PhosphoSite{$x[9]})) { $regulatory_sites_PhosphoSite{$x[9]} = $x[9]; $domain{$x[9]} = $x[10]; $ON_FUNCTION{$x[9]} = $x[11]; $ON_PROCESS{$x[9]} = $x[12]; $ON_PROT_INTERACT{$x[9]} = $x[13]; $ON_OTHER_INTERACT{$x[9]} = $x[14]; $notes{$x[9]} = $x[19]; } else { # $domain if ($domain{$x[9]} eq "") { $domain{$x[9]} = $domain{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $domain{$x[9]} = $domain{$x[9]}." / ".$x[10]; } # $ON_FUNCTION if ($ON_FUNCTION{$x[9]} eq "") { $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10]; } # $ON_PROCESS if ($ON_PROCESS{$x[9]} eq "") { $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10]; } # $ON_PROT_INTERACT if ($ON_PROT_INTERACT{$x[9]} eq "") { $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10]; } # $ON_OTHER_INTERACT if ($ON_OTHER_INTERACT{$x[9]} eq "") { $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10]; } # $notes if ($notes{$x[9]} eq "") { $notes{$x[9]} = $notes{$x[10]}; } elsif ($x[10] eq "") { # do nothing } else { $notes{$x[9]} = $notes{$x[9]}." / ".$x[10]; } } } $line++; } close IN4; ############################################################################################################################### # # Read the data file: # 1) find sequences that match the NetworKIN predictions # 2) find motifs that match the observed sequences # ############################################################################################################################### my ($formatted_sequence, %unique_motifs); my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2); foreach my $peptide (keys %data) { # find the unique phospho-motifs for this $peptide my @all_motifs = (); for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { my $tmp_motif = $p_motifs{$peptide}[$i]; push(@all_motifs, $tmp_motif); } for my $j (0 .. $#all_motifs) { $all_motifs[$j] =~ s/\d+-\[\s//; $all_motifs[$j] =~ s/\s\]\-\d+//; } my %seen = (); foreach my $a (@all_motifs) { if (exists($seen{$a})) { next; } else { push(@{$unique_motifs{$peptide}}, $a); $seen{$a} = 1; } } # count the number of phospo-sites in the motif my $number_pY = 0; my $number_pSTY = 0; if ($phospho_type eq 'y') {while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {$number_pY++;}} if ($phospho_type eq 'sty') {while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {$number_pSTY++;}} # search each of the unique motifs for matches for my $i (0 .. $#{$unique_motifs{$peptide}}) { my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; if (($number_pY == 1) || ($number_pSTY == 1)) { my $seq_plus5aa = 0; my $seq_plus7aa = 0; $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); if ($phospho_type eq 'y') { $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; } elsif ($phospho_type eq "sty") { $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; } for my $i (0 .. $#kinases_observed) { my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha if (exists($p_sequence_kinase -> {$tmp})) { $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; } } for my $i (0 .. $#motif_sequence) { if ($peptide =~ /$motif_sequence[$i]/) { $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; } } for my $i (0 .. $#kinases_PhosphoSite) { my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; } } if (exists($regulatory_sites_PhosphoSite{$seq_plus7aa})) { $domain_2{$peptide} = $domain{$seq_plus7aa}; $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; $notes_2{$peptide} = $notes{$seq_plus7aa}; } } elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 $formatted_sequence = $tmp_motif; #Create the sequences with only one phosphorylation site #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 my (@sites, $offset, $next_p_site); $sites[0] = index($tmp_motif, "p"); $offset = $sites[0] + 1; while ($next_p_site != -1) { $next_p_site = index($tmp_motif, "p", $offset); if ($next_p_site != -1) { push (@sites, $next_p_site); } $offset = $next_p_site+1; } my @pSTY_sequences; for my $n (0 .. $#sites) { $pSTY_sequences[$n] = $tmp_motif; for (my $m = $#sites; $m >= 0; $m--) { if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} } } my @formatted_sequences; for my $k (0 .. $#sites) { $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); } for my $k (0 .. $#formatted_sequences) { if ($phospho_type eq 'y') { $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence[$k]))[1]; $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence[$k]))[1]; } elsif ($phospho_type eq "sty") { $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence[$k]))[1]; $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence[$k]))[1]; } for my $i (0 .. $#kinases_observed) { my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha if (exists($p_sequence_kinase -> {$tmp})) { $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; } } for my $i (0 .. $#motif_sequence) { if ($pSTY_sequence =~ /$motif_sequence[$i]/) { $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; } } for my $i (0 .. $#kinases_PhosphoSite) { my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; } } if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { # $domain if ($domain_2{$peptide} eq "") { $domain_2{$peptide} = $domain{$seq_plus7aa}; } elsif ($domain{$seq_plus7aa} eq "") { # do nothing } else { $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; } # $ON_FUNCTION_2 if ($ON_FUNCTION_2{$peptide} eq "") { $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; } elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { # do nothing } else { $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; } # $ON_PROCESS_2 if ($ON_PROCESS_2{$peptide} eq "") { $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; } elsif ($ON_PROCESS{$seq_plus7aa} eq "") { # do nothing } else { $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; } # $ON_PROT_INTERACT_2 if ($ON_PROT_INTERACT_2{$peptide} eq "") { $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; } elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { # do nothing } else { $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; } # $ON_OTHER_INTERACT_2 if ($ON_OTHER_INTERACT_2{$peptide} eq "") { $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; } elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { # do nothing } else { $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; } # $notes_2 if ($notes_2{$peptide} eq "") { $notes_2{$peptide} = $notes{$seq_plus7aa}; } elsif ($notes{$seq_plus7aa} eq "") { # do nothing } else { $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; } } } } } } ############################################################################################################################### # # Print to the output file # ############################################################################################################################### open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; # print the header info print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; # print the PhosphoSite regulatory data print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; # print the sample names for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } # print the kinases and groups for my $i (0 .. $#kinases_observed) { my $temp = $kinases_observed[$i]."_NetworKIN"; print OUT "$temp\t"; } for my $i (0 .. $#motif_sequence) { print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; } for my $i (0 .. $#kinases_PhosphoSite) { my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } } foreach my $peptide (keys %data) { # Print the peptide itself print OUT "$peptide\t"; # skip over failed matches if ($matched_sequences{$peptide} eq "Failed match") { print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; } else { # Print just the protein description my @description = (); for $i (0 .. $#{$names{$peptide}}) { my $long_name = $names{$peptide}[$i]; my @naming_parts = split(/\sOS/, $long_name); my @front_half = split(/\s/, $naming_parts[0]); push(@description, join(" ", @front_half[1..($#front_half)])); } print OUT join(" /// ", @description), "\t"; # Print just the gene name my @gene = (); my %seen = (); for $i (0 .. $#{$names{$peptide}}) { my $tmp_gene = $names{$peptide}[$i]; $tmp_gene =~ s/^.*GN=//; $tmp_gene =~ s/\s.*//; if (!exists($seen{$tmp_gene})) { push(@gene, $tmp_gene); $seen{$tmp_gene} = $tmp_gene; } } print OUT join(" /// ", @gene), "\t"; # print the entire names print OUT join(" /// ", @{$names{$peptide}}), "\t"; # Print the phospho-residues for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { if ($i < $#{ $matched_sequences{$peptide} }) { @tmp_p_residues = @{$p_residues{$peptide}{$i}}; for my $j (0 .. $#tmp_p_residues) { if ($j < $#tmp_p_residues) { my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; } elsif ($j == $#tmp_p_residues) { my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; } } } elsif ($i == $#{ $matched_sequences{$peptide} }) { @tmp_p_residues = @{$p_residues{$peptide}{$i}}; for my $j (0 .. $#tmp_p_residues) { if ($j < $#tmp_p_residues) { my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; } elsif ($j == $#tmp_p_residues) { my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; } } } } # Print the UNIQUE phospho-motifs print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; # Print the accessions print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; # print ALL motifs with residue numbers print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; } # Print the PhosphoSite regulatory data if (exists($domain_2{$peptide})) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; } if (exists($ON_FUNCTION_2{$peptide})) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; } if (exists($ON_PROCESS_2{$peptide})) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; } if (exists($ON_PROT_INTERACT_2{$peptide})) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } if (exists($ON_OTHER_INTERACT_2{$peptide})) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } if (exists($notes_2{$peptide})) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; } # Print the data @tmp_data = (); foreach (@{$data{$peptide}}) { push(@tmp_data, $_); } print OUT join("\t", @tmp_data), "\t"; # print the kinase-substrate data for my $i (0 .. $#kinases_observed) { if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { print OUT "X\t"; } else { print OUT "\t";} } for my $i (0 .. $#motif_sequence) { if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { print OUT "X\t"; # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug } else { print OUT "\t";} } for my $i (0 .. $#kinases_PhosphoSite) { if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { print OUT "X\t"; } elsif (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i == $#kinases_PhosphoSite)) { print OUT "X\n"; } elsif (!exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { print OUT "\t"; } else { print OUT "\n"; } } } close OUT; my @timeData = localtime(time); print "\nFinished $timeData[2]:$timeData[1]:$timeData[0]\n\n"; ############################################################################################################################### sub replace_pSpTpY { my ($formatted_sequence, $phospho_type) = @_; if ($phospho_type eq 'y') { $formatted_sequence =~ s/pS/S/g; $formatted_sequence =~ s/pT/T/g; $formatted_sequence =~ s/pY/y/g; } elsif ($phospho_type eq "sty") { $formatted_sequence =~ s/pS/s/g; $formatted_sequence =~ s/pT/t/g; $formatted_sequence =~ s/pY/y/g; } $formatted_sequence; }