comparison PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:56658e35798d draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61"
author galaxyp
date Thu, 04 Nov 2021 19:37:36 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:56658e35798d
1 #!/usr/local/bin/perl
2
3 use Getopt::Std;
4
5 ###############################################################################################################################
6 # perl Kinase_enrichment_analysis_complete_v0.pl
7 #
8 # Nick Graham, USC
9 # 2016-02-27
10 #
11 # Built from scripts written by NG at UCLA in Tom Graeber's lab:
12 # CombinePhosphoSites.pl
13 # Retrieve_p_motifs.pl
14 # NetworKIN_Motif_Finder_v7.pl
15 #
16 # Given a list of phospho-peptides, find protein information and upstream kinases.
17 # Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
18 #
19 ###############################################################################################################################
20
21 my ($file_in, $average_or_sum, $file_out, $phospho_type);
22 my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function);
23
24 ##########
25 ## opts ##
26 ##########
27 ## input files
28 # i : path to input outputfile_STEP2.txt
29 # f : path to fasta
30 # n : path to NetworKIN_201612_cutoffscore2.0.txt
31 # m : path to pSTY_Motifs.txt
32 # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
33 # r : path to 2017-03_PSP_Regulatory_sites.txt
34 ## options
35 # P : phospho_type
36 # F : function
37 ## output files
38 # o : path to output file
39
40 sub usage()
41 {
42 print STDERR <<"EOH";
43 This program given a list of phospho-peptides, finds protein information and upstream kinases.
44 usage: $0 [-hvd] [-f file]
45 -h : this (help) message
46 -i : path to input outputfile_STEP2.txt
47 -f : path to fasta
48 -n : path to NetworKIN_201612_cutoffscore2.0.txt
49 -m : path to pSTY_Motifs.txt
50 -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
51 -r : path to 2017-03_PSP_Regulatory_sites.txt
52 -P : phospho_type
53 -F : function
54 -o : path to output file
55 example: $0
56 EOH
57 exit;
58 }
59
60 my %opts;
61 getopts('i:f:n:m:p:r:o:P:F:h', \%opts) ;
62
63 if (exists($opts{'h'})) {
64 usage();
65 }
66 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
67 die('Input File not found');
68 } else {
69 $file_in = $opts{'i'};
70 }
71 if (!exists($opts{'f'}) || !-e $opts{'f'}) {
72 die('Input Fasta File not found');
73 } else {
74 $fasta_in = $opts{'f'};
75 }
76 if (!exists($opts{'n'}) || !-e $opts{'n'}) {
77 die('Input NetworKIN File not found');
78 } else {
79 $networkin_in = $opts{'n'};
80 }
81 if (!exists($opts{'m'}) || !-e $opts{'m'}) {
82 die('Input pSTY_Motifs File not found');
83 } else {
84 $motifs_in = $opts{'m'};
85 }
86 if (!exists($opts{'p'}) || !-e $opts{'p'}) {
87 die('Input PSP_Kinase_Substrate_Dataset File not found');
88 } else {
89 $PhosphoSite_in = $opts{'p'};
90 }
91 if (!exists($opts{'r'}) || !-e $opts{'r'}) {
92 die('Input PSP_Regulatory_sites File not found');
93 } else {
94 $PhosphoSite_molecular_function = $opts{'r'};
95 }
96 if (exists($opts{'P'})) {
97 $phospho_type = $opts{'P'};
98 }
99 else {
100 $phospho_type = "sty";
101 }
102 if (exists($opts{'F'})) {
103 $average_or_sum = $opts{'F'};
104 }
105 else {
106 $average_or_sum = "sum";
107 }
108 if (exists($opts{'o'})) {
109 $file_out = $opts{'o'};
110 }
111 else {
112 $file_out = "output.tsv";
113 }
114
115
116 ###############################################################################################################################
117 # Print the relevant file names to the screen
118 ###############################################################################################################################
119 # print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n";
120 print "\nData file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\n\n";
121 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PhosphoSite_in\nPhosphosite functional data: $PhosphoSite_molecular_function\nFASTA file: $fasta_in\n\n";
122
123
124 print "\nPhospho-residues(s) = $phospho_type\n";
125 if ($phospho_type ne 'y') {
126 if ($phospho_type ne 'sty') {
127 die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
128 }
129 }
130
131 ###############################################################################################################################
132 # read the input data file
133 # average or sum identical phospho-sites, depending on the value of $average_or_sum
134 ###############################################################################################################################
135
136 open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n";
137
138 die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
139
140
141 my (@samples, %data, @tmp_data, %n);
142 my $line = 0;
143
144 while (<IN>) {
145 chomp;
146 my @x = split(/\t/);
147 for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;}
148
149 # Read in the samples
150 if ($line == 0) {
151 for my $n (1 .. $#x) {
152 push (@samples, $x[$n]);
153 }
154 $line++;
155 } else {
156 # check whether we have already seen a phospho-peptide
157 if (exists($data{$x[0]})) {
158 if ($average_or_sum eq "sum") { # add the data
159 # unload the data
160 @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
161 # add the new data and repack
162 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
163 $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
164
165 } elsif ($average_or_sum eq "average") { # average the data
166 # unload the data
167 @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); }
168 # average with the new data and repack
169 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); }
170 $n{$x[0]}++;
171 $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
172 }
173 }
174 # if the phospho-sequence has not been seen, save the data
175 else {
176 for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
177 $n{$x[0]} = 1;
178 }
179 }
180 }
181 close(IN);
182
183
184 ###############################################################################################################################
185 # Search the FASTA database for phospho-sites and motifs
186 #
187 # based on Retrieve_p_peptide_motifs_v2.pl
188 ###############################################################################################################################
189
190
191 ###############################################################################################################################
192 #
193 # Read in the Data file:
194 # 1) make @p_peptides array as in the original file
195 # 2) make @non_p_peptides array w/o residue modifications (p, #, other)
196 #
197 ###############################################################################################################################
198
199 my (@p_peptides, @non_p_peptides);
200 foreach my $peptide (keys %data) {
201 $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g;
202 push (@p_peptides, $peptide);
203 $peptide =~ s/p//g;
204 push(@non_p_peptides, $peptide);
205 }
206
207 ###############################################################################################################################
208 #
209 # Read in the FASTA sequence file, save them to the @sequences array
210 #
211 ###############################################################################################################################
212
213 open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
214
215 my (@accessions, @names, @sequences);
216
217 print "Reading FASTA file $fasta_in\n";
218 while (<IN1>) {
219 chomp;
220 my (@x) = split(/\|/);
221 for my $i (0 .. $#x) {
222 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; }
223 if ($x[0] =~ /^>/) {
224 $x[0] =~ s/\>//g;
225 push (@names, $x[2]);
226 push (@accessions, $x[1]);
227 } elsif ($x[0] =~ /^\w/) {
228 $sequences[$#accessions] = $sequences[$#accessions].$x[0];
229 }
230 }
231 close IN1;
232 print "Done Reading FASTA file $fasta_in\n\n";
233
234
235 ###############################################################################################################################
236 #
237 # Match the non_p_peptides to the @sequences array:
238 # 1) Format the motifs +/- 10 residues around the phospho-site
239 # 2) Print the original data plus the phospho-motif to the output file
240 #
241 ###############################################################################################################################
242
243 print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n";
244
245 my (%matched_sequences, %accessions, %names, %sites, @tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites);
246
247 for my $j (0 .. $#p_peptides) {
248 @tmp_matches = (); @tmp_accessions = (); @tmp_names = (); @tmp_sites = ();
249
250 #Find the matching protein sequence(s) for the peptide
251 my $site = -1; my $match = 0;
252 for my $k (0 .. $#sequences) {
253 $site = index($sequences[$k], $non_p_peptides[$j]);
254 if ($site != -1) {
255 push(@tmp_matches, $sequences[$k]);
256 push(@tmp_accessions, $accessions[$k]);
257 push(@tmp_names, $names[$k]);
258 push(@tmp_sites, $site);
259 $site = -1; $match++;
260 }
261 }
262
263 if ($match == 0) { # Check to see if no match was found. Skip to next if no match found.
264 print "Warning: Failed match for $p_peptides[$j]\n";
265 $matched_sequences{$p_peptides[$j]} = "Failed match";
266 next;
267 } else {
268 $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
269 $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
270 $names{$p_peptides[$j]} = [ @tmp_names ];
271 $sites{$p_peptides[$j]} = [ @tmp_sites ];
272 }
273 }
274
275 my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
276
277 for my $peptide_to_match ( keys %matched_sequences ) {
278 next if ($peptide_to_match eq "Failed match");
279 my @matches = @{$matched_sequences{$peptide_to_match}};
280 @tmp_motifs_array = ();
281 for my $i (0 .. $#matches) {
282
283 # Find the location of the phospo-site in the sequence(s)
284 $tmp_site = 0; my $offset = 0;
285 my $tmp_p_peptide = $peptide_to_match;
286 $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
287
288 # Find all phosphorylated residues in the p_peptide
289 @p_sites = ();
290 while ($tmp_site != -1) {
291 $tmp_site = index($tmp_p_peptide, 'p', $offset);
292 if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
293 $offset = $tmp_site + 1;
294 $tmp_p_peptide =~ s/p//;
295 }
296
297 @tmp_p_residues = ();
298 for my $l (0 .. $#p_sites) {
299 push (@tmp_p_residues, $p_sites[$l]+$sites{$peptide_to_match}[$i]);
300
301 # Match the sequences around the phospho residues to find the motifs
302 my ($desired_residues_L, $desired_residues_R);
303 if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site
304 # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
305 $desired_residues_L = $tmp_p_residues[0];
306 }
307 else {
308 $desired_residues_L = 10;
309 }
310 my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
311 if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site
312 $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
313 # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
314 # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
315 }
316 else {
317 $desired_residues_R = 10;
318 }
319
320 my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
321 $tmp_motif = substr($matched_sequences{$peptide_to_match}[$i], $tmp_p_residues[0] - $desired_residues_L, $total_length);
322
323 # Put the "p" back in front of the appropriate phospho-residue(s).
324 my (@tmp_residues, $tmp_position);
325 for my $m (0 .. $#p_sites) {
326 # print "Line 183: $p_sites[$m]\n";
327 if ($m == 0) {$tmp_position = $desired_residues_L;}
328 else {$tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];}
329 # print "Line 187: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\n";
330 push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
331 if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
332 if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
333 if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
334 }
335
336 $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
337
338 # Comment out on 8.10.13 to remove the numbers from motifs
339 my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
340 my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
341 $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
342 push(@tmp_motifs_array, $tmp_motif);
343 $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
344 $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
345 }
346 $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
347 } ### this bracket could be in the wrong place
348 }
349
350
351 ###############################################################################################################################
352 #
353 # Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
354 #
355 ###############################################################################################################################
356
357
358
359 ###############################################################################################################################
360 #
361 # Read the NetworKIN_predictions file:
362 # 1) make a "kinases_observed" array
363 # 2) annotate the phospho-substrates with the appropriate kinase
364 #
365 ###############################################################################################################################
366
367 my (@kinases_observed, $kinases);
368 my ($p_sequence_kinase, $p_sequence, $kinase);
369
370 open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
371 print "\nReading the NetworKIN data: $networkin_in\n";
372 while (<IN1>) {
373 chomp;
374 my (@x) = split(/\t/);
375 for my $i (0 .. $#x) {
376 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
377 }
378 next if ($x[0] eq "#substrate");
379 if (exists ($kinases -> {$x[2]})) {
380 #do nothing
381 }
382 else {
383 $kinases -> {$x[2]} = $x[2];
384 push (@kinases_observed, $x[2]);
385 }
386 my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha
387 if (exists($p_sequence_kinase -> {$tmp})) {
388 #do nothing
389 }
390 else {
391 $p_sequence_kinase -> {$tmp} = $tmp;
392 }
393 }
394 close IN1;
395
396 ###############################################################################################################################
397 #
398 # Read the Kinase motifs file:
399 # 1) make a "motif_sequence" array
400 #
401 ###############################################################################################################################
402
403 my (@motif_sequence, %motif_type, %motif_count);
404
405 open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
406 print "Reading the Motifs file: $motifs_in\n";
407
408 while (<IN2>) {
409 chomp;
410 my (@x) = split(/\t/);
411 for my $i (0 .. 2) {
412 $x[$i] =~ s/\r//g;
413 $x[$i] =~ s/\n//g;
414 $x[$i] =~ s/\"//g;
415 }
416 if (exists ($motif_type{$x[1]})) {
417 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
418 } else {
419 $motif_type{$x[1]} = $x[2];
420 $motif_count{$x[1]} = 0;
421 push (@motif_sequence, $x[1]);
422 }
423 }
424 close (IN2);
425
426
427 ###############################################################################################################################
428 # 6.28.2011
429 # Read PhosphoSite data:
430 # 1) make a "kinases_PhosphoSite" array
431 # 2) annotate the phospho-substrates with the appropriate kinase
432 #
433 ###############################################################################################################################
434
435
436 my (@kinases_PhosphoSite, $kinases_PhosphoSite);
437 my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
438
439 my $line = 0;
440
441 open (IN3, "$PhosphoSite_in") or die "I couldn't find $PhosphoSite_in\n";
442 print "Reading the PhosphoSite data: $PhosphoSite_in\n";
443
444 while (<IN3>) {
445 chomp;
446 my (@x) = split(/\t/);
447 for my $i (0 .. $#x) {
448 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
449 }
450 if ($line != 0) {
451 if (exists ($kinases_PhosphoSite -> {$x[0]})) {
452 #do nothing
453 }
454 else {
455 $kinases_PhosphoSite -> {$x[0]} = $x[0];
456 push (@kinases_PhosphoSite, $x[0]);
457 }
458 my $offset = 0;
459 # Replace the superfluous lower case s, t and y
460 my @lowercase = ('s','t','y');
461 my @uppercase = ('S','T','Y');
462 for my $k (0 .. 2) {
463 my $site = 0;
464 while ($site != -1) {
465 $site = index($x[11],$lowercase[$k], $offset);
466 if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
467 $offset = $site + 1;
468 }
469 }
470 my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2
471 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
472 #do nothing
473 }
474 else {
475 $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
476 }
477 }
478 $line++;
479 }
480 close IN3;
481
482
483 ###############################################################################################################################
484 # Read PhosphoSite regulatory site data:
485 # 1) make a "regulatory_sites_PhosphoSite" hash
486 #
487 ###############################################################################################################################
488
489
490 my (%regulatory_sites_PhosphoSite);
491 my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes);
492
493 my $line = 0;
494
495 open (IN4, "$PhosphoSite_molecular_function") or die "I couldn't find $PhosphoSite_molecular_function\n";
496 print "Reading the PhosphoSite regulatory site data: $PhosphoSite_molecular_function\n";
497
498 while (<IN4>) {
499 chomp;
500 my (@x) = split(/\t/);
501 for my $i (0 .. $#x) {
502 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
503 }
504 if ($line != 0) {
505 if (!exists($regulatory_sites_PhosphoSite{$x[9]})) {
506 $regulatory_sites_PhosphoSite{$x[9]} = $x[9];
507 $domain{$x[9]} = $x[10];
508 $ON_FUNCTION{$x[9]} = $x[11];
509 $ON_PROCESS{$x[9]} = $x[12];
510 $ON_PROT_INTERACT{$x[9]} = $x[13];
511 $ON_OTHER_INTERACT{$x[9]} = $x[14];
512 $notes{$x[9]} = $x[19];
513 }
514 else {
515 # $domain
516 if ($domain{$x[9]} eq "") {
517 $domain{$x[9]} = $domain{$x[10]};
518 } elsif ($x[10] eq "") {
519 # do nothing
520 }
521 else {
522 $domain{$x[9]} = $domain{$x[9]}." / ".$x[10];
523 }
524
525 # $ON_FUNCTION
526 if ($ON_FUNCTION{$x[9]} eq "") {
527 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]};
528 } elsif ($x[10] eq "") {
529 # do nothing
530 }
531 else {
532 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10];
533 }
534
535 # $ON_PROCESS
536 if ($ON_PROCESS{$x[9]} eq "") {
537 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]};
538 } elsif ($x[10] eq "") {
539 # do nothing
540 }
541 else {
542 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10];
543 }
544
545 # $ON_PROT_INTERACT
546 if ($ON_PROT_INTERACT{$x[9]} eq "") {
547 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
548 } elsif ($x[10] eq "") {
549 # do nothing
550 }
551 else {
552 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
553 }
554
555 # $ON_OTHER_INTERACT
556 if ($ON_OTHER_INTERACT{$x[9]} eq "") {
557 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
558 } elsif ($x[10] eq "") {
559 # do nothing
560 }
561 else {
562 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
563 }
564
565 # $notes
566 if ($notes{$x[9]} eq "") {
567 $notes{$x[9]} = $notes{$x[10]};
568 } elsif ($x[10] eq "") {
569 # do nothing
570 }
571 else {
572 $notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
573 }
574
575 }
576 }
577 $line++;
578 }
579 close IN4;
580
581 ###############################################################################################################################
582 #
583 # Read the data file:
584 # 1) find sequences that match the NetworKIN predictions
585 # 2) find motifs that match the observed sequences
586 #
587 ###############################################################################################################################
588
589 my ($formatted_sequence, %unique_motifs);
590 my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);
591 my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2);
592
593 foreach my $peptide (keys %data) {
594 # find the unique phospho-motifs for this $peptide
595 my @all_motifs = ();
596 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
597 my $tmp_motif = $p_motifs{$peptide}[$i];
598 push(@all_motifs, $tmp_motif);
599 }
600 for my $j (0 .. $#all_motifs) {
601 $all_motifs[$j] =~ s/\d+-\[\s//; $all_motifs[$j] =~ s/\s\]\-\d+//;
602 }
603
604 my %seen = ();
605 foreach my $a (@all_motifs) {
606 if (exists($seen{$a})) { next; } else {
607 push(@{$unique_motifs{$peptide}}, $a);
608 $seen{$a} = 1;
609 }
610 }
611
612 # count the number of phospo-sites in the motif
613 my $number_pY = 0;
614 my $number_pSTY = 0;
615 if ($phospho_type eq 'y') {while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {$number_pY++;}}
616 if ($phospho_type eq 'sty') {while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {$number_pSTY++;}}
617
618 # search each of the unique motifs for matches
619 for my $i (0 .. $#{$unique_motifs{$peptide}}) {
620 my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
621 if (($number_pY == 1) || ($number_pSTY == 1)) {
622 my $seq_plus5aa = 0;
623 my $seq_plus7aa = 0;
624 $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
625 if ($phospho_type eq 'y') {
626 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
627 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
628 }
629 elsif ($phospho_type eq "sty") {
630 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
631 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
632 }
633 for my $i (0 .. $#kinases_observed) {
634 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha
635 if (exists($p_sequence_kinase -> {$tmp})) {
636 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
637 }
638 }
639 for my $i (0 .. $#motif_sequence) {
640 if ($peptide =~ /$motif_sequence[$i]/) {
641 $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
642 }
643 }
644 for my $i (0 .. $#kinases_PhosphoSite) {
645 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2
646 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
647 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
648 }
649 }
650 if (exists($regulatory_sites_PhosphoSite{$seq_plus7aa})) {
651 $domain_2{$peptide} = $domain{$seq_plus7aa};
652 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
653 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
654 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
655 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
656 $notes_2{$peptide} = $notes{$seq_plus7aa};
657 }
658 }
659 elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
660 $formatted_sequence = $tmp_motif;
661 #Create the sequences with only one phosphorylation site
662 #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
663
664 my (@sites, $offset, $next_p_site);
665 $sites[0] = index($tmp_motif, "p");
666 $offset = $sites[0] + 1;
667 while ($next_p_site != -1) {
668 $next_p_site = index($tmp_motif, "p", $offset);
669 if ($next_p_site != -1) {
670 push (@sites, $next_p_site);
671 }
672 $offset = $next_p_site+1;
673 }
674
675 my @pSTY_sequences;
676 for my $n (0 .. $#sites) {
677 $pSTY_sequences[$n] = $tmp_motif;
678 for (my $m = $#sites; $m >= 0; $m--) {
679 if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
680 }
681 }
682
683 my @formatted_sequences;
684 for my $k (0 .. $#sites) {
685 $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
686 }
687
688 for my $k (0 .. $#formatted_sequences) {
689 if ($phospho_type eq 'y') {
690 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence[$k]))[1];
691 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence[$k]))[1];
692 }
693 elsif ($phospho_type eq "sty") {
694 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence[$k]))[1];
695 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence[$k]))[1];
696 }
697 for my $i (0 .. $#kinases_observed) {
698 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha
699 if (exists($p_sequence_kinase -> {$tmp})) {
700 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
701 }
702 }
703 for my $i (0 .. $#motif_sequence) {
704 if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
705 $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
706 }
707 }
708 for my $i (0 .. $#kinases_PhosphoSite) {
709 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2
710 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
711 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
712 }
713 }
714 if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
715 # $domain
716 if ($domain_2{$peptide} eq "") {
717 $domain_2{$peptide} = $domain{$seq_plus7aa};
718 }
719 elsif ($domain{$seq_plus7aa} eq "") {
720 # do nothing
721 }
722 else {
723 $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa};
724 }
725
726 # $ON_FUNCTION_2
727 if ($ON_FUNCTION_2{$peptide} eq "") {
728 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
729 }
730 elsif ($ON_FUNCTION{$seq_plus7aa} eq "") {
731 # do nothing
732 }
733 else {
734 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa};
735 }
736
737 # $ON_PROCESS_2
738 if ($ON_PROCESS_2{$peptide} eq "") {
739 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
740 }
741 elsif ($ON_PROCESS{$seq_plus7aa} eq "") {
742 # do nothing
743 }
744 else {
745 $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa};
746 }
747
748 # $ON_PROT_INTERACT_2
749 if ($ON_PROT_INTERACT_2{$peptide} eq "") {
750 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
751 }
752 elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") {
753 # do nothing
754 }
755 else {
756 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa};
757 }
758
759 # $ON_OTHER_INTERACT_2
760 if ($ON_OTHER_INTERACT_2{$peptide} eq "") {
761 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
762 }
763 elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") {
764 # do nothing
765 }
766 else {
767 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa};
768 }
769
770 # $notes_2
771 if ($notes_2{$peptide} eq "") {
772 $notes_2{$peptide} = $notes{$seq_plus7aa};
773 }
774 elsif ($notes{$seq_plus7aa} eq "") {
775 # do nothing
776 }
777 else {
778 $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa};
779 }
780 }
781 }
782 }
783 }
784 }
785
786
787 ###############################################################################################################################
788 #
789 # Print to the output file
790 #
791 ###############################################################################################################################
792 open (OUT, ">$file_out") || die "could not open the fileout: $file_out";
793
794 # print the header info
795 print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
796
797 # print the PhosphoSite regulatory data
798 print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
799
800 # print the sample names
801 for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
802
803 # print the kinases and groups
804 for my $i (0 .. $#kinases_observed) {
805 my $temp = $kinases_observed[$i]."_NetworKIN";
806 print OUT "$temp\t";
807 }
808 for my $i (0 .. $#motif_sequence) {
809 print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t";
810 }
811 for my $i (0 .. $#kinases_PhosphoSite) {
812 my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite";
813 if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
814 if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
815 }
816
817
818 foreach my $peptide (keys %data) {
819 # Print the peptide itself
820 print OUT "$peptide\t";
821
822 # skip over failed matches
823 if ($matched_sequences{$peptide} eq "Failed match") {
824 print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
825 } else {
826 # Print just the protein description
827 my @description = ();
828 for $i (0 .. $#{$names{$peptide}}) {
829 my $long_name = $names{$peptide}[$i];
830 my @naming_parts = split(/\sOS/, $long_name);
831 my @front_half = split(/\s/, $naming_parts[0]);
832 push(@description, join(" ", @front_half[1..($#front_half)]));
833 }
834 print OUT join(" /// ", @description), "\t";
835
836 # Print just the gene name
837 my @gene = ();
838 my %seen = ();
839 for $i (0 .. $#{$names{$peptide}}) {
840 my $tmp_gene = $names{$peptide}[$i];
841 $tmp_gene =~ s/^.*GN=//;
842 $tmp_gene =~ s/\s.*//;
843 if (!exists($seen{$tmp_gene})) {
844 push(@gene, $tmp_gene);
845 $seen{$tmp_gene} = $tmp_gene;
846 }
847 }
848 print OUT join(" /// ", @gene), "\t";
849
850 # print the entire names
851 print OUT join(" /// ", @{$names{$peptide}}), "\t";
852
853 # Print the phospho-residues
854 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
855 if ($i < $#{ $matched_sequences{$peptide} }) {
856 @tmp_p_residues = @{$p_residues{$peptide}{$i}};
857 for my $j (0 .. $#tmp_p_residues) {
858 if ($j < $#tmp_p_residues) {
859 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
860 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
861 }
862 elsif ($j == $#tmp_p_residues) {
863 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
864 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
865 }
866 }
867 }
868 elsif ($i == $#{ $matched_sequences{$peptide} }) {
869 @tmp_p_residues = @{$p_residues{$peptide}{$i}};
870 for my $j (0 .. $#tmp_p_residues) {
871 if ($j < $#tmp_p_residues) {
872 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
873 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
874 }
875 elsif ($j == $#tmp_p_residues) {
876 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data
877 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t";
878 }
879 }
880 }
881 }
882
883 # Print the UNIQUE phospho-motifs
884 print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
885
886 # Print the accessions
887 print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
888
889 # print ALL motifs with residue numbers
890 print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
891 }
892
893 # Print the PhosphoSite regulatory data
894
895 if (exists($domain_2{$peptide})) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; }
896 if (exists($ON_FUNCTION_2{$peptide})) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; }
897 if (exists($ON_PROCESS_2{$peptide})) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; }
898 if (exists($ON_PROT_INTERACT_2{$peptide})) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
899 if (exists($ON_OTHER_INTERACT_2{$peptide})) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
900 if (exists($notes_2{$peptide})) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; }
901
902 # Print the data
903 @tmp_data = (); foreach (@{$data{$peptide}}) { push(@tmp_data, $_); }
904 print OUT join("\t", @tmp_data), "\t";
905
906 # print the kinase-substrate data
907 for my $i (0 .. $#kinases_observed) {
908 if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
909 print OUT "X\t";
910 }
911 else { print OUT "\t";}
912 }
913 for my $i (0 .. $#motif_sequence) {
914 if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
915 print OUT "X\t";
916 # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug
917 }
918 else { print OUT "\t";}
919 }
920 for my $i (0 .. $#kinases_PhosphoSite) {
921 if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) {
922 print OUT "X\t";
923 }
924 elsif (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i == $#kinases_PhosphoSite)) {
925 print OUT "X\n";
926 }
927 elsif (!exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) {
928 print OUT "\t";
929 }
930 else {
931 print OUT "\n";
932 }
933 }
934 }
935
936 close OUT;
937
938
939 my @timeData = localtime(time);
940 print "\nFinished $timeData[2]:$timeData[1]:$timeData[0]\n\n";
941
942 ###############################################################################################################################
943 sub replace_pSpTpY {
944 my ($formatted_sequence, $phospho_type) = @_;
945 if ($phospho_type eq 'y') {
946 $formatted_sequence =~ s/pS/S/g;
947 $formatted_sequence =~ s/pT/T/g;
948 $formatted_sequence =~ s/pY/y/g;
949 }
950 elsif ($phospho_type eq "sty") {
951 $formatted_sequence =~ s/pS/s/g;
952 $formatted_sequence =~ s/pT/t/g;
953 $formatted_sequence =~ s/pY/y/g;
954 }
955 $formatted_sequence;
956 }
957