Mercurial > repos > galaxyp > phosphopeptide_kinase_mapping
comparison PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:56658e35798d draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61"
author | galaxyp |
---|---|
date | Thu, 04 Nov 2021 19:37:36 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:56658e35798d |
---|---|
1 #!/usr/local/bin/perl | |
2 | |
3 use Getopt::Std; | |
4 | |
5 ############################################################################################################################### | |
6 # perl Kinase_enrichment_analysis_complete_v0.pl | |
7 # | |
8 # Nick Graham, USC | |
9 # 2016-02-27 | |
10 # | |
11 # Built from scripts written by NG at UCLA in Tom Graeber's lab: | |
12 # CombinePhosphoSites.pl | |
13 # Retrieve_p_motifs.pl | |
14 # NetworKIN_Motif_Finder_v7.pl | |
15 # | |
16 # Given a list of phospho-peptides, find protein information and upstream kinases. | |
17 # Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl | |
18 # | |
19 ############################################################################################################################### | |
20 | |
21 my ($file_in, $average_or_sum, $file_out, $phospho_type); | |
22 my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function); | |
23 | |
24 ########## | |
25 ## opts ## | |
26 ########## | |
27 ## input files | |
28 # i : path to input outputfile_STEP2.txt | |
29 # f : path to fasta | |
30 # n : path to NetworKIN_201612_cutoffscore2.0.txt | |
31 # m : path to pSTY_Motifs.txt | |
32 # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt | |
33 # r : path to 2017-03_PSP_Regulatory_sites.txt | |
34 ## options | |
35 # P : phospho_type | |
36 # F : function | |
37 ## output files | |
38 # o : path to output file | |
39 | |
40 sub usage() | |
41 { | |
42 print STDERR <<"EOH"; | |
43 This program given a list of phospho-peptides, finds protein information and upstream kinases. | |
44 usage: $0 [-hvd] [-f file] | |
45 -h : this (help) message | |
46 -i : path to input outputfile_STEP2.txt | |
47 -f : path to fasta | |
48 -n : path to NetworKIN_201612_cutoffscore2.0.txt | |
49 -m : path to pSTY_Motifs.txt | |
50 -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt | |
51 -r : path to 2017-03_PSP_Regulatory_sites.txt | |
52 -P : phospho_type | |
53 -F : function | |
54 -o : path to output file | |
55 example: $0 | |
56 EOH | |
57 exit; | |
58 } | |
59 | |
60 my %opts; | |
61 getopts('i:f:n:m:p:r:o:P:F:h', \%opts) ; | |
62 | |
63 if (exists($opts{'h'})) { | |
64 usage(); | |
65 } | |
66 if (!exists($opts{'i'}) || !-e $opts{'i'}) { | |
67 die('Input File not found'); | |
68 } else { | |
69 $file_in = $opts{'i'}; | |
70 } | |
71 if (!exists($opts{'f'}) || !-e $opts{'f'}) { | |
72 die('Input Fasta File not found'); | |
73 } else { | |
74 $fasta_in = $opts{'f'}; | |
75 } | |
76 if (!exists($opts{'n'}) || !-e $opts{'n'}) { | |
77 die('Input NetworKIN File not found'); | |
78 } else { | |
79 $networkin_in = $opts{'n'}; | |
80 } | |
81 if (!exists($opts{'m'}) || !-e $opts{'m'}) { | |
82 die('Input pSTY_Motifs File not found'); | |
83 } else { | |
84 $motifs_in = $opts{'m'}; | |
85 } | |
86 if (!exists($opts{'p'}) || !-e $opts{'p'}) { | |
87 die('Input PSP_Kinase_Substrate_Dataset File not found'); | |
88 } else { | |
89 $PhosphoSite_in = $opts{'p'}; | |
90 } | |
91 if (!exists($opts{'r'}) || !-e $opts{'r'}) { | |
92 die('Input PSP_Regulatory_sites File not found'); | |
93 } else { | |
94 $PhosphoSite_molecular_function = $opts{'r'}; | |
95 } | |
96 if (exists($opts{'P'})) { | |
97 $phospho_type = $opts{'P'}; | |
98 } | |
99 else { | |
100 $phospho_type = "sty"; | |
101 } | |
102 if (exists($opts{'F'})) { | |
103 $average_or_sum = $opts{'F'}; | |
104 } | |
105 else { | |
106 $average_or_sum = "sum"; | |
107 } | |
108 if (exists($opts{'o'})) { | |
109 $file_out = $opts{'o'}; | |
110 } | |
111 else { | |
112 $file_out = "output.tsv"; | |
113 } | |
114 | |
115 | |
116 ############################################################################################################################### | |
117 # Print the relevant file names to the screen | |
118 ############################################################################################################################### | |
119 # print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; | |
120 print "\nData file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\n\n"; | |
121 print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PhosphoSite_in\nPhosphosite functional data: $PhosphoSite_molecular_function\nFASTA file: $fasta_in\n\n"; | |
122 | |
123 | |
124 print "\nPhospho-residues(s) = $phospho_type\n"; | |
125 if ($phospho_type ne 'y') { | |
126 if ($phospho_type ne 'sty') { | |
127 die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; | |
128 } | |
129 } | |
130 | |
131 ############################################################################################################################### | |
132 # read the input data file | |
133 # average or sum identical phospho-sites, depending on the value of $average_or_sum | |
134 ############################################################################################################################### | |
135 | |
136 open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; | |
137 | |
138 die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; | |
139 | |
140 | |
141 my (@samples, %data, @tmp_data, %n); | |
142 my $line = 0; | |
143 | |
144 while (<IN>) { | |
145 chomp; | |
146 my @x = split(/\t/); | |
147 for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} | |
148 | |
149 # Read in the samples | |
150 if ($line == 0) { | |
151 for my $n (1 .. $#x) { | |
152 push (@samples, $x[$n]); | |
153 } | |
154 $line++; | |
155 } else { | |
156 # check whether we have already seen a phospho-peptide | |
157 if (exists($data{$x[0]})) { | |
158 if ($average_or_sum eq "sum") { # add the data | |
159 # unload the data | |
160 @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } | |
161 # add the new data and repack | |
162 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } | |
163 $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } | |
164 | |
165 } elsif ($average_or_sum eq "average") { # average the data | |
166 # unload the data | |
167 @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } | |
168 # average with the new data and repack | |
169 for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } | |
170 $n{$x[0]}++; | |
171 $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } | |
172 } | |
173 } | |
174 # if the phospho-sequence has not been seen, save the data | |
175 else { | |
176 for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); } | |
177 $n{$x[0]} = 1; | |
178 } | |
179 } | |
180 } | |
181 close(IN); | |
182 | |
183 | |
184 ############################################################################################################################### | |
185 # Search the FASTA database for phospho-sites and motifs | |
186 # | |
187 # based on Retrieve_p_peptide_motifs_v2.pl | |
188 ############################################################################################################################### | |
189 | |
190 | |
191 ############################################################################################################################### | |
192 # | |
193 # Read in the Data file: | |
194 # 1) make @p_peptides array as in the original file | |
195 # 2) make @non_p_peptides array w/o residue modifications (p, #, other) | |
196 # | |
197 ############################################################################################################################### | |
198 | |
199 my (@p_peptides, @non_p_peptides); | |
200 foreach my $peptide (keys %data) { | |
201 $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g; | |
202 push (@p_peptides, $peptide); | |
203 $peptide =~ s/p//g; | |
204 push(@non_p_peptides, $peptide); | |
205 } | |
206 | |
207 ############################################################################################################################### | |
208 # | |
209 # Read in the FASTA sequence file, save them to the @sequences array | |
210 # | |
211 ############################################################################################################################### | |
212 | |
213 open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n"; | |
214 | |
215 my (@accessions, @names, @sequences); | |
216 | |
217 print "Reading FASTA file $fasta_in\n"; | |
218 while (<IN1>) { | |
219 chomp; | |
220 my (@x) = split(/\|/); | |
221 for my $i (0 .. $#x) { | |
222 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } | |
223 if ($x[0] =~ /^>/) { | |
224 $x[0] =~ s/\>//g; | |
225 push (@names, $x[2]); | |
226 push (@accessions, $x[1]); | |
227 } elsif ($x[0] =~ /^\w/) { | |
228 $sequences[$#accessions] = $sequences[$#accessions].$x[0]; | |
229 } | |
230 } | |
231 close IN1; | |
232 print "Done Reading FASTA file $fasta_in\n\n"; | |
233 | |
234 | |
235 ############################################################################################################################### | |
236 # | |
237 # Match the non_p_peptides to the @sequences array: | |
238 # 1) Format the motifs +/- 10 residues around the phospho-site | |
239 # 2) Print the original data plus the phospho-motif to the output file | |
240 # | |
241 ############################################################################################################################### | |
242 | |
243 print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n"; | |
244 | |
245 my (%matched_sequences, %accessions, %names, %sites, @tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); | |
246 | |
247 for my $j (0 .. $#p_peptides) { | |
248 @tmp_matches = (); @tmp_accessions = (); @tmp_names = (); @tmp_sites = (); | |
249 | |
250 #Find the matching protein sequence(s) for the peptide | |
251 my $site = -1; my $match = 0; | |
252 for my $k (0 .. $#sequences) { | |
253 $site = index($sequences[$k], $non_p_peptides[$j]); | |
254 if ($site != -1) { | |
255 push(@tmp_matches, $sequences[$k]); | |
256 push(@tmp_accessions, $accessions[$k]); | |
257 push(@tmp_names, $names[$k]); | |
258 push(@tmp_sites, $site); | |
259 $site = -1; $match++; | |
260 } | |
261 } | |
262 | |
263 if ($match == 0) { # Check to see if no match was found. Skip to next if no match found. | |
264 print "Warning: Failed match for $p_peptides[$j]\n"; | |
265 $matched_sequences{$p_peptides[$j]} = "Failed match"; | |
266 next; | |
267 } else { | |
268 $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ]; | |
269 $accessions{$p_peptides[$j]} = [ @tmp_accessions ]; | |
270 $names{$p_peptides[$j]} = [ @tmp_names ]; | |
271 $sites{$p_peptides[$j]} = [ @tmp_sites ]; | |
272 } | |
273 } | |
274 | |
275 my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); | |
276 | |
277 for my $peptide_to_match ( keys %matched_sequences ) { | |
278 next if ($peptide_to_match eq "Failed match"); | |
279 my @matches = @{$matched_sequences{$peptide_to_match}}; | |
280 @tmp_motifs_array = (); | |
281 for my $i (0 .. $#matches) { | |
282 | |
283 # Find the location of the phospo-site in the sequence(s) | |
284 $tmp_site = 0; my $offset = 0; | |
285 my $tmp_p_peptide = $peptide_to_match; | |
286 $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; | |
287 | |
288 # Find all phosphorylated residues in the p_peptide | |
289 @p_sites = (); | |
290 while ($tmp_site != -1) { | |
291 $tmp_site = index($tmp_p_peptide, 'p', $offset); | |
292 if ($tmp_site != -1) {push (@p_sites, $tmp_site);} | |
293 $offset = $tmp_site + 1; | |
294 $tmp_p_peptide =~ s/p//; | |
295 } | |
296 | |
297 @tmp_p_residues = (); | |
298 for my $l (0 .. $#p_sites) { | |
299 push (@tmp_p_residues, $p_sites[$l]+$sites{$peptide_to_match}[$i]); | |
300 | |
301 # Match the sequences around the phospho residues to find the motifs | |
302 my ($desired_residues_L, $desired_residues_R); | |
303 if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site | |
304 # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3 | |
305 $desired_residues_L = $tmp_p_residues[0]; | |
306 } | |
307 else { | |
308 $desired_residues_L = 10; | |
309 } | |
310 my $seq_length = length($matched_sequences{$peptide_to_match}[$i]); | |
311 if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site | |
312 $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1); | |
313 # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3 | |
314 # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n"; | |
315 } | |
316 else { | |
317 $desired_residues_R = 10; | |
318 } | |
319 | |
320 my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1; | |
321 $tmp_motif = substr($matched_sequences{$peptide_to_match}[$i], $tmp_p_residues[0] - $desired_residues_L, $total_length); | |
322 | |
323 # Put the "p" back in front of the appropriate phospho-residue(s). | |
324 my (@tmp_residues, $tmp_position); | |
325 for my $m (0 .. $#p_sites) { | |
326 # print "Line 183: $p_sites[$m]\n"; | |
327 if ($m == 0) {$tmp_position = $desired_residues_L;} | |
328 else {$tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];} | |
329 # print "Line 187: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\n"; | |
330 push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); | |
331 if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} | |
332 if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");} | |
333 if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");} | |
334 } | |
335 | |
336 $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g; | |
337 | |
338 # Comment out on 8.10.13 to remove the numbers from motifs | |
339 my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1; | |
340 my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1; | |
341 $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue; | |
342 push(@tmp_motifs_array, $tmp_motif); | |
343 $residues{$peptide_to_match}{$i} = [ @tmp_residues ]; | |
344 $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ]; | |
345 } | |
346 $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ]; | |
347 } ### this bracket could be in the wrong place | |
348 } | |
349 | |
350 | |
351 ############################################################################################################################### | |
352 # | |
353 # Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs | |
354 # | |
355 ############################################################################################################################### | |
356 | |
357 | |
358 | |
359 ############################################################################################################################### | |
360 # | |
361 # Read the NetworKIN_predictions file: | |
362 # 1) make a "kinases_observed" array | |
363 # 2) annotate the phospho-substrates with the appropriate kinase | |
364 # | |
365 ############################################################################################################################### | |
366 | |
367 my (@kinases_observed, $kinases); | |
368 my ($p_sequence_kinase, $p_sequence, $kinase); | |
369 | |
370 open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n"; | |
371 print "\nReading the NetworKIN data: $networkin_in\n"; | |
372 while (<IN1>) { | |
373 chomp; | |
374 my (@x) = split(/\t/); | |
375 for my $i (0 .. $#x) { | |
376 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
377 } | |
378 next if ($x[0] eq "#substrate"); | |
379 if (exists ($kinases -> {$x[2]})) { | |
380 #do nothing | |
381 } | |
382 else { | |
383 $kinases -> {$x[2]} = $x[2]; | |
384 push (@kinases_observed, $x[2]); | |
385 } | |
386 my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha | |
387 if (exists($p_sequence_kinase -> {$tmp})) { | |
388 #do nothing | |
389 } | |
390 else { | |
391 $p_sequence_kinase -> {$tmp} = $tmp; | |
392 } | |
393 } | |
394 close IN1; | |
395 | |
396 ############################################################################################################################### | |
397 # | |
398 # Read the Kinase motifs file: | |
399 # 1) make a "motif_sequence" array | |
400 # | |
401 ############################################################################################################################### | |
402 | |
403 my (@motif_sequence, %motif_type, %motif_count); | |
404 | |
405 open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; | |
406 print "Reading the Motifs file: $motifs_in\n"; | |
407 | |
408 while (<IN2>) { | |
409 chomp; | |
410 my (@x) = split(/\t/); | |
411 for my $i (0 .. 2) { | |
412 $x[$i] =~ s/\r//g; | |
413 $x[$i] =~ s/\n//g; | |
414 $x[$i] =~ s/\"//g; | |
415 } | |
416 if (exists ($motif_type{$x[1]})) { | |
417 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; | |
418 } else { | |
419 $motif_type{$x[1]} = $x[2]; | |
420 $motif_count{$x[1]} = 0; | |
421 push (@motif_sequence, $x[1]); | |
422 } | |
423 } | |
424 close (IN2); | |
425 | |
426 | |
427 ############################################################################################################################### | |
428 # 6.28.2011 | |
429 # Read PhosphoSite data: | |
430 # 1) make a "kinases_PhosphoSite" array | |
431 # 2) annotate the phospho-substrates with the appropriate kinase | |
432 # | |
433 ############################################################################################################################### | |
434 | |
435 | |
436 my (@kinases_PhosphoSite, $kinases_PhosphoSite); | |
437 my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); | |
438 | |
439 my $line = 0; | |
440 | |
441 open (IN3, "$PhosphoSite_in") or die "I couldn't find $PhosphoSite_in\n"; | |
442 print "Reading the PhosphoSite data: $PhosphoSite_in\n"; | |
443 | |
444 while (<IN3>) { | |
445 chomp; | |
446 my (@x) = split(/\t/); | |
447 for my $i (0 .. $#x) { | |
448 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
449 } | |
450 if ($line != 0) { | |
451 if (exists ($kinases_PhosphoSite -> {$x[0]})) { | |
452 #do nothing | |
453 } | |
454 else { | |
455 $kinases_PhosphoSite -> {$x[0]} = $x[0]; | |
456 push (@kinases_PhosphoSite, $x[0]); | |
457 } | |
458 my $offset = 0; | |
459 # Replace the superfluous lower case s, t and y | |
460 my @lowercase = ('s','t','y'); | |
461 my @uppercase = ('S','T','Y'); | |
462 for my $k (0 .. 2) { | |
463 my $site = 0; | |
464 while ($site != -1) { | |
465 $site = index($x[11],$lowercase[$k], $offset); | |
466 if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);} | |
467 $offset = $site + 1; | |
468 } | |
469 } | |
470 my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2 | |
471 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
472 #do nothing | |
473 } | |
474 else { | |
475 $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp; | |
476 } | |
477 } | |
478 $line++; | |
479 } | |
480 close IN3; | |
481 | |
482 | |
483 ############################################################################################################################### | |
484 # Read PhosphoSite regulatory site data: | |
485 # 1) make a "regulatory_sites_PhosphoSite" hash | |
486 # | |
487 ############################################################################################################################### | |
488 | |
489 | |
490 my (%regulatory_sites_PhosphoSite); | |
491 my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes); | |
492 | |
493 my $line = 0; | |
494 | |
495 open (IN4, "$PhosphoSite_molecular_function") or die "I couldn't find $PhosphoSite_molecular_function\n"; | |
496 print "Reading the PhosphoSite regulatory site data: $PhosphoSite_molecular_function\n"; | |
497 | |
498 while (<IN4>) { | |
499 chomp; | |
500 my (@x) = split(/\t/); | |
501 for my $i (0 .. $#x) { | |
502 $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; | |
503 } | |
504 if ($line != 0) { | |
505 if (!exists($regulatory_sites_PhosphoSite{$x[9]})) { | |
506 $regulatory_sites_PhosphoSite{$x[9]} = $x[9]; | |
507 $domain{$x[9]} = $x[10]; | |
508 $ON_FUNCTION{$x[9]} = $x[11]; | |
509 $ON_PROCESS{$x[9]} = $x[12]; | |
510 $ON_PROT_INTERACT{$x[9]} = $x[13]; | |
511 $ON_OTHER_INTERACT{$x[9]} = $x[14]; | |
512 $notes{$x[9]} = $x[19]; | |
513 } | |
514 else { | |
515 # $domain | |
516 if ($domain{$x[9]} eq "") { | |
517 $domain{$x[9]} = $domain{$x[10]}; | |
518 } elsif ($x[10] eq "") { | |
519 # do nothing | |
520 } | |
521 else { | |
522 $domain{$x[9]} = $domain{$x[9]}." / ".$x[10]; | |
523 } | |
524 | |
525 # $ON_FUNCTION | |
526 if ($ON_FUNCTION{$x[9]} eq "") { | |
527 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]}; | |
528 } elsif ($x[10] eq "") { | |
529 # do nothing | |
530 } | |
531 else { | |
532 $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10]; | |
533 } | |
534 | |
535 # $ON_PROCESS | |
536 if ($ON_PROCESS{$x[9]} eq "") { | |
537 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]}; | |
538 } elsif ($x[10] eq "") { | |
539 # do nothing | |
540 } | |
541 else { | |
542 $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10]; | |
543 } | |
544 | |
545 # $ON_PROT_INTERACT | |
546 if ($ON_PROT_INTERACT{$x[9]} eq "") { | |
547 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]}; | |
548 } elsif ($x[10] eq "") { | |
549 # do nothing | |
550 } | |
551 else { | |
552 $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10]; | |
553 } | |
554 | |
555 # $ON_OTHER_INTERACT | |
556 if ($ON_OTHER_INTERACT{$x[9]} eq "") { | |
557 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]}; | |
558 } elsif ($x[10] eq "") { | |
559 # do nothing | |
560 } | |
561 else { | |
562 $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10]; | |
563 } | |
564 | |
565 # $notes | |
566 if ($notes{$x[9]} eq "") { | |
567 $notes{$x[9]} = $notes{$x[10]}; | |
568 } elsif ($x[10] eq "") { | |
569 # do nothing | |
570 } | |
571 else { | |
572 $notes{$x[9]} = $notes{$x[9]}." / ".$x[10]; | |
573 } | |
574 | |
575 } | |
576 } | |
577 $line++; | |
578 } | |
579 close IN4; | |
580 | |
581 ############################################################################################################################### | |
582 # | |
583 # Read the data file: | |
584 # 1) find sequences that match the NetworKIN predictions | |
585 # 2) find motifs that match the observed sequences | |
586 # | |
587 ############################################################################################################################### | |
588 | |
589 my ($formatted_sequence, %unique_motifs); | |
590 my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); | |
591 my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2); | |
592 | |
593 foreach my $peptide (keys %data) { | |
594 # find the unique phospho-motifs for this $peptide | |
595 my @all_motifs = (); | |
596 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { | |
597 my $tmp_motif = $p_motifs{$peptide}[$i]; | |
598 push(@all_motifs, $tmp_motif); | |
599 } | |
600 for my $j (0 .. $#all_motifs) { | |
601 $all_motifs[$j] =~ s/\d+-\[\s//; $all_motifs[$j] =~ s/\s\]\-\d+//; | |
602 } | |
603 | |
604 my %seen = (); | |
605 foreach my $a (@all_motifs) { | |
606 if (exists($seen{$a})) { next; } else { | |
607 push(@{$unique_motifs{$peptide}}, $a); | |
608 $seen{$a} = 1; | |
609 } | |
610 } | |
611 | |
612 # count the number of phospo-sites in the motif | |
613 my $number_pY = 0; | |
614 my $number_pSTY = 0; | |
615 if ($phospho_type eq 'y') {while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {$number_pY++;}} | |
616 if ($phospho_type eq 'sty') {while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {$number_pSTY++;}} | |
617 | |
618 # search each of the unique motifs for matches | |
619 for my $i (0 .. $#{$unique_motifs{$peptide}}) { | |
620 my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; | |
621 if (($number_pY == 1) || ($number_pSTY == 1)) { | |
622 my $seq_plus5aa = 0; | |
623 my $seq_plus7aa = 0; | |
624 $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); | |
625 if ($phospho_type eq 'y') { | |
626 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; | |
627 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; | |
628 } | |
629 elsif ($phospho_type eq "sty") { | |
630 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; | |
631 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; | |
632 } | |
633 for my $i (0 .. $#kinases_observed) { | |
634 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha | |
635 if (exists($p_sequence_kinase -> {$tmp})) { | |
636 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; | |
637 } | |
638 } | |
639 for my $i (0 .. $#motif_sequence) { | |
640 if ($peptide =~ /$motif_sequence[$i]/) { | |
641 $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; | |
642 } | |
643 } | |
644 for my $i (0 .. $#kinases_PhosphoSite) { | |
645 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 | |
646 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
647 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; | |
648 } | |
649 } | |
650 if (exists($regulatory_sites_PhosphoSite{$seq_plus7aa})) { | |
651 $domain_2{$peptide} = $domain{$seq_plus7aa}; | |
652 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; | |
653 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; | |
654 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; | |
655 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; | |
656 $notes_2{$peptide} = $notes{$seq_plus7aa}; | |
657 } | |
658 } | |
659 elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 | |
660 $formatted_sequence = $tmp_motif; | |
661 #Create the sequences with only one phosphorylation site | |
662 #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 | |
663 | |
664 my (@sites, $offset, $next_p_site); | |
665 $sites[0] = index($tmp_motif, "p"); | |
666 $offset = $sites[0] + 1; | |
667 while ($next_p_site != -1) { | |
668 $next_p_site = index($tmp_motif, "p", $offset); | |
669 if ($next_p_site != -1) { | |
670 push (@sites, $next_p_site); | |
671 } | |
672 $offset = $next_p_site+1; | |
673 } | |
674 | |
675 my @pSTY_sequences; | |
676 for my $n (0 .. $#sites) { | |
677 $pSTY_sequences[$n] = $tmp_motif; | |
678 for (my $m = $#sites; $m >= 0; $m--) { | |
679 if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} | |
680 } | |
681 } | |
682 | |
683 my @formatted_sequences; | |
684 for my $k (0 .. $#sites) { | |
685 $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); | |
686 } | |
687 | |
688 for my $k (0 .. $#formatted_sequences) { | |
689 if ($phospho_type eq 'y') { | |
690 $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence[$k]))[1]; | |
691 $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence[$k]))[1]; | |
692 } | |
693 elsif ($phospho_type eq "sty") { | |
694 $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence[$k]))[1]; | |
695 $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence[$k]))[1]; | |
696 } | |
697 for my $i (0 .. $#kinases_observed) { | |
698 my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha | |
699 if (exists($p_sequence_kinase -> {$tmp})) { | |
700 $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; | |
701 } | |
702 } | |
703 for my $i (0 .. $#motif_sequence) { | |
704 if ($pSTY_sequence =~ /$motif_sequence[$i]/) { | |
705 $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; | |
706 } | |
707 } | |
708 for my $i (0 .. $#kinases_PhosphoSite) { | |
709 my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 | |
710 if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { | |
711 $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; | |
712 } | |
713 } | |
714 if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { | |
715 # $domain | |
716 if ($domain_2{$peptide} eq "") { | |
717 $domain_2{$peptide} = $domain{$seq_plus7aa}; | |
718 } | |
719 elsif ($domain{$seq_plus7aa} eq "") { | |
720 # do nothing | |
721 } | |
722 else { | |
723 $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; | |
724 } | |
725 | |
726 # $ON_FUNCTION_2 | |
727 if ($ON_FUNCTION_2{$peptide} eq "") { | |
728 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; | |
729 } | |
730 elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { | |
731 # do nothing | |
732 } | |
733 else { | |
734 $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; | |
735 } | |
736 | |
737 # $ON_PROCESS_2 | |
738 if ($ON_PROCESS_2{$peptide} eq "") { | |
739 $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; | |
740 } | |
741 elsif ($ON_PROCESS{$seq_plus7aa} eq "") { | |
742 # do nothing | |
743 } | |
744 else { | |
745 $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; | |
746 } | |
747 | |
748 # $ON_PROT_INTERACT_2 | |
749 if ($ON_PROT_INTERACT_2{$peptide} eq "") { | |
750 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; | |
751 } | |
752 elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { | |
753 # do nothing | |
754 } | |
755 else { | |
756 $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; | |
757 } | |
758 | |
759 # $ON_OTHER_INTERACT_2 | |
760 if ($ON_OTHER_INTERACT_2{$peptide} eq "") { | |
761 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; | |
762 } | |
763 elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { | |
764 # do nothing | |
765 } | |
766 else { | |
767 $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; | |
768 } | |
769 | |
770 # $notes_2 | |
771 if ($notes_2{$peptide} eq "") { | |
772 $notes_2{$peptide} = $notes{$seq_plus7aa}; | |
773 } | |
774 elsif ($notes{$seq_plus7aa} eq "") { | |
775 # do nothing | |
776 } | |
777 else { | |
778 $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; | |
779 } | |
780 } | |
781 } | |
782 } | |
783 } | |
784 } | |
785 | |
786 | |
787 ############################################################################################################################### | |
788 # | |
789 # Print to the output file | |
790 # | |
791 ############################################################################################################################### | |
792 open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; | |
793 | |
794 # print the header info | |
795 print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; | |
796 | |
797 # print the PhosphoSite regulatory data | |
798 print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; | |
799 | |
800 # print the sample names | |
801 for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } | |
802 | |
803 # print the kinases and groups | |
804 for my $i (0 .. $#kinases_observed) { | |
805 my $temp = $kinases_observed[$i]."_NetworKIN"; | |
806 print OUT "$temp\t"; | |
807 } | |
808 for my $i (0 .. $#motif_sequence) { | |
809 print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; | |
810 } | |
811 for my $i (0 .. $#kinases_PhosphoSite) { | |
812 my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; | |
813 if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } | |
814 if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } | |
815 } | |
816 | |
817 | |
818 foreach my $peptide (keys %data) { | |
819 # Print the peptide itself | |
820 print OUT "$peptide\t"; | |
821 | |
822 # skip over failed matches | |
823 if ($matched_sequences{$peptide} eq "Failed match") { | |
824 print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; | |
825 } else { | |
826 # Print just the protein description | |
827 my @description = (); | |
828 for $i (0 .. $#{$names{$peptide}}) { | |
829 my $long_name = $names{$peptide}[$i]; | |
830 my @naming_parts = split(/\sOS/, $long_name); | |
831 my @front_half = split(/\s/, $naming_parts[0]); | |
832 push(@description, join(" ", @front_half[1..($#front_half)])); | |
833 } | |
834 print OUT join(" /// ", @description), "\t"; | |
835 | |
836 # Print just the gene name | |
837 my @gene = (); | |
838 my %seen = (); | |
839 for $i (0 .. $#{$names{$peptide}}) { | |
840 my $tmp_gene = $names{$peptide}[$i]; | |
841 $tmp_gene =~ s/^.*GN=//; | |
842 $tmp_gene =~ s/\s.*//; | |
843 if (!exists($seen{$tmp_gene})) { | |
844 push(@gene, $tmp_gene); | |
845 $seen{$tmp_gene} = $tmp_gene; | |
846 } | |
847 } | |
848 print OUT join(" /// ", @gene), "\t"; | |
849 | |
850 # print the entire names | |
851 print OUT join(" /// ", @{$names{$peptide}}), "\t"; | |
852 | |
853 # Print the phospho-residues | |
854 for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { | |
855 if ($i < $#{ $matched_sequences{$peptide} }) { | |
856 @tmp_p_residues = @{$p_residues{$peptide}{$i}}; | |
857 for my $j (0 .. $#tmp_p_residues) { | |
858 if ($j < $#tmp_p_residues) { | |
859 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
860 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; | |
861 } | |
862 elsif ($j == $#tmp_p_residues) { | |
863 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
864 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; | |
865 } | |
866 } | |
867 } | |
868 elsif ($i == $#{ $matched_sequences{$peptide} }) { | |
869 @tmp_p_residues = @{$p_residues{$peptide}{$i}}; | |
870 for my $j (0 .. $#tmp_p_residues) { | |
871 if ($j < $#tmp_p_residues) { | |
872 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
873 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; | |
874 } | |
875 elsif ($j == $#tmp_p_residues) { | |
876 my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data | |
877 print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; | |
878 } | |
879 } | |
880 } | |
881 } | |
882 | |
883 # Print the UNIQUE phospho-motifs | |
884 print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; | |
885 | |
886 # Print the accessions | |
887 print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; | |
888 | |
889 # print ALL motifs with residue numbers | |
890 print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; | |
891 } | |
892 | |
893 # Print the PhosphoSite regulatory data | |
894 | |
895 if (exists($domain_2{$peptide})) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; } | |
896 if (exists($ON_FUNCTION_2{$peptide})) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; } | |
897 if (exists($ON_PROCESS_2{$peptide})) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; } | |
898 if (exists($ON_PROT_INTERACT_2{$peptide})) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } | |
899 if (exists($ON_OTHER_INTERACT_2{$peptide})) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } | |
900 if (exists($notes_2{$peptide})) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; } | |
901 | |
902 # Print the data | |
903 @tmp_data = (); foreach (@{$data{$peptide}}) { push(@tmp_data, $_); } | |
904 print OUT join("\t", @tmp_data), "\t"; | |
905 | |
906 # print the kinase-substrate data | |
907 for my $i (0 .. $#kinases_observed) { | |
908 if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { | |
909 print OUT "X\t"; | |
910 } | |
911 else { print OUT "\t";} | |
912 } | |
913 for my $i (0 .. $#motif_sequence) { | |
914 if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { | |
915 print OUT "X\t"; | |
916 # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug | |
917 } | |
918 else { print OUT "\t";} | |
919 } | |
920 for my $i (0 .. $#kinases_PhosphoSite) { | |
921 if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { | |
922 print OUT "X\t"; | |
923 } | |
924 elsif (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i == $#kinases_PhosphoSite)) { | |
925 print OUT "X\n"; | |
926 } | |
927 elsif (!exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]}) && ($i < $#kinases_PhosphoSite)) { | |
928 print OUT "\t"; | |
929 } | |
930 else { | |
931 print OUT "\n"; | |
932 } | |
933 } | |
934 } | |
935 | |
936 close OUT; | |
937 | |
938 | |
939 my @timeData = localtime(time); | |
940 print "\nFinished $timeData[2]:$timeData[1]:$timeData[0]\n\n"; | |
941 | |
942 ############################################################################################################################### | |
943 sub replace_pSpTpY { | |
944 my ($formatted_sequence, $phospho_type) = @_; | |
945 if ($phospho_type eq 'y') { | |
946 $formatted_sequence =~ s/pS/S/g; | |
947 $formatted_sequence =~ s/pT/T/g; | |
948 $formatted_sequence =~ s/pY/y/g; | |
949 } | |
950 elsif ($phospho_type eq "sty") { | |
951 $formatted_sequence =~ s/pS/s/g; | |
952 $formatted_sequence =~ s/pT/t/g; | |
953 $formatted_sequence =~ s/pY/y/g; | |
954 } | |
955 $formatted_sequence; | |
956 } | |
957 |