view PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 0:56658e35798d draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/phosphopeptide_kinase_mapping commit d256bec9d43378291734e2b2a93bdbfcc2d83f61"
author galaxyp
date Thu, 04 Nov 2021 19:37:36 +0000
parents
children
line wrap: on
line source

#!/usr/local/bin/perl

use Getopt::Std;

###############################################################################################################################
#	perl Kinase_enrichment_analysis_complete_v0.pl
#
#	Nick Graham, USC
#	2016-02-27
#
#	Built from scripts written by NG at UCLA in Tom Graeber's lab:
#		CombinePhosphoSites.pl
#		Retrieve_p_motifs.pl
#		NetworKIN_Motif_Finder_v7.pl
#
#	Given a list of phospho-peptides, find protein information and upstream kinases.
#	Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
#
###############################################################################################################################

# Settings taken from the command line: input data path (-i), how identical
# phospho-sites are combined (-F), output path (-o) and phospho-residue mode (-P).
my ($file_in, $average_or_sum, $file_out, $phospho_type);
# Paths to the reference files: FASTA (-f), NetworKIN (-n), motifs (-m),
# PhosphoSite kinase-substrate data (-p) and PhosphoSite regulatory sites (-r).
my ($fasta_in, $networkin_in, $motifs_in, $PhosphoSite_in, $PhosphoSite_molecular_function);

##########
## opts ##
##########
## input files
# i : path to input outputfile_STEP2.txt
# f : path to fasta
# n : path to NetworKIN_201612_cutoffscore2.0.txt
# m : path to pSTY_Motifs.txt
# p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
# r : path to 2017-03_PSP_Regulatory_sites.txt
## options
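# P : phospho_type, either y or sty (default: sty)
# F : function applied to identical phospho-sites, either sum or average (default: sum)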
## output files
# o : path to output file

# Print the command-line help text to STDERR and terminate the program.
# Takes no arguments and never returns: it ends with exit.
# The synopsis lists the switches actually accepted by the getopts() call
# ('i:f:n:m:p:r:o:P:F:h'); -i, -f, -n, -m, -p and -r are mandatory.
sub usage()
    {
        print STDERR <<"EOH";
    Given a list of phospho-peptides, this program finds protein information and upstream kinases.
    usage: $0 [-h] -i file -f file -n file -m file -p file -r file [-P y|sty] [-F sum|average] [-o file]
     -h : this (help) message
     -i : path to input outputfile_STEP2.txt
     -f : path to fasta
     -n : path to NetworKIN_201612_cutoffscore2.0.txt
     -m : path to pSTY_Motifs.txt
     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
     -r : path to 2017-03_PSP_Regulatory_sites.txt
     -P : phospho_type, either y or sty (default: sty)
     -F : function applied to identical phospho-sites, either sum or average (default: sum)
     -o : path to output file (default: output.tsv)
    example: $0 -i outputfile_STEP2.txt -f proteome.fasta -n NetworKIN_201612_cutoffscore2.0.txt -m pSTY_Motifs.txt -p 2017-03_PSP_Kinase_Substrate_Dataset.txt -r 2017-03_PSP_Regulatory_sites.txt -o output.tsv
EOH
        exit;
    }

# Collect the command-line switches; a letter followed by ':' takes a value.
my %opts;
getopts('i:f:n:m:p:r:o:P:F:h', \%opts);

# -h wins over everything else: show the help text and stop.
usage() if (exists($opts{'h'}));

# Every mandatory switch must be present and must name an existing path.
# The entries are checked in this order, so the first problem found is the one reported.
my @required_paths = (
	[ 'i', \$file_in,                        'Input File not found' ],
	[ 'f', \$fasta_in,                       'Input Fasta File not found' ],
	[ 'n', \$networkin_in,                   'Input NetworKIN File not found' ],
	[ 'm', \$motifs_in,                      'Input pSTY_Motifs File not found' ],
	[ 'p', \$PhosphoSite_in,                 'Input PSP_Kinase_Substrate_Dataset File not found' ],
	[ 'r', \$PhosphoSite_molecular_function, 'Input PSP_Regulatory_sites File not found' ],
);
for my $entry (@required_paths) {
	my ($switch, $destination, $complaint) = @{$entry};
	die($complaint) unless (exists($opts{$switch}) && -e $opts{$switch});
	${$destination} = $opts{$switch};
}

# Optional switches fall back to a default when they are absent.
$phospho_type   = exists($opts{'P'}) ? $opts{'P'} : "sty";
$average_or_sum = exists($opts{'F'}) ? $opts{'F'} : "sum";
$file_out       = exists($opts{'o'}) ? $opts{'o'} : "output.tsv";


###############################################################################################################################
# Print the relevant file names to the screen, then validate the phospho-residue mode
###############################################################################################################################
print "\nData file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\n\n";
# The NetworKIN path is interpolated here; without the sigil the literal word "networkin_in" was printed.
print "Motifs file:  $motifs_in\nNetworKIN file:  $networkin_in\nPhosphosite kinase substrate data:  $PhosphoSite_in\nPhosphosite functional data:  $PhosphoSite_molecular_function\nFASTA file:  $fasta_in\n\n";


print "\nPhospho-residues(s) = $phospho_type\n";
# Only two modes are supported: 'y' and 'sty'.
if (($phospho_type ne 'y') && ($phospho_type ne 'sty')) {
	die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
}

###############################################################################################################################
# read the input data file
# average or sum identical phospho-sites, depending on the value of $average_or_sum
###############################################################################################################################

die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $data_fh, '<', $file_in) or die "I couldn't find the input file:  $file_in\n";

# @samples  : sample names from the header row (every column after the first)
# %data     : phospho-peptide => array ref of per-sample values, merged across duplicate rows
# %n        : phospho-peptide => number of rows merged so far (needed for the running average)
# @tmp_data : scratch array, reused when the output rows are written
my (@samples, %data, @tmp_data, %n);
my $line = 0;

while (my $row = <$data_fh>) {
	chomp $row;
	my @x = split(/\t/, $row);
	# remove stray carriage returns, newlines and double quotes from every field
	for my $field (@x) { $field =~ tr/\r\n"//d; }

	# The first row holds the sample names
	if ($line == 0) {
		push (@samples, @x[1 .. $#x]);
		$line++;
		next;
	}

	# A row without any value column carries nothing to store or merge
	next if (@x < 2);

	my $peptide = $x[0];

	# First time this phospho-peptide is seen: store its values as they are
	if (!exists($data{$peptide})) {
		$data{$peptide} = [ @x[1 .. $#x] ];
		$n{$peptide} = 1;
		next;
	}

	# Duplicate phospho-peptide: merge the new row into the stored one, column by column.
	# Column $k of the stored row corresponds to field $k+1 of the input row.
	my $stored = $data{$peptide};
	for my $k (0 .. $#{$stored}) {
		my $incoming = defined($x[$k+1]) ? $x[$k+1] : 0;	# a missing trailing field counts as 0
		if ($average_or_sum eq "sum") {
			$stored->[$k] = $stored->[$k] + $incoming;
		} else {
			# running mean over the $n{$peptide} rows merged so far plus this one
			$stored->[$k] = ( $stored->[$k]*$n{$peptide} + $incoming ) / ($n{$peptide} + 1);
		}
	}
	$n{$peptide}++;
}
close($data_fh);


###############################################################################################################################
# Search the FASTA database for phospho-sites and motifs
#
# based on Retrieve_p_peptide_motifs_v2.pl
###############################################################################################################################


###############################################################################################################################
#
#	Read in the Data file:
#		1) make @p_peptides array as in the original file
#		2) make @non_p_peptides array w/o residue modifications (p, #, other)
#
###############################################################################################################################

# Two parallel arrays, one entry per key of %data:
#   @p_peptides     : the peptide with lower-case s/t/y rewritten as pS/pT/pY
#   @non_p_peptides : the same string with every 'p' removed, used for the FASTA search
my (@p_peptides, @non_p_peptides);
for my $data_key (keys %data) {
	my $marked = $data_key;
	for my $letter ('s', 't', 'y') {
		my $replacement = "p".uc($letter);
		$marked =~ s/$letter/$replacement/g;
	}
	push (@p_peptides, $marked);

	(my $plain = $marked) =~ s/p//g;
	push (@non_p_peptides, $plain);
}

###############################################################################################################################
#
#	Read in the FASTA sequence file, save them to the @sequences array
#
###############################################################################################################################

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $fasta_fh, '<', $fasta_in) or die "I couldn't find $fasta_in\n";

# Parallel arrays, one entry per FASTA record. Header lines are split on '|':
# field 1 is stored as the accession and field 2 as the name.
my (@accessions, @names, @sequences);

print "Reading FASTA file $fasta_in\n";
while (my $fasta_line = <$fasta_fh>) {
	chomp $fasta_line;
	my (@x) = split(/\|/, $fasta_line);
	next if (!@x);		# blank line
	for my $part (@x) { $part =~ tr/\r\n"//d; }

	if ($x[0] =~ /^>/) {
		# header line: start a new record
		push (@names, $x[2]);
		push (@accessions, $x[1]);
	} elsif ($x[0] =~ /^\w/) {
		# sequence line: append to the record opened by the most recent header.
		# Sequence text that appears before any header has no record to belong to
		# (and indexing $sequences[-1] on an empty array would be fatal), so skip it.
		next if (!@accessions);
		$sequences[$#accessions] .= $x[0];
	}
}
close $fasta_fh;
print "Done Reading FASTA file $fasta_in\n\n";


###############################################################################################################################
#
#	Match the non_p_peptides to the @sequences array:
#		for every peptide, record each protein sequence that contains it together with
#		that protein's accession, name and the 0-based offset of the peptide in the sequence
#
###############################################################################################################################

# All four hashes are keyed by the phospho-peptide as spelled in @p_peptides.
# %matched_sequences holds an array ref of sequences, or the string "Failed match"
# when no sequence contains the peptide; the other three hold array refs that run
# parallel to the matched sequences.
# (A print to the not-yet-opened OUT handle used to sit here; the output header is
# written where the output file is opened.)
my (%matched_sequences, %accessions, %names, %sites);

for my $j (0 .. $#p_peptides) {
	my (@hit_sequences, @hit_accessions, @hit_names, @hit_offsets);

	#Find the matching protein sequence(s) for the peptide
	for my $k (0 .. $#sequences) {
		my $offset = index($sequences[$k], $non_p_peptides[$j]);
		next if ($offset == -1);
		push(@hit_sequences, $sequences[$k]);
		push(@hit_accessions, $accessions[$k]);
		push(@hit_names, $names[$k]);
		push(@hit_offsets, $offset);
	}

	# No sequence contains the peptide: warn and leave the marker string behind
	if (!@hit_sequences) {
		print "Warning:  Failed match for $p_peptides[$j]\n";
		$matched_sequences{$p_peptides[$j]} = "Failed match";
		next;
	}

	$matched_sequences{$p_peptides[$j]} = \@hit_sequences;
	$accessions{$p_peptides[$j]} = \@hit_accessions;
	$names{$p_peptides[$j]} = \@hit_names;
	$sites{$p_peptides[$j]} = \@hit_offsets;
}

# Return the 0-based positions of the phosphorylated residues in a phospho-peptide.
# The peptide marks a phospho-residue with a leading 'p' (e.g. "ApSBpTC"); the
# positions returned refer to the peptide with those 'p' markers removed
# (1 and 3 in the example). '#', digits, '_' and '.' are stripped before counting.
sub _phospho_offsets_in_peptide {
	my ($p_peptide) = @_;
	$p_peptide =~ s/[#\d_.]//g;
	my @offsets;
	my $search_from = 0;
	while ((my $hit = index($p_peptide, 'p', $search_from)) != -1) {
		push (@offsets, $hit);
		substr($p_peptide, $hit, 1, "");	# drop the marker; the residue now sits at $hit
		$search_from = $hit + 1;
	}
	return @offsets;
}

# Build the numbered motif for one protein sequence.
#   $protein_sequence : the full protein sequence
#   @site_positions   : 0-based positions of the phospho-residues in that sequence, ascending
# Returns (motif string, array ref of the residue letters found at the positions).
# The motif spans up to 10 residues left of the first site and up to 10 residues right
# of the last site, has "p" inserted before each phospho S/T/Y and is wrapped as
# "<first residue number>-[ MOTIF ]-<last residue number>" with 1-based numbers.
sub _numbered_phospho_motif {
	my ($protein_sequence, @site_positions) = @_;
	my $first_site = $site_positions[0];
	my $last_site = $site_positions[-1];
	my $seq_length = length($protein_sequence);

	# fewer than 10 residues left of the first p-site?
	# eg, XXXpYXX: first site is at position 3, so only 3 residues are available
	my $flank_left = ($first_site - 10 < 0) ? $first_site : 10;

	# fewer than 10 residues right of the last p-site?
	# eg, XXXpYXX: length 6, last site at position 3, so only 2 residues are available.
	# ">=" because position $last_site + 10 must itself be inside the sequence.
	my $flank_right = ($last_site + 10 >= $seq_length) ? $seq_length - ($last_site + 1) : 10;

	my $total_length = $flank_left + $last_site - $first_site + $flank_right + 1;
	my $motif = substr($protein_sequence, $first_site - $flank_left, $total_length);

	# Mark the phospho-residues in lower case first, so that inserting the "p"
	# markers afterwards does not shift the positions still to be processed.
	my @site_residues;
	for my $position (@site_positions) {
		my $motif_index = $flank_left + $position - $first_site;
		my $residue = substr($motif, $motif_index, 1);
		push (@site_residues, $residue);
		if ($residue =~ /\A[STY]\z/) { substr($motif, $motif_index, 1, lc($residue)); }
	}
	$motif =~ s/s/pS/g; $motif =~ s/t/pT/g; $motif =~ s/y/pY/g;

	my $left_residue = $first_site - $flank_left + 1;
	my $right_residue = $last_site + $flank_right + 1;
	return ($left_residue."-[ ".$motif." ]-".$right_residue, \@site_residues);
}

# Phospho-site bookkeeping, all keyed by phospho-peptide:
#   %p_motifs   : array ref with one numbered motif per matched protein sequence
#   %p_residues : {match index} => array ref of 0-based phospho-site positions in that protein
#   %residues   : {match index} => array ref of the residue letters found at those positions
# @tmp_p_residues is a scratch array that is reused when the output file is written.
my (%p_residues, @tmp_p_residues, %p_motifs, %residues);

for my $peptide_to_match ( keys %matched_sequences ) {
	# Peptides without a FASTA hit hold the marker string "Failed match" instead of an array reference
	next if (!ref($matched_sequences{$peptide_to_match}));
	my @matches = @{$matched_sequences{$peptide_to_match}};

	# Find the location of the phospho-site(s) within the peptide; this is the same for every match
	my @peptide_offsets = _phospho_offsets_in_peptide($peptide_to_match);

	my @motifs_for_peptide;
	for my $i (0 .. $#matches) {
		next if (!@peptide_offsets);	# no phospho-residue, so no motif to build

		# Collect ALL phospho-site positions in the protein before the motif is built,
		# so that exactly one motif is produced per matched sequence.
		my @protein_sites = map { $_ + $sites{$peptide_to_match}[$i] } @peptide_offsets;

		my ($motif, $site_residues) = _numbered_phospho_motif($matches[$i], @protein_sites);
		push(@motifs_for_peptide, $motif);
		$residues{$peptide_to_match}{$i} = $site_residues;
		$p_residues{$peptide_to_match}{$i} = [ @protein_sites ];
	}
	$p_motifs{$peptide_to_match} = [ @motifs_for_peptide ];
}


###############################################################################################################################
#
#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
#
###############################################################################################################################



###############################################################################################################################
#
#	Read the NetworKIN_predictions file:
#		1) make a "kinases_observed" array
#		2) annotate the phospho-substrates with the appropriate kinase
#
###############################################################################################################################

# @kinases_observed  : kinase names (column 2) in order of first appearance
# $kinases           : hash ref used as the "already seen" set for kinase names
# $p_sequence_kinase : hash ref whose keys are "<column 10>_<column 2>" strings,
#                      eg, REEILsEMKKV_PKCalpha
my (@kinases_observed, $kinases);
my ($p_sequence_kinase);

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $networkin_fh, '<', $networkin_in) or die "I couldn't find $networkin_in\n";
print "\nReading the NetworKIN data:  $networkin_in\n";
while (my $record = <$networkin_fh>) {
	chomp $record;
	my (@x) = split(/\t/, $record);
	for my $field (@x) { $field =~ tr/\r\n"//d; }

	# skip the header row
	next if (defined($x[0]) && $x[0] eq "#substrate");
	# skip blank or truncated rows: without a kinase name they would add an
	# empty kinase and, later, an empty column to the output
	next if (!defined($x[2]) || $x[2] eq "");

	my $kinase_name = $x[2];
	if (!exists($kinases -> {$kinase_name})) {
		$kinases -> {$kinase_name} = $kinase_name;
		push (@kinases_observed, $kinase_name);
	}

	my $sequence_window = defined($x[10]) ? $x[10] : "";
	my $tmp = $sequence_window."_".$kinase_name;	#eg, REEILsEMKKV_PKCalpha
	$p_sequence_kinase -> {$tmp} = $tmp;
}
close $networkin_fh;

###############################################################################################################################
#
#	Read the Kinase motifs file:
#		1) make a "motif_sequence" array
#
###############################################################################################################################

# @motif_sequence : motif patterns (column 1) in order of first appearance
# %motif_type     : pattern => description (column 2); descriptions of repeated patterns are joined with " & "
# %motif_count    : pattern => 0, initialised for every pattern
my (@motif_sequence, %motif_type, %motif_count);

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $motifs_fh, '<', $motifs_in) or die "I couldn't find $motifs_in\n";
print "Reading the Motifs file:  $motifs_in\n";

while (my $record = <$motifs_fh>) {
	chomp $record;
	my (@x) = split(/\t/, $record);
	for my $field (@x) { $field =~ tr/\r\n"//d; }

	# Skip rows without a pattern. An empty pattern must never reach the matching
	# step: Perl treats an empty regex as "reuse the last successful pattern".
	next if (!defined($x[1]) || $x[1] eq "");

	my ($pattern, $description) = @x[1, 2];
	if (exists ($motif_type{$pattern})) {
		$motif_type{$pattern} = $motif_type{$pattern}." & ".$description;
	} else {
		$motif_type{$pattern} = $description;
		$motif_count{$pattern} = 0;
		push (@motif_sequence, $pattern);
	}
}
close ($motifs_fh);


###############################################################################################################################
#  6.28.2011
#	Read PhosphoSite data:
#		1) make a "kinases_PhosphoSite" array
#		2) annotate the phospho-substrates with the appropriate kinase
#
###############################################################################################################################


# @kinases_PhosphoSite           : kinase names (column 0) in order of first appearance
# $kinases_PhosphoSite           : hash ref used as the "already seen" set for kinase names
# $p_sequence_kinase_PhosphoSite : hash ref whose keys are "<column 11>_<column 0>" strings,
#                                  eg, RTPGRPLsSYGMDSR_PAK2
my (@kinases_PhosphoSite, $kinases_PhosphoSite);
my ($p_sequence_kinase_PhosphoSite);

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $kinase_substrate_fh, '<', $PhosphoSite_in) or die "I couldn't find $PhosphoSite_in\n";
print "Reading the PhosphoSite data:  $PhosphoSite_in\n";

my $kinase_substrate_header_pending = 1;
while (my $record = <$kinase_substrate_fh>) {
	# the first line of the file is always skipped
	if ($kinase_substrate_header_pending) { $kinase_substrate_header_pending = 0; next; }

	chomp $record;
	my (@x) = split(/\t/, $record);
	for my $field (@x) { $field =~ tr/\r\n"//d; }

	# skip blank rows: without a kinase name they would add an empty kinase
	# and, later, an empty column to the output
	next if (!defined($x[0]) || $x[0] eq "");

	my $kinase_name = $x[0];
	if (!exists ($kinases_PhosphoSite -> {$kinase_name})) {
		$kinases_PhosphoSite -> {$kinase_name} = $kinase_name;
		push (@kinases_PhosphoSite, $kinase_name);
	}

	# Replace the superfluous lower case s, t and y: every lower-case s/t/y is
	# upper-cased except the one at index 7, which is left as it is.
	my $sequence_window = defined($x[11]) ? $x[11] : "";
	for my $position (0 .. length($sequence_window) - 1) {
		next if ($position == 7);
		my $residue = substr($sequence_window, $position, 1);
		if ($residue =~ /\A[sty]\z/) { substr($sequence_window, $position, 1, uc($residue)); }
	}

	my $tmp = $sequence_window."_".$kinase_name;		#eg, RTPGRPLsSYGMDSR_PAK2
	$p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
}
close $kinase_substrate_fh;


###############################################################################################################################
#  Read PhosphoSite regulatory site data:
#		1) make a "regulatory_sites_PhosphoSite" hash
#
###############################################################################################################################


# %regulatory_sites_PhosphoSite : set of the site sequences (column 9) present in the file
# The remaining hashes map a site sequence to one annotation column each.
my (%regulatory_sites_PhosphoSite);
my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes);

# Annotation hash paired with the input column it is filled from
my @regulatory_columns = (
	[ \%domain,            10 ],
	[ \%ON_FUNCTION,       11 ],
	[ \%ON_PROCESS,        12 ],
	[ \%ON_PROT_INTERACT,  13 ],
	[ \%ON_OTHER_INTERACT, 14 ],
	[ \%notes,             19 ],
);

# Three-argument open with a lexical handle: the path is never parsed for a mode or a pipe.
open (my $regulatory_fh, '<', $PhosphoSite_molecular_function) or die "I couldn't find $PhosphoSite_molecular_function\n";
print "Reading the PhosphoSite regulatory site data:  $PhosphoSite_molecular_function\n";

my $regulatory_header_pending = 1;
while (my $record = <$regulatory_fh>) {
	# the first line of the file is always skipped
	if ($regulatory_header_pending) { $regulatory_header_pending = 0; next; }

	chomp $record;
	my (@x) = split(/\t/, $record);
	for my $field (@x) { $field =~ tr/\r\n"//d; }

	# skip rows without a site sequence; an empty key would match every
	# peptide whose sequence window could not be determined
	next if (!defined($x[9]) || $x[9] eq "");

	my $site_sequence = $x[9];
	my $first_occurrence = !exists($regulatory_sites_PhosphoSite{$site_sequence});
	$regulatory_sites_PhosphoSite{$site_sequence} = $site_sequence if ($first_occurrence);

	# Each annotation is taken from ITS OWN column. When the site occurs on several
	# rows, the non-empty values are joined with " / ".
	for my $pair (@regulatory_columns) {
		my ($annotation, $column) = @{$pair};
		my $incoming = defined($x[$column]) ? $x[$column] : "";
		my $current = $annotation->{$site_sequence};
		if ($first_occurrence || !defined($current) || $current eq "") {
			$annotation->{$site_sequence} = $incoming;
		} elsif ($incoming ne "") {
			$annotation->{$site_sequence} = $current." / ".$incoming;
		}
	}
}
close $regulatory_fh;

###############################################################################################################################
#
#	Read the data file:
#		1) find sequences that match the NetworKIN predictions
#		2) find motifs that match the observed sequences
#
###############################################################################################################################

# %unique_motifs : phospho-peptide => array ref of its distinct motifs, residue numbers stripped
my (%unique_motifs);
# Match tables filled here and read back when the output rows are printed:
#   {phospho-peptide}{kinase name or motif pattern} => "X"
my (%kinase_substrate_NetworKIN_matches, %kinase_motif_matches, %kinase_substrate_PhosphoSite_matches);
# Regulatory-site annotations collected per phospho-peptide
my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %ON_OTHER_INTERACT_2, %notes_2);

# Per-peptide annotation hash paired with the per-site hash it is filled from
my @annotation_pairs = (
	[ \%domain_2,            \%domain ],
	[ \%ON_FUNCTION_2,       \%ON_FUNCTION ],
	[ \%ON_PROCESS_2,        \%ON_PROCESS ],
	[ \%ON_PROT_INTERACT_2,  \%ON_PROT_INTERACT ],
	[ \%ON_OTHER_INTERACT_2, \%ON_OTHER_INTERACT ],
	[ \%notes_2,             \%notes ],
);

# Return the phospho-residue (lower case) together with up to $flank word characters
# on either side, or undef when the sequence holds no such residue.
# In 'y' mode only a lower-case y counts; in 'sty' mode a lower-case s, t or y.
my $extract_window = sub {
	my ($sequence, $flank) = @_;
	return undef if (!defined($sequence));
	my $center = ($phospho_type eq 'y')   ? 'y'
	           : ($phospho_type eq 'sty') ? '[sty]'
	           :                            undef;
	return undef if (!defined($center));
	my $pattern = '(\w{0,'.$flank.'}'.$center.'\w{0,'.$flank.'})';
	my ($window) = $sequence =~ /$pattern/;
	return $window;
};

# Flag every kinase / motif that matches one singly-phosphorylated sequence.
#   $seq_plus5aa   : site +/- 5 residues, compared with the NetworKIN keys
#   $seq_plus7aa   : site +/- 7 residues, compared with the PhosphoSite keys
#   $motif_subject : string tested against each motif pattern
my $record_kinase_hits = sub {
	my ($peptide, $seq_plus5aa, $seq_plus7aa, $motif_subject) = @_;
	if (defined($seq_plus5aa)) {
		for my $kinase_name (@kinases_observed) {
			my $tmp = $seq_plus5aa."_".$kinase_name;	#eg, should be PGRPLsSYGMD_PKCalpha
			if (exists($p_sequence_kinase -> {$tmp})) {
				$kinase_substrate_NetworKIN_matches{$peptide}{$kinase_name} = "X";
			}
		}
	}
	if (defined($motif_subject)) {
		for my $pattern (@motif_sequence) {
			# an empty pattern would silently reuse the last successful regex
			next if (!defined($pattern) || $pattern eq "");
			if ($motif_subject =~ /$pattern/) {
				$kinase_motif_matches{$peptide}{$pattern} = "X";
			}
		}
	}
	if (defined($seq_plus7aa)) {
		for my $kinase_name (@kinases_PhosphoSite) {
			my $tmp = $seq_plus7aa."_".$kinase_name;	#eg, should be RTPGRPLsSYGMDSR_PAK2
			if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
				$kinase_substrate_PhosphoSite_matches{$peptide}{$kinase_name} = "X";
			}
		}
	}
};

# Copy the regulatory annotations of one site onto the peptide.
# With $append false the peptide's values are overwritten; with $append true the
# non-empty values are joined onto what is already there with " / ".
my $record_regulatory_site = sub {
	my ($peptide, $seq_plus7aa, $append) = @_;
	return if (!defined($seq_plus7aa));
	return if (!exists($regulatory_sites_PhosphoSite{$seq_plus7aa}));
	for my $pair (@annotation_pairs) {
		my ($per_peptide, $per_site) = @{$pair};
		my $incoming = $per_site->{$seq_plus7aa};
		if (!$append) {
			$per_peptide->{$peptide} = $incoming;
			next;
		}
		my $current = $per_peptide->{$peptide};
		if (!defined($current) || $current eq "") {
			$per_peptide->{$peptide} = $incoming;
		} elsif (defined($incoming) && $incoming ne "") {
			$per_peptide->{$peptide} = $current." / ".$incoming;
		}
	}
};

foreach my $peptide (keys %data) {
	# find the unique phospho-motifs for this $peptide: one motif per matched
	# sequence, with the "<number>-[ " prefix and " ]-<number>" suffix removed
	my $matched = $matched_sequences{$peptide};
	my $match_count = (ref($matched) eq 'ARRAY') ? scalar(@{$matched}) : 0;
	$unique_motifs{$peptide} ||= [];
	my %seen = ();
	for my $i (0 .. $match_count - 1) {
		my $motif = $p_motifs{$peptide}[$i];
		next if (!defined($motif));
		$motif =~ s/\d+-\[\s//;  $motif =~ s/\s\]\-\d+//;
		next if ($seen{$motif}++);
		push(@{$unique_motifs{$peptide}}, $motif);
	}

	# count the number of phospho-sites in the first unique motif; the count
	# decides how every unique motif of this peptide is handled
	my $site_count = 0;
	my $first_motif = $unique_motifs{$peptide}[0];
	if (defined($first_motif)) {
		if ($phospho_type eq 'y')      { $site_count = () = $first_motif =~ /pY/g; }
		elsif ($phospho_type eq 'sty') { $site_count = () = $first_motif =~ /pS|pT|pY/g; }
	}

	# search each of the unique motifs for matches
	for my $tmp_motif (@{$unique_motifs{$peptide}}) {
		if ($site_count == 1) {
			my $formatted_sequence = replace_pSpTpY($tmp_motif, $phospho_type);
			my $seq_plus5aa = $extract_window->($formatted_sequence, 5);
			my $seq_plus7aa = $extract_window->($formatted_sequence, 7);
			# the motif patterns are tested against the peptide itself
			$record_kinase_hits->($peptide, $seq_plus5aa, $seq_plus7aa, $peptide);
			$record_regulatory_site->($peptide, $seq_plus7aa, 0);
		}
		elsif ($site_count > 1) {
			# Create the sequences with only one phosphorylation site
			# eg, VIYFQAIEEVpYpYDHLRSAAKKR becomes VIYFQAIEEVpYYDHLRSAAKKR and VIYFQAIEEVYpYDHLRSAAKKR

			# positions of every "p" marker; holds the single value -1 when there is none
			my @sites = (index($tmp_motif, "p"));
			while ($sites[-1] != -1) {
				my $next_p_site = index($tmp_motif, "p", $sites[-1] + 1);
				last if ($next_p_site == -1);
				push (@sites, $next_p_site);
			}

			for my $n (0 .. $#sites) {
				# keep marker $n and delete the others, right to left so the
				# positions still to be processed stay valid
				my $single_site_sequence = $tmp_motif;
				for my $m (reverse(0 .. $#sites)) {
					next if ($m == $n);
					substr($single_site_sequence, $sites[$m], 1) = "";
				}

				my $formatted_sequence = replace_pSpTpY($single_site_sequence, $phospho_type);
				my $seq_plus5aa = $extract_window->($formatted_sequence, 5);
				my $seq_plus7aa = $extract_window->($formatted_sequence, 7);
				# NOTE(review): the motif patterns are tested against the single-site
				# sequence here, whereas the single-site branch above tests the peptide;
				# confirm which subject is intended.
				$record_kinase_hits->($peptide, $seq_plus5aa, $seq_plus7aa, $single_site_sequence);
				$record_regulatory_site->($peptide, $seq_plus7aa, 1);
			}
		}
	}
}


###############################################################################################################################
#
# Print to the output file
#
###############################################################################################################################
# Three-argument open: the output path is never parsed for a mode or a pipe.
# The handle stays the bareword OUT because the row-printing code writes to OUT.
open (OUT, '>', $file_out) or die "could not open the fileout: $file_out ($!)";

# Collect every column title, then write the header as ONE tab-separated line.
# Joining guarantees a terminating newline even when a kinase list is empty.
my @header_columns = (
	# peptide and protein information
	"p-peptide", "Protein description", "Gene name(s)", "FASTA name", "Phospho-sites",
	"Unique phospho-motifs, no residue numbers", "Accessions",
	"Phospho-motifs for all members of protein group with residue numbers",
	# PhosphoSite regulatory data
	"Domain", "ON_FUNCTION", "ON_PROCESS", "ON_PROT_INTERACT", "ON_OTHER_INTERACT", "PhosphoSite notes",
);

# the sample names
push(@header_columns, @samples);

# the kinases and groups
push(@header_columns, map { $_."_NetworKIN" } @kinases_observed);
push(@header_columns, map { "$motif_type{$_} ($_)" } @motif_sequence);
push(@header_columns, map { $_."_PhosphoSite" } @kinases_PhosphoSite);

print OUT join("\t", @header_columns), "\n";


# Write one tab-separated row per phospho-peptide. Every row is assembled as a
# list of cells and joined at the end, so each row has the same number of
# columns and always ends with a newline.
foreach my $peptide (keys %data) {
	# the peptide itself
	my @cells = ($peptide);

	my $matched = $matched_sequences{$peptide};
	if (defined($matched) && !ref($matched) && $matched eq "Failed match") {
		# failed matches get placeholders for the seven protein-related columns
		push(@cells, "Sequence not found in FASTA database", ("NA") x 6);
	} else {
		my @protein_names = map { defined($_) ? $_ : "" } @{ $names{$peptide} || [] };

		# Just the protein description: the text before " OS", minus its first word
		my @description = ();
		for my $long_name (@protein_names) {
			my ($before_os) = split(/\sOS/, $long_name);
			my @words = split(/\s/, defined($before_os) ? $before_os : "");
			shift(@words);
			push(@description, join(" ", @words));
		}
		push(@cells, join(" /// ", @description));

		# Just the gene name: the word following "GN=", each distinct name once
		my @gene = ();
		my %seen_gene = ();
		for my $long_name (@protein_names) {
			my $tmp_gene = $long_name;
			$tmp_gene =~ s/^.*GN=//;
			$tmp_gene =~ s/\s.*//;
			push(@gene, $tmp_gene) unless ($seen_gene{$tmp_gene}++);
		}
		push(@cells, join(" /// ", @gene));

		# the entire names
		push(@cells, join(" /// ", @protein_names));

		# The phospho-residues: "p<residue><1-based position>", sites of one protein
		# joined with ", " and proteins joined with " /// ". The cell is always
		# emitted, even when no phospho-residue is known, so later columns never shift.
		my $match_count = (ref($matched) eq 'ARRAY') ? scalar(@{$matched}) : 0;
		my @site_groups = ();
		for my $i (0 .. $match_count - 1) {
			my $positions = $p_residues{$peptide}{$i} || [];
			my $letters = $residues{$peptide}{$i} || [];
			my @labels = ();
			for my $j (0 .. $#{$positions}) {
				my $letter = defined($letters->[$j]) ? $letters->[$j] : "";
				my $tmp_site_for_printing = $positions->[$j] + 1;	# stored positions are 0-based
				push(@labels, "p".$letter.$tmp_site_for_printing);
			}
			push(@site_groups, join(", ", @labels)) if (@labels);
		}
		push(@cells, join(" /// ", @site_groups));

		# the UNIQUE phospho-motifs
		push(@cells, join(" /// ", @{ $unique_motifs{$peptide} || [] }));

		# the accessions
		push(@cells, join(" /// ", @{ $accessions{$peptide} || [] }));

		# ALL motifs with residue numbers
		push(@cells, join(" /// ", @{ $p_motifs{$peptide} || [] }));
	}

	# the PhosphoSite regulatory data, an empty cell where nothing was found
	for my $annotation (\%domain_2, \%ON_FUNCTION_2, \%ON_PROCESS_2, \%ON_PROT_INTERACT_2, \%ON_OTHER_INTERACT_2, \%notes_2) {
		push(@cells, defined($annotation->{$peptide}) ? $annotation->{$peptide} : "");
	}

	# the data values
	push(@cells, join("\t", @{ $data{$peptide} || [] }));

	# the kinase-substrate data: "X" where a match was recorded, otherwise empty
	push(@cells, map { exists($kinase_substrate_NetworKIN_matches{$peptide}{$_}) ? "X" : "" } @kinases_observed);
	push(@cells, map { exists($kinase_motif_matches{$peptide}{$_}) ? "X" : "" } @motif_sequence);
	push(@cells, map { exists($kinase_substrate_PhosphoSite_matches{$peptide}{$_}) ? "X" : "" } @kinases_PhosphoSite);

	print OUT join("\t", @cells), "\n";
}

# buffered write errors only surface when the handle is closed
close(OUT) or die "could not close the fileout: $file_out ($!)";


# Report the local completion time as zero-padded HH:MM:SS.
my ($finish_sec, $finish_min, $finish_hour) = localtime(time);
printf("\nFinished %02d:%02d:%02d\n\n", $finish_hour, $finish_min, $finish_sec);

###############################################################################################################################
# Rewrite the "pS" / "pT" / "pY" markers of a sequence according to the phospho-type.
#   $formatted_sequence : sequence containing pS, pT and/or pY markers
#   $phospho_type       : 'y'   -> pS becomes S, pT becomes T, pY becomes y
#                         'sty' -> pS becomes s, pT becomes t, pY becomes y
# Any other phospho-type leaves the sequence untouched.
# Returns the rewritten sequence.
sub replace_pSpTpY {
	my ($formatted_sequence, $phospho_type) = @_;

	# replacement for the residue letter that follows a "p", per phospho-type
	my %replacement_for = (
		'y'   => { 'S' => 'S', 'T' => 'T', 'Y' => 'y' },
		'sty' => { 'S' => 's', 'T' => 't', 'Y' => 'y' },
	);

	my $table = defined($phospho_type) ? $replacement_for{$phospho_type} : undef;
	if ($table) {
		$formatted_sequence =~ s/p([STY])/$table->{$1}/g;
	}
	return $formatted_sequence;
}