Mercurial > repos > eschen42 > mqppep_anova
changeset 6:922d309640db draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9dfb7e07a3673d7de4b0a1b7e6ce1b75a8a4f42b"
author | eschen42 |
---|---|
date | Fri, 11 Mar 2022 20:04:05 +0000 |
parents | d4d531006735 |
children | d728198f1ba5 |
files | MaxQuantProcessingScript.R PhosphoPeptide_Upstream_Kinase_Mapping.pl mqppep_mrgfltr.py repository_dependencies.xml search_ppep.py |
diffstat | 5 files changed, 159 insertions(+), 79 deletions(-) [+] |
line wrap: on
line diff
--- a/MaxQuantProcessingScript.R Thu Mar 10 23:42:48 2022 +0000 +++ b/MaxQuantProcessingScript.R Fri Mar 11 20:04:05 2022 +0000 @@ -521,6 +521,14 @@ # --- quant_write <- cbind(metadata_df[, "Sequence window"], quant_data) colnames(quant_write)[1] <- "Sequence.Window" +write.table( + quant_write, + file = quant_file_name, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE +) # ... @@ -568,14 +576,6 @@ pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") # ... -write.table( - quant_data_qc_collapsed, - file = quant_file_name, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE -) # Compute and visualize breakdown of pY, pS, and pT before enrichment filter # ---
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl Thu Mar 10 23:42:48 2022 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Fri Mar 11 20:04:05 2022 +0000 @@ -25,7 +25,8 @@ ############################################################################################################################### use strict; -use warnings; +#ACE use warnings; +use warnings 'FATAL' => 'all'; use Getopt::Std; use DBD::SQLite::Constants qw/:file_open/; @@ -37,6 +38,9 @@ #use Data::Dump qw(dump); my $USE_SEARCH_PPEP_PY = 1; +#my $FAILED_MATCH_SEQ = "Failed match"; +my $FAILED_MATCH_SEQ = 'No Sequence'; +my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name'; my $dirname = dirname(__FILE__); my %opts; @@ -45,7 +49,7 @@ my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in); my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n); my $line = 0; -my @failed_match = ("Failed match"); +my @failed_match = ($FAILED_MATCH_SEQ); my @failed_matches; my (%all_data); my (@p_peptides, @non_p_peptides); @@ -571,6 +575,37 @@ print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n"; +###################### + $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef); + $stmth = $dbh->prepare(" + INSERT INTO UniProtKB ( + Uniprot_ID, + Description, + Organism_Name, + Organism_ID, + Gene_Name, + PE, + SV, + Sequence, + Database + ) VALUES ( + 'No Uniprot_ID', + 'NO_GENE_SYMBOL No Description', + 'No Organism_Name', + 0, + '$FAILED_MATCH_GENE_NAME', + '0', + '0', + '$FAILED_MATCH_SEQ', + 'No Database' + ) + "); + if (not $stmth->execute()) { + print "Error inserting dummy row into UniProtKB: $stmth->errstr\n"; + } + $dbh->disconnect if ( defined $dbh ); +###################### + @timeData = localtime(time); print "\n--- Start search at " . format_localtime_iso8601() ."\n"; @@ -579,6 +614,7 @@ $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose"); } else { $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in"); + #ACE DELETEME $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose"); } if ($i) { print "python $dirname/search_ppep.py -u $db_out -p $file_in\n exited with exit code $i\n"; @@ -628,7 +664,7 @@ my %ppep_to_count_lut; print "start select peptide counts " . format_localtime_iso8601() . "\n"; -$stmth = $dbh->prepare(" +my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare(" SELECT DISTINCT phosphopeptide , count(*) as i @@ -639,10 +675,10 @@ ORDER BY phosphopeptide "); -if (not $stmth->execute()) { - die "Error fetching peptide counts: $stmth->errstr\n"; +if (not $uniprotkb_pep_ppep_view_stmth->execute()) { + die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n"; } -while (my @row = $stmth->fetchrow_array) { +while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) { $ppep_to_count_lut{$row[0]} = $row[1]; #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n"; } @@ -662,7 +698,7 @@ my %ppep_to_row_lut; print "start select all records without qualification " . format_localtime_iso8601() . "\n"; -$stmth = $dbh->prepare(" +$uniprotkb_pep_ppep_view_stmth = $dbh->prepare(" SELECT DISTINCT accession , peptide @@ -679,8 +715,8 @@ ORDER BY phosphopeptide "); -if (not $stmth->execute()) { - die "Error fetching all records without qualification: $stmth->errstr\n"; +if (not $uniprotkb_pep_ppep_view_stmth->execute()) { + die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n"; } my $current_ppep; my $counter = 0; @@ -689,7 +725,7 @@ @tmp_accessions = (); @tmp_names = (); @tmp_sites = (); -while (my @row = $stmth->fetchrow_array) { +while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) { # Identify phosphopeptide for current row; # it is an error for it to change when the counter is not zero. $current_ppep = $row[$COL_PHOSPHOPEPTIDE]; @@ -833,7 +869,8 @@ my $arg2 = $tmp_p_residues[0] - $desired_residues_L; my $arg1 = $matched_sequences{$peptide_to_match}[$i]; - if (length($arg1) > $arg2 + $total_length - 1) { + if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) { + #ACE print "\$tmp_motif = substr($arg1, $arg2, $total_length)\n"; $tmp_motif = substr($arg1, $arg2, $total_length); #ACE print "tmp_motif = $tmp_motif\ti = $i\tpeptide_to_match = $peptide_to_match\tmatched_sequences{peptide_to_match}[i] = $matched_sequences{$peptide_to_match}[$i]\targ2 = $arg2\targ3 = $total_length\n"; @@ -1664,21 +1701,7 @@ print "Writing PSP_Regulatory_site records\n"; -#ACE $stmth = $dbh->prepare(" -#ACE INSERT INTO PSP_Regulatory_site ( -#ACE DOMAIN, -#ACE ON_FUNCTION, -#ACE ON_PROCESS, -#ACE ON_PROT_INTERACT, -#ACE ON_OTHER_INTERACT, -#ACE NOTES, -#ACE SITE_PLUSMINUS_7AA, -#ACE ORGANISM, -#ACE PROTEIN -#ACE ) VALUES (?,?,?,?,?,?,?,?,?) -#ACE "); - -$stmth = $dbh->prepare(" +my $psp_regulatory_site_stmth = $dbh->prepare(" INSERT INTO PSP_Regulatory_site ( DOMAIN, ON_FUNCTION, @@ -1694,17 +1717,16 @@ foreach my $peptide (keys %data) { if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) { #ACE print "writing domain $domain_2{$peptide} for regulatory site(s) $seq_plus7aa_2{$peptide}\n"; #ACE - $stmth->bind_param(1, $domain_2{$peptide}); - $stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); - $stmth->bind_param(3, $ON_PROCESS_2{$peptide}); - $stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide}); - $stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide}); - $stmth->bind_param(6, $notes_2{$peptide}); - $stmth->bind_param(7, $seq_plus7aa_2{$peptide}); - $stmth->bind_param(8, $organism_2{$peptide}); - #ACE $stmth->bind_param(9, $psp_regsite_protein_2{$peptide}); - if (not $stmth->execute()) { - print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $stmth->errstr\n"; + $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide}); + $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide}); + if (not $psp_regulatory_site_stmth->execute()) { + print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n"; } else { #ACE print "added domain for $domain_2{$peptide}\n"; } @@ -1714,7 +1736,7 @@ } $dbh->{AutoCommit} = $auto_commit; -# auto_commit implicitly finishes stmth, apparently # $stmth->finish; +# auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish; $dbh->disconnect if ( defined $dbh ); @@ -1872,9 +1894,21 @@ push (@ppep_metadata, $ppep_id); push (@ppep_intensity, $peptide); + my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR'; # skip over failed matches - if ($matched_sequences{$peptide} eq "Failed match") { + print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond; + if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) { + # column 2: Protein description + # column 3: Gene name(s) + # column 4: FASTA name + # column 5: phospho-residues + # Column 6: UNIQUE phospho-motifs + # Column 7: accessions + # Column 8: ALL motifs with residue numbers + # 2 3 4 5 6 7 8 print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; + print "No match found for '$peptide' in sequence database\n"; + $gene_names = '$FAILED_MATCH_GENE_NAME'; } else { my @description = (); my %seen = (); @@ -1904,15 +1938,15 @@ print OUT $gene_names, "\t"; push (@ppep_metadata, join(" /// ", @gene)); - # print the entire names # column 4: FASTA name print OUT join(" /// ", @{$names{$peptide}}), "\t"; push (@ppep_metadata, join(" /// ", @{$names{$peptide}})); - # Print the phospho-residues - # column 5: + # column 5: phospho-residues my $tmp_for_insert = ""; + my $foobar; for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { + print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond; if ($i < $#{ $matched_sequences{$peptide} }) { if (defined $p_residues{$peptide}{$i}) { @tmp_p_residues = @{$p_residues{$peptide}{$i}}; @@ -1931,34 +1965,52 @@ } } elsif ($i == $#{ $matched_sequences{$peptide} }) { + my $there_were_sites = 0; if (defined $p_residues{$peptide}{$i}) { @tmp_p_residues = @{$p_residues{$peptide}{$i}}; - for my $j (0 .. $#tmp_p_residues) { - if ($j < $#tmp_p_residues) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; - } - elsif ($j == $#tmp_p_residues) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing"; + if ($#tmp_p_residues > 0) { + for my $j (0 .. $#tmp_p_residues) { + if ($j < $#tmp_p_residues) { + if (defined $p_residues{$peptide}{$i}[$j]) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + $foobar = $residues{$peptide}{$i}[$j]; + if (defined $foobar) { + print OUT "$foobar"; + print OUT "$tmp_site_for_printing, "; + $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; + $there_were_sites = 1; + } + } + } + elsif ($j == $#tmp_p_residues) { + if (defined $p_residues{$peptide}{$i}[$j]) { + $foobar = $residues{$peptide}{$i}[$j]; + if (defined $foobar) { + my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data + print OUT "$foobar"; + print OUT "$tmp_site_for_printing\t"; + #ACE print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; + $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing"; + $there_were_sites = 1; + } + } + } } } - } else { + } + if (0 == $there_were_sites) { print OUT "\t"; } } } + print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond; push (@ppep_metadata, $tmp_for_insert); - # Print the UNIQUE phospho-motifs - # Column 6: + # Column 6: UNIQUE phospho-motifs print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}})); - # Print the accessions - # Column 7: + # Column 7: accessions if (defined $accessions{$peptide}) { print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}})); @@ -1967,8 +2019,7 @@ push (@ppep_metadata, ""); } - # print ALL motifs with residue numbers - # Column 8: + # Column 8: ALL motifs with residue numbers if (defined $p_motifs{$peptide}) { print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
--- a/mqppep_mrgfltr.py Thu Mar 10 23:42:48 2022 +0000 +++ b/mqppep_mrgfltr.py Fri Mar 11 20:04:05 2022 +0000 @@ -44,6 +44,19 @@ return None +def whine(func, *args, handle=lambda e: e, **kwargs): + + try: + return func(*args, **kwargs) + except Exception as e: + print("Warning: For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + return None + + def ppep_join(x): x = [i for i in x if N_A != i] result = "%s" % " | ".join(x) @@ -683,16 +696,16 @@ # message = pe.message)) # ) # ) + if phospho_pep not in PhosphoPep_UniProtSeq_LUT: + raise PreconditionError( + phospho_pep, + "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", + ) if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", ) - if phospho_pep not in PhosphoPep_UniProtSeq_LUT: - raise PreconditionError( - dephospho_pep, - "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", - ) if ( dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] @@ -739,6 +752,10 @@ seq10s = [] while i < len(ploc): start = UniProtSeq.find(dephospho_pep) + # handle case where no sequence was found for dep-pep + if start < 0: + i += 1 + continue psite = ( start + ploc[i] ) # location of phosphoresidue on protein sequence @@ -869,7 +886,6 @@ # val is a string if val not in joined_set: joined_set.add(val) - # joined_value += sep + '; '.join(map(str, val)) joined_value += sep + val sep = "; " # joined_value is a string @@ -914,9 +930,10 @@ return [phospho_pep, result] # Construct list of [string, dictionary] lists - # where the dictionary provides the SwissProt metadata for a phosphopeptide + # where the dictionary provides the SwissProt metadata + # for a phosphopeptide result_list = [ - catch(pseq_to_subdict, psequence) + whine(pseq_to_subdict, psequence) for psequence in data_in[PHOSPHOPEPTIDE_MATCH] ] @@ -1021,7 +1038,8 @@ how="left", ) - # after merging df, select only the columns of interest - note that PROTEIN is absent here + # after merging df, select only the columns of interest; + # note that PROTEIN is absent here merge_df = merge_df[ [ PHOSPHOPEPTIDE, @@ -1033,7 +1051,8 @@ ON_NOTES, ] ] - # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" + # combine column values of interest + # into one FUNCTION_PHOSPHORESIDUE column" merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( merge_df[ON_PROCESS], sep="; ", na_rep="" )
--- a/repository_dependencies.xml Thu Mar 10 23:42:48 2022 +0000 +++ b/repository_dependencies.xml Fri Mar 11 20:04:05 2022 +0000 @@ -1,5 +1,5 @@ <?xml version="1.0" ?> <repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment."> - <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="8be24b489992"/> - <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="cfc65ae227f8"/> + <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="b91809a18dbe"/> + <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="d4d531006735"/> </repositories> \ No newline at end of file
--- a/search_ppep.py Thu Mar 10 23:42:48 2022 +0000 +++ b/search_ppep.py Fri Mar 11 20:04:05 2022 +0000 @@ -516,6 +516,16 @@ for row in ker.fetchall(): print(row[0]) + # link peptides not found in sequence database to a dummy sequence-record + ker.execute( + """ + INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end) + SELECT id, 'No Uniprot_ID', 0, 0 + FROM deppep + WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB) + """ + ) + con.commit() ker.execute("vacuum") con.close()