changeset 6:922d309640db draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9dfb7e07a3673d7de4b0a1b7e6ce1b75a8a4f42b"
author eschen42
date Fri, 11 Mar 2022 20:04:05 +0000
parents d4d531006735
children d728198f1ba5
files MaxQuantProcessingScript.R PhosphoPeptide_Upstream_Kinase_Mapping.pl mqppep_mrgfltr.py repository_dependencies.xml search_ppep.py
diffstat 5 files changed, 159 insertions(+), 79 deletions(-) [+]
line wrap: on
line diff
--- a/MaxQuantProcessingScript.R	Thu Mar 10 23:42:48 2022 +0000
+++ b/MaxQuantProcessingScript.R	Fri Mar 11 20:04:05 2022 +0000
@@ -521,6 +521,14 @@
 # ---
 quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
 colnames(quant_write)[1] <- "Sequence.Window"
+write.table(
+  quant_write,
+  file = quant_file_name,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
 # ...
 
 
@@ -568,14 +576,6 @@
 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
 # ...
 
-write.table(
-  quant_data_qc_collapsed,
-  file = quant_file_name,
-  sep = "\t",
-  quote = FALSE,
-  col.names = TRUE,
-  row.names = FALSE
-)
 
 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
 # ---
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Thu Mar 10 23:42:48 2022 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Fri Mar 11 20:04:05 2022 +0000
@@ -25,7 +25,8 @@
 ###############################################################################################################################
 
 use strict;
-use warnings;
+#ACE use warnings;
+use warnings 'FATAL' => 'all';
 
 use Getopt::Std;
 use DBD::SQLite::Constants qw/:file_open/;
@@ -37,6 +38,9 @@
 #use Data::Dump qw(dump);
 
 my $USE_SEARCH_PPEP_PY = 1;
+#my $FAILED_MATCH_SEQ = "Failed match";
+my $FAILED_MATCH_SEQ = 'No Sequence';
+my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name';
 
 my $dirname = dirname(__FILE__);
 my %opts;
@@ -45,7 +49,7 @@
 my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
 my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
 my $line = 0;
-my @failed_match = ("Failed match");
+my @failed_match = ($FAILED_MATCH_SEQ);
 my @failed_matches;
 my (%all_data);
 my (@p_peptides, @non_p_peptides);
@@ -571,6 +575,37 @@
 
 print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
 
+######################
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (
+    'No Uniprot_ID',
+    'NO_GENE_SYMBOL No Description',
+    'No Organism_Name',
+    0,
+    '$FAILED_MATCH_GENE_NAME',
+    '0',
+    '0',
+    '$FAILED_MATCH_SEQ',
+    'No Database'
+  )
+  ");
+  if (not $stmth->execute()) {
+      print "Error inserting dummy row into UniProtKB: $stmth->errstr\n";
+  }
+  $dbh->disconnect if ( defined $dbh );
+######################
+
 @timeData = localtime(time);
 print "\n--- Start search at " . format_localtime_iso8601() ."\n";
 
@@ -579,6 +614,7 @@
   $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
 } else {
   $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in");
+  #ACE DELETEME $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
 }
 if ($i) {
   print "python $dirname/search_ppep.py -u $db_out -p $file_in\n  exited with exit code $i\n";
@@ -628,7 +664,7 @@
 
 my %ppep_to_count_lut;
 print "start select peptide counts " . format_localtime_iso8601() . "\n";
-$stmth = $dbh->prepare("
+my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
     SELECT DISTINCT
       phosphopeptide
     , count(*) as i
@@ -639,10 +675,10 @@
     ORDER BY
       phosphopeptide
 ");
-if (not $stmth->execute()) {
-    die "Error fetching peptide counts: $stmth->errstr\n";
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n";
 }
-while (my @row = $stmth->fetchrow_array) {
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
   $ppep_to_count_lut{$row[0]} = $row[1];
   #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
 }
@@ -662,7 +698,7 @@
 
 my %ppep_to_row_lut;
 print "start select all records without qualification " . format_localtime_iso8601() . "\n";
-$stmth = $dbh->prepare("
+$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
     SELECT DISTINCT
       accession
     , peptide
@@ -679,8 +715,8 @@
     ORDER BY
       phosphopeptide
 ");
-if (not $stmth->execute()) {
-    die "Error fetching all records without qualification: $stmth->errstr\n";
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n";
 }
 my $current_ppep;
 my $counter = 0;
@@ -689,7 +725,7 @@
 @tmp_accessions = ();
 @tmp_names = ();
 @tmp_sites = ();
-while (my @row = $stmth->fetchrow_array) {
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
     # Identify phosphopeptide for current row;
     #   it is an error for it to change when the counter is not zero.
     $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
@@ -833,7 +869,8 @@
             my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
             my $arg1 = $matched_sequences{$peptide_to_match}[$i];
 
-            if (length($arg1) > $arg2 + $total_length - 1) {
+            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
+                #ACE print "\$tmp_motif = substr($arg1, $arg2, $total_length)\n";
                 $tmp_motif = substr($arg1, $arg2, $total_length);
                 #ACE print "tmp_motif = $tmp_motif\ti = $i\tpeptide_to_match = $peptide_to_match\tmatched_sequences{peptide_to_match}[i] = $matched_sequences{$peptide_to_match}[$i]\targ2 = $arg2\targ3 = $total_length\n";
 
@@ -1664,21 +1701,7 @@
 
 print "Writing PSP_Regulatory_site records\n";
 
-#ACE $stmth = $dbh->prepare("
-#ACE     INSERT INTO PSP_Regulatory_site (
-#ACE       DOMAIN,
-#ACE       ON_FUNCTION,
-#ACE       ON_PROCESS,
-#ACE       ON_PROT_INTERACT,
-#ACE       ON_OTHER_INTERACT,
-#ACE       NOTES,
-#ACE       SITE_PLUSMINUS_7AA,
-#ACE       ORGANISM,
-#ACE       PROTEIN
-#ACE     ) VALUES (?,?,?,?,?,?,?,?,?)
-#ACE     ");
-
-$stmth = $dbh->prepare("
+my $psp_regulatory_site_stmth = $dbh->prepare("
     INSERT INTO PSP_Regulatory_site (
       DOMAIN,
       ON_FUNCTION,
@@ -1694,17 +1717,16 @@
 foreach my $peptide (keys %data) {
     if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
         #ACE print "writing domain $domain_2{$peptide} for regulatory site(s) $seq_plus7aa_2{$peptide}\n"; #ACE
-        $stmth->bind_param(1, $domain_2{$peptide});
-        $stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
-        $stmth->bind_param(3, $ON_PROCESS_2{$peptide});
-        $stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
-        $stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
-        $stmth->bind_param(6, $notes_2{$peptide});
-        $stmth->bind_param(7, $seq_plus7aa_2{$peptide});
-        $stmth->bind_param(8, $organism_2{$peptide});
-        #ACE $stmth->bind_param(9, $psp_regsite_protein_2{$peptide});
-        if (not $stmth->execute()) {
-            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $stmth->errstr\n";
+        $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
+        if (not $psp_regulatory_site_stmth->execute()) {
+            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n";
         } else {
             #ACE print "added domain for $domain_2{$peptide}\n";
         }
@@ -1714,7 +1736,7 @@
 }
 
 $dbh->{AutoCommit} = $auto_commit;
-# auto_commit implicitly finishes stmth, apparently # $stmth->finish;
+# auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
 $dbh->disconnect if ( defined $dbh );
 
 
@@ -1872,9 +1894,21 @@
     push (@ppep_metadata, $ppep_id);
     push (@ppep_intensity, $peptide);
 
+    my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
     # skip over failed matches
-    if ($matched_sequences{$peptide} eq "Failed match") {
+    print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
+    if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
+        # column 2: Protein description
+        # column 3: Gene name(s)
+        # column 4: FASTA name
+        # column 5: phospho-residues
+        # Column 6: UNIQUE phospho-motifs
+        # Column 7: accessions
+        # Column 8: ALL motifs with residue numbers
+        #          2                                     3   4   5   6   7   8
         print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
+        print "No match found for '$peptide' in sequence database\n";
+        $gene_names = '$FAILED_MATCH_GENE_NAME';
     } else {
         my @description = ();
         my %seen = ();
@@ -1904,15 +1938,15 @@
         print OUT $gene_names, "\t";
         push (@ppep_metadata, join(" /// ", @gene));
 
-        # print the entire names
         # column 4: FASTA name
         print OUT join(" /// ", @{$names{$peptide}}), "\t";
         push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
 
-        # Print the phospho-residues
-        # column 5:
+        # column 5: phospho-residues
         my $tmp_for_insert = "";
+        my $foobar;
         for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+            print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
             if ($i < $#{ $matched_sequences{$peptide} }) {
                 if (defined $p_residues{$peptide}{$i}) {
                     @tmp_p_residues = @{$p_residues{$peptide}{$i}};
@@ -1931,34 +1965,52 @@
                 }
             }
             elsif ($i == $#{ $matched_sequences{$peptide} }) {
+                my $there_were_sites = 0;
                 if (defined $p_residues{$peptide}{$i}) {
                     @tmp_p_residues = @{$p_residues{$peptide}{$i}};
-                    for my $j (0 .. $#tmp_p_residues) {
-                        if ($j < $#tmp_p_residues) {
-                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
-                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
-                        }
-                        elsif ($j == $#tmp_p_residues) {
-                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t";
-                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
+                    if ($#tmp_p_residues > 0) {
+                        for my $j (0 .. $#tmp_p_residues) {
+                            if ($j < $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing, ";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
+                            elsif ($j == $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing\t";
+                                        #ACE print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
                         }
                     }
-                } else {
+                }
+                if (0 == $there_were_sites) {
                   print OUT "\t";
                 }
             }
         }
+        print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
         push (@ppep_metadata, $tmp_for_insert);
 
-        # Print the UNIQUE phospho-motifs
-        # Column 6:
+        # Column 6: UNIQUE phospho-motifs
         print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
         push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
 
-        # Print the accessions
-        # Column 7:
+        # Column 7: accessions
         if (defined $accessions{$peptide}) {
             print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
             push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
@@ -1967,8 +2019,7 @@
             push (@ppep_metadata, "");
         }
 
-        # print ALL motifs with residue numbers
-        # Column 8:
+        # Column 8: ALL motifs with residue numbers
         if (defined $p_motifs{$peptide}) {
             print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
             push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
--- a/mqppep_mrgfltr.py	Thu Mar 10 23:42:48 2022 +0000
+++ b/mqppep_mrgfltr.py	Fri Mar 11 20:04:05 2022 +0000
@@ -44,6 +44,19 @@
         return None
 
 
+def whine(func, *args, handle=lambda e: e, **kwargs):
+
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("Warning: For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        return None
+
+
 def ppep_join(x):
     x = [i for i in x if N_A != i]
     result = "%s" % " | ".join(x)
@@ -683,16 +696,16 @@
             #             message = pe.message))
             #             )
             #         )
+            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    phospho_pep,
+                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+                )
             if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                 raise PreconditionError(
                     dephospho_pep,
                     "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
                 )
-            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
-                raise PreconditionError(
-                    dephospho_pep,
-                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
-                )
             if (
                 dephospho_pep
                 != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
@@ -739,6 +752,10 @@
                 seq10s = []
                 while i < len(ploc):
                     start = UniProtSeq.find(dephospho_pep)
+                    # handle case where no sequence was found for dep-pep
+                    if start < 0:
+                        i += 1
+                        continue
                     psite = (
                         start + ploc[i]
                     )  # location of phosphoresidue on protein sequence
@@ -869,7 +886,6 @@
                             # val is a string
                             if val not in joined_set:
                                 joined_set.add(val)
-                                # joined_value += sep + '; '.join(map(str, val))
                                 joined_value += sep + val
                                 sep = "; "
                     # joined_value is a string
@@ -914,9 +930,10 @@
             return [phospho_pep, result]
 
         # Construct list of [string, dictionary] lists
-        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
+        #   where the dictionary provides the SwissProt metadata
+        #   for a phosphopeptide
         result_list = [
-            catch(pseq_to_subdict, psequence)
+            whine(pseq_to_subdict, psequence)
             for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
         ]
 
@@ -1021,7 +1038,8 @@
             how="left",
         )
 
-        # after merging df, select only the columns of interest - note that PROTEIN is absent here
+        # after merging df, select only the columns of interest;
+        #   note that PROTEIN is absent here
         merge_df = merge_df[
             [
                 PHOSPHOPEPTIDE,
@@ -1033,7 +1051,8 @@
                 ON_NOTES,
             ]
         ]
-        # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
+        # combine column values of interest
+        #   into one FUNCTION_PHOSPHORESIDUE column"
         merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
             merge_df[ON_PROCESS], sep="; ", na_rep=""
         )
--- a/repository_dependencies.xml	Thu Mar 10 23:42:48 2022 +0000
+++ b/repository_dependencies.xml	Fri Mar 11 20:04:05 2022 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" ?>
 <repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment.">
-    <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="8be24b489992"/>
-    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="cfc65ae227f8"/>
+    <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="b91809a18dbe"/>
+    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="d4d531006735"/>
 </repositories>
\ No newline at end of file
--- a/search_ppep.py	Thu Mar 10 23:42:48 2022 +0000
+++ b/search_ppep.py	Fri Mar 11 20:04:05 2022 +0000
@@ -516,6 +516,16 @@
     for row in ker.fetchall():
         print(row[0])
 
+    # link peptides not found in sequence database to a dummy sequence-record
+    ker.execute(
+        """
+        INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
+          SELECT id, 'No Uniprot_ID', 0, 0
+          FROM   deppep
+          WHERE  id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
+        """
+    )
+
     con.commit()
     ker.execute("vacuum")
     con.close()