annotate mqppep_mrgfltr.py @ 3:f9c13bc8e7ad draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182-dirty"
author eschen42
date Mon, 07 Mar 2022 19:26:06 +0000
parents c1403d18c189
children d4d531006735
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1 #!/usr/bin/env python
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
2
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
3 # Import the packages needed
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
4 import argparse
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
5 import os.path
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
6 import sys
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
7
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
8 import pandas
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
9 import re
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
10 import time
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
11 import sqlite3 as sql
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
12 from codecs import getreader as cx_getreader
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
13 import sys
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
14 import numpy as np
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
15
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
16 # for sorting list of lists using operator.itemgetter
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
17 import operator
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
18
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
19 # for formatting stack-trace
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
20 import traceback
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
21
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
22 # for Aho-Corasick search for fixed set of substrings
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
23 import ahocorasick
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
24 import operator
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
25 import hashlib
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
26
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
27 # for shutil.copyfile(src, dest)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
28 import shutil
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
29
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
30 # global constants
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
31 N_A = 'N/A'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
32
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
# usage:
#   from math import log
#   eggs = [1, 3, 0, 3, 2]
#   print([catch(log, egg) for egg in eggs])
# Unlike the referenced answer, this variant does NOT substitute a value for
# a failed call.  In the example above log(0) raises, so the diagnostics
# below are printed and the process exits before any list is produced:
#   For <built-in function log>
#    with args (0,)
#    caught exception: math domain error
#    stack trace: Traceback (most recent call last): ...
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; report any exception and exit.

    Args:
        func: the callable to invoke.
        *args: positional arguments forwarded to ``func``.
        handle: unused; retained so existing callers that pass it keep
            working (an earlier revision returned ``handle(e)``).
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns when it does not raise.

    Raises:
        SystemExit: with code -1 when ``func`` raises any ``Exception``;
            the callable, its positional arguments, the exception message
            and the formatted stack trace are printed first.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        # Join the formatted lines so the trace is printed as readable
        # text rather than as the repr of a list of strings.
        trace = "".join(
            traceback.format_exception(type(e), e, e.__traceback__)
        )
        print(" stack trace: " + trace)
        # sys.exit is always available; the bare ``exit`` builtin is only
        # injected by the ``site`` module and is meant for interactive use.
        sys.exit(-1)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
55
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
def ppep_join(x):
    """Join the entries of ``x`` with ' | ', ignoring ``N_A`` placeholders.

    Args:
        x: an iterable of strings.

    Returns:
        The ' | '-separated text of the entries that are not equal to the
        module-level ``N_A`` marker, or ``N_A`` itself when that text is
        empty.
    """
    kept = []
    for item in x:
        if N_A != item:
            kept.append(item)
    joined = ' | '.join(kept)
    # An empty join means nothing but placeholders (or nothing at all)
    # was supplied, so report the placeholder instead of "".
    return joined if joined != "" else N_A
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
63
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
def melt_join(x):
    """Join the entries of ``x`` with ' | ', de-duplicating by lowercase.

    Entries whose lowercase forms are equal collapse to a single entry:
    the position is that of the first occurrence and the spelling is that
    of the last occurrence.

    Args:
        x: an iterable of strings.

    Returns:
        The ' | '-separated text of the de-duplicated entries ("" when
        ``x`` is empty).
    """
    spelling_by_lowercase = {}
    for item in x:
        # Re-assigning an existing key keeps its original position in the
        # dict but replaces the stored spelling.
        spelling_by_lowercase[item.lower()] = item
    return ' | '.join(spelling_by_lowercase.values())
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
68
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
69 def __main__():
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
70 # Parse Command Line
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
71 parser = argparse.ArgumentParser(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
72 description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
73 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
74
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
75 # inputs:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
76 # Phosphopeptide data for experimental results, including the intensities
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
77 # and the mapping to kinase domains, in tabular format.
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
78 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
79 '--phosphopeptides', '-p',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
80 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
81 required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
82 dest='phosphopeptides',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
83 help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
84 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
85 # UniProtKB/SwissProt DB input, SQLite
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
86 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
87 '--ppep_mapping_db', '-d',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
88 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
89 required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
90 dest='ppep_mapping_db',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
91 help='UniProtKB/SwissProt SQLite Database'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
92 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
93 #ACE # PhosPhositesPlus DB input, csv
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
94 #ACE parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
95 #ACE '--psp_regulatory_sites', '-s',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
96 #ACE nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
97 #ACE required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
98 #ACE dest='psp_regulatory_sites_csv',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
99 #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
100 #ACE )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
101 # species to limit records chosen from PhosphoSitePlus
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
102 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
103 '--species', '-x',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
104 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
105 required=False,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
106 default=[],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
107 dest='species',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
108 help='limit PhosphoSitePlus records to indicated species (field may be empty)'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
109 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
110
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
111 # outputs:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
112 # tabular output
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
113 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
114 '--mrgfltr_tab', '-o',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
115 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
116 required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
117 dest='mrgfltr_tab',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
118 help='Tabular output file for results'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
119 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
120 # CSV output
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
121 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
122 '--mrgfltr_csv', '-c',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
123 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
124 required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
125 dest='mrgfltr_csv',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
126 help='CSV output file for results'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
127 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
128 # SQLite output
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
129 parser.add_argument(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
130 '--mrgfltr_sqlite', '-S',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
131 nargs=1,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
132 required=True,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
133 dest='mrgfltr_sqlite',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
134 help='SQLite output file for results'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
135 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
136
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
137 # "Make it so!" (parse the arguments)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
138 options = parser.parse_args()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
139 print("options: " + str(options))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
140
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
141 # verify that the phosphopeptide ("upstream map") tabular input file was supplied and can be opened for reading
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
142 if options.phosphopeptides is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
143 exit('Argument "phosphopeptides" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
144 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
146 input_file = open(upstream_map_filename_tab, 'r')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
147 input_file.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
148 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
149 exit('Error parsing phosphopeptides argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
150
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
151 # verify that the input SQLite database was supplied and can be opened for reading
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
152 if options.ppep_mapping_db is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
153 exit('Argument "ppep_mapping_db" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
154 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
156 input_file = open(uniprot_sqlite, 'rb')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
157 input_file.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
158 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
160
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
161 # copy input SQLite dataset to output SQLite dataset
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
162 if options.mrgfltr_sqlite is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
163 exit('Argument "mrgfltr_sqlite" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
164 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
166 shutil.copyfile(uniprot_sqlite, output_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
167 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
169
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
170 #ACE # determine psp_regulatory_sites CSV access
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
171 #ACE if options.psp_regulatory_sites_csv is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
174 #ACE try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
176 #ACE input_file = open(phosphosite_filename_csv, 'r')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
177 #ACE input_file.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
178 #ACE except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
181
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
182 # determine species to limit records from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
183 if options.species is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
184 exit('Argument "species" is required (and may be empty) but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
185 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
186 if len(options.species) > 0:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
187 species = options.species[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
188 else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
189 species = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
190 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
191 exit('Error parsing species argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
192
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
193 # verify that the tabular output file can be created (opening it with 'w' creates or truncates it)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
194 if options.mrgfltr_tab is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
195 exit('Argument "mrgfltr_tab" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
196 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
198 output_file = open(output_filename_tab, 'w')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
199 output_file.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
200 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
202
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
203 # verify that the CSV output file can be created (opening it with 'w' creates or truncates it)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
204 if options.mrgfltr_csv is None:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
205 exit('Argument "mrgfltr_csv" is required but not supplied')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
206 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
208 output_file = open(output_filename_csv, 'w')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
209 output_file.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
210 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
212
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
213
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
214 def mqpep_getswissprot():
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
215
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
216 ###############################################
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
217 # copied from Excel Output Script.ipynb BEGIN #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
218 ###############################################
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
219
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
220 ########### String Constants #################
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
221 DEPHOSPHOPEP = 'DephosphoPep'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
222 DESCRIPTION = 'Description'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
223 FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
224 GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
225 ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
226 ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
227 ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
228 ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
229 ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
230 PHOSPHOPEPTIDE = 'Phosphopeptide'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
231 PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
232 PHOSPHORESIDUE = 'Phosphoresidue'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
233 PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
234 SEQUENCE = 'Sequence'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
235 SEQUENCE10 = 'Sequence10'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
236 SEQUENCE7 = 'Sequence7'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
237 SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
238 SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
239 UNIPROT_ID = 'UniProt_ID'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
240 UNIPROT_SEQ_AND_META_SQL = '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
241 select Uniprot_ID, Description, Gene_Name, Sequence,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
242 Organism_Name, Organism_ID, PE, SV
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
243 from UniProtKB
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
244 order by Sequence, UniProt_ID
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
245 '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
246 UNIPROT_UNIQUE_SEQ_SQL = '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
247 select distinct Sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
248 from UniProtKB
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
249 group by Sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
250 '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
251 PPEP_PEP_UNIPROTSEQ_SQL = '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
252 select distinct phosphopeptide, peptide, sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
253 from uniprotkb_pep_ppep_view
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
254 order by sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
255 '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
256 PPEP_MELT_SQL = '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
257 SELECT DISTINCT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
258 phospho_peptide AS 'p_peptide',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
259 kinase_map AS 'characterization',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
260 'X' AS 'X'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
261 FROM ppep_gene_site_view
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
262 '''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
263 # CREATE TABLE PSP_Regulatory_site (
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
264 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
265 # domain TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
266 # ON_FUNCTION TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
267 # ON_PROCESS TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
268 # ON_PROT_INTERACT TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
269 # ON_OTHER_INTERACT TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
270 # notes TEXT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
271 # organism TEXT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
272 # );
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Query selecting the distinct rows of the PSP_Regulatory_site table,
# restricted to the columns listed below.
PSP_REGSITE_SQL = '''
SELECT DISTINCT
  SITE_PLUSMINUS_7AA ,
  DOMAIN ,
  ON_FUNCTION ,
  ON_PROCESS ,
  ON_PROT_INTERACT ,
  ON_OTHER_INTERACT ,
  NOTES ,
  ORGANISM
FROM PSP_Regulatory_site
'''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Query listing every row of the ppep table as (ppep_id, ppep_seq).
PPEP_ID_SQL = '''
SELECT
  id AS 'ppep_id',
  seq AS 'ppep_seq'
FROM ppep
'''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# DDL script that (re)creates the mrgfltr_metadata table and the
# mrgfltr_metadata_view view.  The view is dropped before the table and
# re-created after it.  Because the primary key is declared
# ON CONFLICT IGNORE, inserting a second row for an existing ppep_id is
# silently skipped rather than raising an error.
MRGFLTR_DDL = '''
DROP VIEW  IF EXISTS mrgfltr_metadata_view;
DROP TABLE IF EXISTS mrgfltr_metadata;
CREATE TABLE mrgfltr_metadata
  ( ppep_id                 INTEGER REFERENCES ppep(id)
  , Sequence10              TEXT
  , Sequence7               TEXT
  , GeneName                TEXT
  , Phosphoresidue          TEXT
  , UniProtID               TEXT
  , Description             TEXT
  , FunctionPhosphoresidue  TEXT
  , PutativeUpstreamDomains TEXT
  , PRIMARY KEY (ppep_id)   ON CONFLICT IGNORE
  )
  ;
CREATE VIEW mrgfltr_metadata_view AS
  SELECT DISTINCT
      ppep.seq AS phospho_peptide
    , Sequence10
    , Sequence7
    , GeneName
    , Phosphoresidue
    , UniProtID
    , Description
    , FunctionPhosphoresidue
    , PutativeUpstreamDomains
  FROM
    ppep, mrgfltr_metadata
  WHERE
    mrgfltr_metadata.ppep_id = ppep.id
  ORDER BY
    ppep.seq
  ;
'''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
326
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Parameterized insert for the Citation table; the two placeholders bind
# ObjectName and CitationData, in that order.
CITATION_INSERT_STMT = '''
INSERT INTO Citation (
  ObjectName,
  CitationData
) VALUES (?,?)
'''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Attribution text for PhosphoSitePlus data.  The wording is left exactly as
# written because the text itself states which acknowledgements must be
# included.
CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
# Short reference for the same paper.  The PubMed ID and DOI must identify the
# "PhosphoSitePlus, 2014" article (PMID 25514926) cited in the text above; the
# previous links (PMID 22135298, doi gkr1122) pointed at a different, earlier
# PhosphoSitePlus publication.
CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/25514926, https://doi.org/10.1093/nar/gku1267'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
335
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Column names of the mrgfltr_metadata table, in the order in which the
# CREATE TABLE statement in MRGFLTR_DDL declares them.
MRGFLTR_METADATA_COLUMNS = [
    'ppep_id',
    'Sequence10',
    'Sequence7',
    'GeneName',
    'Phosphoresidue',
    'UniProtID',
    'Description',
    'FunctionPhosphoresidue',
    'PutativeUpstreamDomains',
]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
347
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
348 ########### String Constants (end) ############
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
349
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
class Error(Exception):
    """Common base class for the exceptions defined in this module."""
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
353
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
class PreconditionError(Error):
    """Exception raised to report an error in the input.

    Attributes:
        expression -- input expression in which the error occurred
        message -- explanation of the error
    """

    def __init__(self, expression, message):
        # Keep both pieces on the instance so handlers can report them.
        self.message = message
        self.expression = expression
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
365
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Time the pre-read step; the elapsed process time is reported on stderr below.
start_time = time.process_time()  # timer

# Get keys from the upstream tabular file using readline(): collect the first
# tab-delimited field of every line after the first (the first line is
# skipped, presumably because it is a header -- verify against the input).
# ref: https://stackoverflow.com/a/16713581/15509512
#      answer to "Use codecs to read file with correct encoding"
count = 0
upstream_map_p_peptide_list = []
re_tab = re.compile('^[^\t]*')
# The `with` block closes the file even when reading or decoding raises
# part-way through; previously the handle was only closed on the success path.
with open(upstream_map_filename_tab, 'rb') as file1_encoded:
    # NOTE(review): cx_getreader is presumably codecs.getreader -- confirm
    # against the file's imports.
    file1 = cx_getreader("latin-1")(file1_encoded)
    while True:
        count += 1
        # Get next line from file
        line = file1.readline()
        # an empty string means that end of file is reached
        if not line:
            break
        if count > 1:
            # The pattern also matches the empty string, so `m` is never None.
            m = re_tab.match(line)
            upstream_map_p_peptide_list.append(m[0])

# Get the list of phosphopeptides with the p's that represent the
# phosphorylation sites removed
re_phos = re.compile('p')
dephospho_peptide_list = [
    re_phos.sub('', foo) for foo in upstream_map_p_peptide_list
]

end_time = time.process_time()  # timer
print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
398
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
399 ## ----------- Get SwissProt data from SQLite database (start) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
400 # build UniProt sequence LUT and list of unique SwissProt sequences
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
401
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
402 # Open SwissProt SQLite database
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
403 conn = sql.connect(uniprot_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
404 cur = conn.cursor()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
405
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
406 # Set up structures to hold SwissProt data
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
407
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
408 uniprot_Sequence_List = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
409 UniProtSeqLUT = {}
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
410
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
411 # Execute query for unique seqs without fetching the results yet
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
412 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
413
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
414 while batch := uniprot_unique_seq_cur.fetchmany(size=50):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
415 if None == batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
416 # handle case where no records are returned
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
417 break
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
418 for row in batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
419 Sequence = row[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
420 UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
421 UniProtSeqLUT[(Sequence,GENE_NAME) ] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
422 UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
423 UniProtSeqLUT[ Sequence ] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
424
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
425 # Execute query for seqs and metadata without fetching the results yet
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
426 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
427
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
428 while batch := uniprot_seq_and_meta.fetchmany(size=50):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
429 if None == batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
430 # handle case where no records are returned
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
431 break
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
432 for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
433 uniprot_Sequence_List.append(Sequence)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
434 UniProtSeqLUT[Sequence] = Sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
435 UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
436 UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
437 if OS != N_A:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
438 Description += ' OS=' + OS
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
439 if OX != N_A:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
440 Description += ' OX=' + str(int(OX))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
441 if Gene_Name != N_A:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
442 Description += ' GN=' + Gene_Name
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
443 if PE != N_A:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
444 Description += ' PE=' + PE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
445 if SV != N_A:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
446 Description += ' SV=' + SV
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
447 UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
448
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
449 # Close SwissProt SQLite database; clean up local variables
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
450 conn.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
451 Sequence = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
452 UniProt_ID = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
453 Description = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
454 Gene_Name = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
455
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
456 ## ----------- Get SwissProt data from SQLite database (finish) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
457
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
458 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
459 print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
460
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
461 ## ----------- Get SwissProt data from SQLite database (start) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
462 # build PhosphoPep_UniProtSeq_LUT and DephosphoPep_UniProtSeq_LUT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
463 #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
464
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
465 # Open SwissProt SQLite database
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
466 conn = sql.connect(uniprot_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
467 cur = conn.cursor()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
468
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
469 # Set up dictionary to aggregate results for phosphopeptides corresponding to each dephosphopeptide
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
470 DephosphoPep_UniProtSeq_LUT = {}
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
471
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
472 # Set up dictionary to accumulate results
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
473 PhosphoPep_UniProtSeq_LUT = {}
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
474
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
475 # Execute query for tuples without fetching the results yet
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
476 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
477
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
478 while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
479 if None == batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
480 # handle case where no records are returned
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
481 break
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
482 for (phospho_pep, dephospho_pep, sequence) in batch:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
483 # Record the phosphopeptide, map it to its dephosphopeptide, and group it under that dephosphopeptide
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
485 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
486 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
487 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
488 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
489 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
490 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
491 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
492 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
493
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
494 #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
495 if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
496 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
497 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
498 if phospho_pep != phospho_pep:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
499 print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
500 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
501 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
502 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
503 r = list(zip(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
504 [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
505 [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
506 [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
507 ))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
508 # Sort by `UniProt_ID`
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
509 # ref: https://stackoverflow.com/a/4174955/15509512
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
510 r = sorted(r, key=operator.itemgetter(0))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
511 # Get one tuple for each `phospho_pep`
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
512 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
513 for (upid, gn, desc) in r:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
514 # Append pseudo-tuple per UniProt_ID but only when it is not present
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
515 if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
516 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
517 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
518 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
519
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
520 # Close SwissProt SQLite database; clean up local variables
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
521 conn.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
522 # wipe local variables
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
523 phospho_pep = dephospho_pep = sequence = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
524 upid = gn = desc = r = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
525
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
526 ## ----------- Get SwissProt data from SQLite database (finish) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
527
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
528 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
529 print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
530
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
531 print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
532
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
533 #Read in Upstream tabular file
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
534 # We are discarding the intensity data; so read it as text
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
535 upstream_data = pandas.read_table(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
536 upstream_map_filename_tab,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
537 dtype='str',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
538 index_col = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
539 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
540
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
541 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
542 print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
543
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
544 upstream_data.index = upstream_map_p_peptide_list
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
545
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
546
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
547 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
548 print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
549
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
550
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
551 #trim upstream_data to include only the upstream map columns
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
552 old_cols = upstream_data.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
553 i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
554 first_intensity = -1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
555 last_intensity = -1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
556 intensity_re = re.compile('Intensity.*')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
557 for col_name in old_cols:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
558 m = intensity_re.match(col_name)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
559 if m:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
560 last_intensity = i
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
561 if first_intensity == -1:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
562 first_intensity = i
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
563 i += 1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
564 #print('last intensity = %d' % last_intensity)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
565 col_PKCalpha = last_intensity + 2
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
566 col_firstIntensity = first_intensity
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
567
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
568 data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
569
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
570 if upstream_data.empty:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
571 print("upstream_data is empty")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
572 exit(0)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
573
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
574 data_in = upstream_data.copy(deep=True)[data_in_cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
575
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
576 # Convert floating-point integers to int64 integers
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
577 # ref: https://stackoverflow.com/a/68497603/15509512
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
578 data_in[list(data_in.columns[1:])] = data_in[
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
579 list(data_in.columns[1:])].astype('float64').apply(np.int64)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
580
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
581 #create another phosphopeptide column that will be used to join later;
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
582 # MAY need to change depending on Phosphopeptide column position
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
583 #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
584 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
585
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
586
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
587
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
588
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
589 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
590 print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
591
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
592 # Produce a dictionary of metadata for a single phosphopeptide.
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
593 # This is a replacement of `UniProtInfo_subdict` in the original code.
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
594 def pseq_to_subdict(phospho_pep):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
595 #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
596 # Strip "p" from phosphopeptide sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
597 dephospho_pep = re_phos.sub('',phospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
598
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
599 # Determine number of phosphoresidues in phosphopeptide
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
600 numps = len(phospho_pep) - len(dephospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
601
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
602 # Determine location(s) of phosphoresidue(s) in phosphopeptide
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
603 # (used later for Phosphoresidue, Sequence7, and Sequence10)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
604 ploc = [] #list of p locations
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
605 i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
606 p = phospho_pep
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
607 while i < numps:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
608 ploc.append(p.find("p"))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
609 p = p[:p.find("p")] + p[p.find("p")+1:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
610 i +=1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
611
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
612
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
613 # Establish nested dictionary
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
614 result = {}
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
615 result[SEQUENCE] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
616 result[UNIPROT_ID] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
617 result[DESCRIPTION] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
618 result[GENE_NAME] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
619 result[PHOSPHORESIDUE] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
620 result[SEQUENCE7] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
621 result[SEQUENCE10] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
622
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
623 # Add stripped sequence to dictionary
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
624 result[SEQUENCE].append(dephospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
625
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
626 # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
627 dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
628
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
629 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
630 ### Caller may elect to:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
631 ## try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
632 ## ...
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
633 ## except PreconditionError as pe:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
634 ## print("'{expression}': {message}".format(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
635 ## expression = pe.expression,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
636 ## message = pe.message
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
637 ## )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
638 ## )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
639 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
640 raise PreconditionError( dephospho_pep,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
641 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
642 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
643 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
644 raise PreconditionError( dephospho_pep,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
645 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
646 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
647 if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
648 raise PreconditionError( dephospho_pep,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
649 "dephosphorylated phosphopeptide does not match " +
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
650 "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
651 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
652 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
653 result[SEQUENCE] = [dephospho_pep]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
654 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
655 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
656 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
657 if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
658 raise PreconditionError( dephospho_pep,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
659 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
660 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
661 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
662 if len (UniProtSeqList) < 1:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
663 print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
664 # raise PreconditionError(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
665 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
666 # 'value has zero length'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
667 # )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
668 for UniProtSeq in UniProtSeqList:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
669 i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
670 phosphoresidues = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
671 seq7s_set = set()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
672 seq7s = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
673 seq10s_set = set()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
674 seq10s = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
675 while i < len(ploc):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
676 start = UniProtSeq.find(dephospho_pep)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
677 psite = start+ploc[i] #location of phosphoresidue on protein sequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
678
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
679 #add Phosphoresidue
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
680 phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
681 phosphoresidues.append(phosphosite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
682
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
683 #Add Sequence7
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
684 if psite < 7: #phospho_pep at N terminus
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
685 seq7 = str(UniProtSeq)[:psite+8]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
686 if seq7[psite] == "S": #if phosphosresidue is serine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
687 pres = "s"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
688 elif seq7[psite] == "T": #if phosphosresidue is threonine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
689 pres = "t"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
690 elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
691 pres = "y"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
692 else: # if not pSTY
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
693 pres = "?"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
694 seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
695 while len(seq7) < 15: #add appropriate number of "_" to the front
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
696 seq7 = "_" + seq7
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
697 elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
698 seq7 = str(UniProtSeq)[psite-7:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
699 if seq7[7] == "S":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
700 pres = "s"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
701 elif seq7[7] == "T":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
702 pres = "t"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
703 elif seq7[7] == "Y":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
704 pres = "y"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
705 else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
706 pres = "?"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
707 seq7 = seq7[:7] + pres + seq7[8:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
708 while len(seq7) < 15: #add appropriate number of "_" to the back
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
709 seq7 = seq7 + "_"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
710 else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
711 seq7 = str(UniProtSeq)[psite-7:psite+8]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
712 pres = "" #phosphoresidue
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
713 if seq7[7] == "S": #if phosphosresidue is serine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
714 pres = "s"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
715 elif seq7[7] == "T": #if phosphosresidue is threonine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
716 pres = "t"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
717 elif seq7[7] == "Y": #if phosphoresidue is tyrosine
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
718 pres = "y"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
719 else: # if not pSTY
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
720 pres = "?"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
721 seq7 = seq7[:7] + pres + seq7[8:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
722 if seq7 not in seq7s_set:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
723 seq7s.append(seq7)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
724 seq7s_set.add(seq7)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
725
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
726 #add Sequence10
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
727 if psite < 10: #phospho_pep at N terminus
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
728 seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
729 elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
730 seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
731 else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
732 seq10 = str(UniProtSeq)[psite-10:psite+11]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
733 seq10 = seq10[:10] + "p" + seq10[10:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
734 if seq10 not in seq10s_set:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
735 seq10s.append(seq10)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
736 seq10s_set.add(seq10)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
737
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
738 i+=1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
739
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
740 result[PHOSPHORESIDUE].append(phosphoresidues)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
741 result[SEQUENCE7].append(seq7s)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
742 # result[SEQUENCE10] is a list of lists of strings
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
743 result[SEQUENCE10].append(seq10s)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
744
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
745
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
746
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
747
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
748 r = list(zip(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
749 result[UNIPROT_ID],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
750 result[GENE_NAME],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
751 result[DESCRIPTION],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
752 result[PHOSPHORESIDUE]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
753 ))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
754 # Sort by `UniProt_ID`
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
755 # ref: https://stackoverflow.com/a/4174955/15509512
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
756 s = sorted(r, key=operator.itemgetter(0))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
757
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
758 result[UNIPROT_ID] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
759 result[GENE_NAME] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
760 result[DESCRIPTION] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
761 result[PHOSPHORESIDUE] = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
762
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
763 for r in s:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
764 result[UNIPROT_ID].append(r[0])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
765 result[GENE_NAME].append(r[1])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
766 result[DESCRIPTION].append(r[2])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
767 result[PHOSPHORESIDUE].append(r[3])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
768
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
769
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
770 #convert lists to strings in the dictionary
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
771 for key,value in result.items():
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
772 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
773 result[key] = '; '.join(map(str, value))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
774 elif key in [SEQUENCE10]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
775 # result[SEQUENCE10] is a list of lists of strings
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
776 joined_value = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
777 joined_set = set()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
778 sep = ''
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
779 for valL in value:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
780 # valL is a list of strings
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
781 for val in valL:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
782 # val is a string
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
783 if val not in joined_set:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
784 joined_set.add(val)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
785 #joined_value += sep + '; '.join(map(str, val))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
786 joined_value += sep + val
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
787 sep = '; '
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
788 # joined_value is a string
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
789 result[key] = joined_value
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
790
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
791
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
792 newstring = '; '.join(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
793 [', '.join(l) for l in result[PHOSPHORESIDUE]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
794 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
795 ### #separate the isoforms in PHOSPHORESIDUE column with ";"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
796 ### oldstring = result[PHOSPHORESIDUE]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
797 ### oldlist = list(oldstring)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
798 ### newstring = ""
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
799 ### i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
800 ### for e in oldlist:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
801 ### if e == ";":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
802 ### if numps > 1:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
803 ### if i%numps:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
804 ### newstring = newstring + ";"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
805 ### else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
806 ### newstring = newstring + ","
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
807 ### else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
808 ### newstring = newstring + ";"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
809 ### i +=1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
810 ### else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
811 ### newstring = newstring + e
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
812 result[PHOSPHORESIDUE] = newstring
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
813
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
814
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
815 #separate sequence7's by |
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
816 oldstring = result[SEQUENCE7]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
817 oldlist = oldstring
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
818 newstring = ""
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
819 for l in oldlist:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
820 for e in l:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
821 if e == ";":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
822 newstring = newstring + " |"
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
823 elif len(newstring) > 0 and 1 > newstring.count(e):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
824 newstring = newstring + " | " + e
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
825 elif 1 > newstring.count(e):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
826 newstring = newstring + e
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
827 result[SEQUENCE7] = newstring
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
828
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
829
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
830 return [phospho_pep, result]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
831
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
832 # Construct list of [string, dictionary] lists
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
833 # where the dictionary provides the SwissProt metadata for a phosphopeptide
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
834 result_list = [
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
835 catch(pseq_to_subdict,psequence)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
836 for psequence
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
837 in data_in[PHOSPHOPEPTIDE_MATCH]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
838 ]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
839
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
840
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
841 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
842 print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
843
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
844 # Construct dictionary from list of lists
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
845 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
846 UniProt_Info = {
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
847 result[0]:result[1]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
848 for result
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
849 in result_list
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
850 if result is not None
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
851 }
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
852
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
853
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
854 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
855 print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
856
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
857 #cosmetic: replace empty metadata values with N_A for phosphopeptides with no hits
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
858 p_peptide_list = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
859 for key in UniProt_Info:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
860 p_peptide_list.append(key)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
861 for nestedKey in UniProt_Info[key]:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
862 if UniProt_Info[key][nestedKey] == "":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
863 UniProt_Info[key][nestedKey] = N_A
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
864
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
865 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
866 print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
867
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
868 #convert UniProt_Info dictionary to dataframe
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
869 uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
870
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
871 #reorder columns to match expected output file
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
872 uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
873
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
874
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
875 cols = uniprot_df.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
876 #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
877 #uniprot_df = uniprot_df[cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
878 uniprot_df = uniprot_df[[
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
879 PHOSPHOPEPTIDE,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
880 SEQUENCE10,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
881 SEQUENCE7,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
882 GENE_NAME,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
883 PHOSPHORESIDUE,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
884 UNIPROT_ID,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
885 DESCRIPTION
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
886 ]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
887
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
888
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
889 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
890 print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
891
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
892 #concat to split then groupby to collapse
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
893 seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
894 for _, row in uniprot_df.iterrows()]).reset_index()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
895 seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
896
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
897 # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
898 #read in PhosphoSitePlus Regulatory Sites dataset
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
899 #ACE if (True):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
900 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
901 conn = sql.connect(uniprot_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
902 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
903 # Close SwissProt SQLite database
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
904 conn.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
905 #ACE # Array indexes are zero-based
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
906 #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
907 #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
908 #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
909 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
910 #ACE print(regsites_df)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
911 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
912 #ACE else:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
913 #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
914 #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
915 #ACE #ACE # Array indexes are zero-based
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
916 #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
917 #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
918 #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
919 #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
920 #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
921 #ACE #ACE ]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
922 #ACE #ACE REGSITE_COL_SITE7AA = 9
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
923 #ACE #ACE REGSITE_COL_PROTEIN = 1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
924 #ACE #ACE REGSITE_COL_DOMAIN = 10
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
925 #ACE #ACE REGSITE_COL_PMIDs = 15
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
926
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
927 # ... -------------- end read PSP_Regulatory_sites ------------------------------------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
928
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
929
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
930 #keep only the entries for the requested species (when one is specified) in dataframe
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
931 if len(species) > 0:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
932 print('Limit PhosphoSitesPlus records to species "' + species + '"')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
933 regsites_df = regsites_df[regsites_df.ORGANISM == species]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
934
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
935 #merge the seq7 df with the regsites df based off of the sequence7
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
936 merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
937 #ACE print(merge_df.columns.tolist()) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
938
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
939 #after merging df, select only the columns of interest - note that PROTEIN is absent here
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
940 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
941 #ACE print(merge_df.columns.tolist()) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
942 #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
943 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
944 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
945 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
946 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
947
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
948 #remove the columns that were combined
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
949 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
950
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
951 #ACE print(merge_df) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
952 #ACE print(merge_df.columns.tolist()) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
953
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
954 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
955 print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
956
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
957 #cosmetic changes to Function Phosphoresidue column
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
958 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
959
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
960 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
961 print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
962
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
963 i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
964 while i < len(fp_series):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
965 #remove the extra ";" so that it looks more professional
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
966 if fp_series[i] == "; ; ; ; ": #remove ; from empty hits
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
967 fp_series[i] = ""
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
968 while fp_series[i].endswith("; "): #remove ; from the ends
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
969 fp_series[i] = fp_series[i][:-2]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
970 while fp_series[i].startswith("; "): #remove ; from the beginning
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
971 fp_series[i] = fp_series[i][2:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
972 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
973 fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
974 fp_series[i] = fp_series[i].replace("; ; ", "; ")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
975
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
976 #turn blanks into N_A to signify the info was searched for but cannot be found
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
977 if fp_series[i] == "":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
978 fp_series[i] = N_A
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
979
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
980 i += 1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
981 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
982
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
983 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
984 print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
985
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
986 #merge uniprot df with merge df
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
987 uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
988
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
989 #collapse the merged df
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
990 uniprot_regsites_collapsed_df = pandas.DataFrame(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
991 uniprot_regsites_merged_df
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
992 .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
993 .apply(lambda x: ppep_join(x)))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
994 #.apply(lambda x: "%s" % ' | '.join(x)))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
995
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
996
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
997 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
998 print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
999
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1000 uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1001
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1002
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1003 #rename columns
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1004 uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1005
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1006
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1007
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1008 #select columns to be merged to uniprot_df
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1009 #ACE cols = regsites_df.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1010 #ACE print(cols) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1011 #ACE if len(cols) > 8:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1012 #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1013 #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1014 #ACE print(cols) #ACE
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1015 #ACE regsite_merge_df = regsites_df[cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1016
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1017 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1018 print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1019
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1020 #add the collapsed FUNCTION_PHOSPHORESIDUE column to uniprot_df, matching PHOSPHOPEPTIDE against the collapsed frame's 'ppp' column
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1021 uniprot_regsite_df = pandas.merge(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1022 left=uniprot_df,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1023 right=uniprot_regsites_collapsed_df,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1024 how='left',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1025 left_on=PHOSPHOPEPTIDE,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1026 right_on='ppp')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1027
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1028 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1029 print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1030
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1031 data_in.rename(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1032 {'Protein description': PHOSPHOPEPTIDE},
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1033 axis='columns',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1034 inplace=True
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1035 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1036
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1037
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1038
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1039 sort_start_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1040
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1041 #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1042 res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1043 data_in = data_in.loc[res2]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1044
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1045 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1046 print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1047
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1048
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1049
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1050 cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1051 upstream_data = upstream_data[cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1052
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1053 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1054 print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1055
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1056
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1057 #### #rename upstream columns in new list
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1058 #### new_cols = []
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1059 #### for name in cols:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1060 #### if "_NetworKIN" in name:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1061 #### name = name.split("_")[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1062 #### if " motif" in name:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1063 #### name = name.split(" motif")[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1064 #### if " sequence " in name:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1065 #### name = name.split(" sequence")[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1066 #### if "_Phosida" in name:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1067 #### name = name.split("_")[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1068 #### if "_PhosphoSite" in name:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1069 #### name = name.split("_")[0]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1070 #### new_cols.append(name)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1071
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1072 #rename upstream columns in new list
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
def col_rename(name):
    """Return a shortened form of an upstream-map column name.

    The rules below are applied in order, each one to the result of the
    previous one.  A rule fires when its marker substring occurs in the
    current name; the name is then truncated to the text preceding the
    first occurrence of the rule's separator.  A name that contains none
    of the markers is returned unchanged.
    """
    # (marker that triggers the rule, separator at which the name is cut)
    truncation_rules = (
        ("_NetworKIN", "_"),
        (" motif", " motif"),
        # the marker requires a trailing space, the cut point does not
        (" sequence ", " sequence"),
        ("_Phosida", "_"),
        ("_PhosphoSite", "_"),
    )
    for marker, separator in truncation_rules:
        if marker in name:
            # text before the first separator (same as split(separator)[0])
            name = name.partition(separator)[0]
    return name
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1085
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Run every Upstream Map column header through col_rename and install the
# cleaned headers on the dataframe.
new_cols = list(map(col_rename, cols))
upstream_data.columns = new_cols

# Progress report on stderr: process time relative to start_time.
end_time = time.process_time()  # timer
print(
    "%0.6f renamed columns for Upstream Map [1h_1]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1093
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1094
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Build upstream_data_cast: a copy of upstream_data whose first column is
# relabelled 'p_peptide' and filled with the phosphopeptide sequence taken
# from upstream_data's index.
upstream_data_cast = upstream_data.copy()
# NOTE: new_cols_cast is bound to the very same list object as new_cols (no
# copy is made), so relabelling element 0 is visible through new_cols too.
new_cols_cast = new_cols
new_cols_cast0 = 'p_peptide'
new_cols_cast[0] = new_cols_cast0
upstream_data_cast.columns = new_cols_cast
upstream_data_cast[new_cols_cast0] = upstream_data.index
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1103
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# --- -------------- begin read upstream_data_melt ------------------------------------
# ----------- Get melted kinase mapping data from SQLite database (start) -----------
# Run PPEP_MELT_SQL against the SQLite database at uniprot_sqlite.
conn = sql.connect(uniprot_sqlite)
try:
    upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# Work on a copy so that upstream_data_melt_df keeps the raw query result.
upstream_data_melt = upstream_data_melt_df.copy()
upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
# Apply the same header clean-up (col_rename) to each characterization label.
upstream_data_melt['characterization'] = [
    col_rename(s)
    for s in upstream_data_melt['characterization']
]

# Refresh the timer so the diagnostic reports the time of this step rather
# than a value captured before the query ran.
end_time = time.process_time()  # timer
print(
    '%0.6f upstream_data_melt_df initially has %d rows'
    % (end_time - start_time, len(upstream_data_melt.axes[0])),
    file=sys.stderr,
)
# Keep only the rows whose X column is exactly 'X'.
# ref: https://stackoverflow.com/a/27360130/15509512
# e.g. df.drop(df[df.score < 50].index, inplace=True)
upstream_data_melt.drop(
    upstream_data_melt[upstream_data_melt.X != 'X'].index,
    inplace=True,
)
end_time = time.process_time()  # timer
print(
    '%0.6f upstream_data_melt_df pre-dedup has %d rows'
    % (end_time - start_time, len(upstream_data_melt.axes[0])),
    file=sys.stderr,
)
#ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#ACE     print(upstream_data_melt)
# ----------- Get melted kinase mapping data from SQLite database (finish) -----------
# ... -------------- end read upstream_data_melt --------------------------------------

end_time = time.process_time()  # timer
print(
    "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer
# ... end read upstream_data_melt
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1137
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
upstream_data_melt_index = upstream_data_melt.index
upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']

end_time = time.process_time()  # timer
print(
    "%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Flatten the melted dataframe into a list of
# [p_peptide, characterization, X] rows.
upstream_delta_melt_LoL = upstream_data_melt.values.tolist()

# Start with one empty bucket per phosphopeptide in the Upstream Map.
melt_dict = {key: [] for key in upstream_map_p_peptide_list}

# Collect every characterization under its phosphopeptide.  A melted row
# whose phosphopeptide has no bucket means the two inputs do not match, so
# abort with an explanatory message.
for p_peptide, characterization, _ in upstream_delta_melt_LoL:
    if p_peptide not in melt_dict:
        # sys.exit rather than the bare exit(): the latter is a helper that
        # the site module installs for interactive sessions.
        sys.exit(
            'Phosphopeptide %s not found in ppep_mapping_db: "phosphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
            % (p_peptide)
        )
    melt_dict[p_peptide].append(characterization)

end_time = time.process_time()  # timer
print(
    "%0.6f appended peptide characterizations [1h_2b]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Collapse each phosphopeptide's list of characterizations into a single
# value with melt_join.
for key in upstream_map_p_peptide_list:
    melt_dict[key] = melt_join(melt_dict[key])

end_time = time.process_time()  # timer
print(
    "%0.6f concatenated multiple characterizations [1h_2c]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1170
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# map_dict is a dictionary of dictionaries: each phosphopeptide maps to a
# one-entry dict holding its joined characterizations under the
# PUTATIVE_UPSTREAM_DOMAINS key.
map_dict = {
    key: {PUTATIVE_UPSTREAM_DOMAINS: melt_dict[key]}
    for key in upstream_map_p_peptide_list
}

end_time = time.process_time()  # timer
print(
    "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1180
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Convert map_dict to a dataframe with one row per phosphopeptide: the outer
# keys arrive as columns, so transpose to turn them into the index.
map_df = pandas.DataFrame.from_dict(map_dict).transpose()
# Expose the index as an ordinary column as well.
map_df["p-peptide"] = map_df.index
# Swap the first two columns so that "p-peptide" comes first.
cols_map_df = map_df.columns.tolist()
cols_map_df = [cols_map_df[1], cols_map_df[0]]
map_df = map_df[cols_map_df]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1187
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
# Left-join map_df onto uniprot_regsite_df, matching the PHOSPHOPEPTIDE
# column on the left against "p-peptide" on the right.
output_df = pandas.merge(
    uniprot_regsite_df,
    map_df,
    how="left",
    left_on=PHOSPHOPEPTIDE,
    right_on="p-peptide",
)

# Keep only the columns wanted in the output, in this order.
output_cols = [
    PHOSPHOPEPTIDE,
    SEQUENCE10,
    SEQUENCE7,
    GENE_NAME,
    PHOSPHORESIDUE,
    UNIPROT_ID,
    DESCRIPTION,
    FUNCTION_PHOSPHORESIDUE,
    PUTATIVE_UPSTREAM_DOMAINS,
]
output_df = output_df[output_cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1201
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1202
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1203 # cols_output_prelim = output_df.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1204 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1205 # print("cols_output_prelim")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1206 # print(cols_output_prelim)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1207 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1208 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1209 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1210 # print("cols_output with p-peptide")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1211 # print(cols_output)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1212 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1213 # cols_output = [col for col in cols_output if not col == "p-peptide"]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1214 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1215 # print("cols_output")
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1216 # print(cols_output)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1217 #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1218 # output_df = output_df[cols_output]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1219
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1220 #join output_df back to quantitative columns in data_in df
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1221 quant_cols = data_in.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1222 quant_cols = quant_cols[1:]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1223 quant_data = data_in[quant_cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1224
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1225 ## ----------- Write merge/filter metadata to SQLite database (start) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1226 # Open the output SQLite database (output_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1227 conn = sql.connect(output_sqlite)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1228 cur = conn.cursor()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1229
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1230 cur.executescript(MRGFLTR_DDL)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1231
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1232 cur.execute(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1233 CITATION_INSERT_STMT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1234 ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1235 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1236 cur.execute(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1237 CITATION_INSERT_STMT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1238 ('mrgfltr_metadata', CITATION_INSERT_PSP)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1239 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1240 cur.execute(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1241 CITATION_INSERT_STMT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1242 ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1243 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1244 cur.execute(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1245 CITATION_INSERT_STMT,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1246 ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1247 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1248
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1249 # Read ppep-to-sequence LUT
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1250 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1251 #ACE ppep_lut_df.info(verbose=True)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1252 # write only metadata for merged/filtered records to SQLite
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1253 mrgfltr_metadata_df = output_df.copy()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1254 # replace phosphopeptide seq with ppep.id
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1255 mrgfltr_metadata_df = ppep_lut_df.merge(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1256 mrgfltr_metadata_df,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1257 left_on='ppep_seq',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1258 right_on=PHOSPHOPEPTIDE,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1259 how='inner'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1260 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1261 mrgfltr_metadata_df.drop(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1262 columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1263 inplace=True
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1264 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1265 #rename columns
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1266 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1267 #ACE mrgfltr_metadata_df.info(verbose=True)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1268 mrgfltr_metadata_df.to_sql(
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1269 'mrgfltr_metadata',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1270 con=conn,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1271 if_exists='append',
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1272 index=False,
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1273 method='multi'
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1274 )
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1275
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1276 # Close the output SQLite database
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1277 conn.close()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1278 ## ----------- Write merge/filter metadata to SQLite database (finish) -----------
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1279
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1280 output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1281 output_cols = output_df.columns.tolist()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1282 output_cols = output_cols[:-1]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1283 output_df = output_df[output_cols]
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1284
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1285 #cosmetic changes to Upstream column
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1286 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1287 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1288 i = 0
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1289 while i < len(us_series):
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1290 #turn blanks into N_A to signify the info was searched for but cannot be found
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1291 if us_series[i] == "":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1292 us_series[i] = N_A
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1293 i += 1
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1294 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1295
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1296 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1297 print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1298
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1299 (output_rows, output_cols) = output_df.shape
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1300
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1301 #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1302 output_df = output_df.convert_dtypes(convert_integer=True)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1303
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1304
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1305 #Write the final output to both the CSV file and the tab-separated file
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1306 output_df.to_csv(output_filename_csv, index=False)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1307 output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1308
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1309 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1310 print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1311
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1312 print('{:>10} phosphopeptides written to output'.format(str(output_rows)))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1313
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1314 end_time = time.process_time() #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1315 print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1316
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1317
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1318 #Rev. 7/1/2016
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1319 #Rev. 7/3/2016: fill NaN values in the Upstream column, replacing them with N/A
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1320 #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1321 #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1322 # read from SwissProt SQLite database
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1323 #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1324
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1325 #############################################
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1326 # copied from Excel Output Script.ipynb END #
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1327 #############################################
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1328
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1329 try:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1330 catch(mqpep_getswissprot,)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1331 exit(0)
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1332 except Exception as e:
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1333 exit('Internal error running mqpep_getswissprot(): %s' % (e))
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1334
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1335 if __name__ == "__main__":
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1336 __main__()
c1403d18c189 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
eschen42
parents:
diff changeset
1337