Mercurial > repos > eschen42 > mqppep_anova
view mqppep_mrgfltr.py @ 3:f9c13bc8e7ad draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182-dirty"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:26:06 +0000 |
parents | c1403d18c189 |
children | d4d531006735 |
line wrap: on
line source
#!/usr/bin/env python # Import the packages needed import argparse import os.path import sys import pandas import re import time import sqlite3 as sql from codecs import getreader as cx_getreader import sys import numpy as np # for sorting list of lists using operator.itemgetter import operator # for formatting stack-trace import traceback # for Aho-Corasick search for fixed set of substrings import ahocorasick import operator import hashlib # for shutil.copyfile(src, dest) import shutil # global constants N_A = 'N/A' # ref: https://stackoverflow.com/a/8915613/15509512 # answers: "How to handle exceptions in a list comprehensions" # usage: # from math import log # eggs = [1,3,0,3,2] # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) # producing: # for <built-in function log> # with args (0,) # exception: math domain error # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] def catch(func, *args, handle=lambda e : e, **kwargs): try: return func(*args, **kwargs) except Exception as e: print("For %s" % str(func)) print(" with args %s" % str(args)) print(" caught exception: %s" % str(e)) (ty, va, tb) = sys.exc_info() print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) exit(-1) return None # was handle(e) def ppep_join(x): x = [i for i in x if N_A != i] result = "%s" % ' | '.join(x) if result != "": return result else: return N_A def melt_join(x): tmp = {key.lower(): key for key in x} result = "%s" % ' | '.join([tmp[key] for key in tmp]) return result def __main__(): # Parse Command Line parser = argparse.ArgumentParser( description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.' ) # inputs: # Phosphopeptide data for experimental results, including the intensities # and the mapping to kinase domains, in tabular format. parser.add_argument( '--phosphopeptides', '-p', nargs=1, required=True, dest='phosphopeptides', help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format' ) # UniProtKB/SwissProt DB input, SQLite parser.add_argument( '--ppep_mapping_db', '-d', nargs=1, required=True, dest='ppep_mapping_db', help='UniProtKB/SwissProt SQLite Database' ) #ACE # PhosPhositesPlus DB input, csv #ACE parser.add_argument( #ACE '--psp_regulatory_sites', '-s', #ACE nargs=1, #ACE required=True, #ACE dest='psp_regulatory_sites_csv', #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header' #ACE ) # species to limit records chosed from PhosPhositesPlus parser.add_argument( '--species', '-x', nargs=1, required=False, default=[], dest='species', help='limit PhosphoSitePlus records to indicated species (field may be empty)' ) # outputs: # tabular output parser.add_argument( '--mrgfltr_tab', '-o', nargs=1, required=True, dest='mrgfltr_tab', help='Tabular output file for results' ) # CSV output parser.add_argument( '--mrgfltr_csv', '-c', nargs=1, required=True, dest='mrgfltr_csv', help='CSV output file for results' ) # SQLite output parser.add_argument( '--mrgfltr_sqlite', '-S', nargs=1, required=True, dest='mrgfltr_sqlite', help='SQLite output file for results' ) # "Make it so!" (parse the arguments) options = parser.parse_args() print("options: " + str(options)) # determine phosphopeptide ("upstream map") input tabular file access if options.phosphopeptides is None: exit('Argument "phosphopeptides" is required but not supplied') try: upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) input_file = open(upstream_map_filename_tab, 'r') input_file.close() except Exception as e: exit('Error parsing phosphopeptides argument: %s' % str(e)) # determine input SQLite access if options.ppep_mapping_db is None: exit('Argument "ppep_mapping_db" is required but not supplied') try: uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) input_file = open(uniprot_sqlite, 'rb') input_file.close() except Exception as e: exit('Error parsing ppep_mapping_db argument: %s' % str(e)) # copy input SQLite dataset to output SQLite dataset if options.mrgfltr_sqlite is None: exit('Argument "mrgfltr_sqlite" is required but not supplied') try: output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) shutil.copyfile(uniprot_sqlite, output_sqlite) except Exception as e: exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) #ACE # determine psp_regulatory_sites CSV access #ACE if options.psp_regulatory_sites_csv is None: #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) #ACE try: #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) #ACE input_file = open(phosphosite_filename_csv, 'r') #ACE input_file.close() #ACE except Exception as e: #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) # determine species to limit records from PSP_Regulatory_Sites if options.species is None: exit('Argument "species" is required (and may be empty) but not supplied') try: if len(options.species) > 0: species = options.species[0] else: species = '' except Exception as e: exit('Error parsing species argument: %s' % str(e)) # determine tabular output destination if options.mrgfltr_tab is None: exit('Argument "mrgfltr_tab" is required but not supplied') try: output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) output_file = open(output_filename_tab, 'w') output_file.close() except Exception as e: exit('Error parsing mrgfltr_tab argument: %s' % str(e)) # determine CSV output destination if options.mrgfltr_csv is None: exit('Argument "mrgfltr_csv" is required but not supplied') try: output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) output_file = open(output_filename_csv, 'w') output_file.close() except Exception as e: exit('Error parsing mrgfltr_csv argument: %s' % str(e)) def mqpep_getswissprot(): ############################################### # copied from Excel Output Script.ipynb BEGIN # ############################################### ########### String Constants ################# DEPHOSPHOPEP = 'DephosphoPep' DESCRIPTION = 'Description' FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)' GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites PHOSPHOPEPTIDE = 'Phosphopeptide' PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match' PHOSPHORESIDUE = 'Phosphoresidue' PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains' SEQUENCE = 'Sequence' SEQUENCE10 = 'Sequence10' SEQUENCE7 = 'Sequence7' SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA' SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA' UNIPROT_ID = 'UniProt_ID' UNIPROT_SEQ_AND_META_SQL = ''' select Uniprot_ID, Description, Gene_Name, Sequence, Organism_Name, Organism_ID, PE, SV from UniProtKB order by Sequence, UniProt_ID ''' UNIPROT_UNIQUE_SEQ_SQL = ''' select distinct Sequence from UniProtKB group by Sequence ''' PPEP_PEP_UNIPROTSEQ_SQL = ''' select distinct phosphopeptide, peptide, sequence from uniprotkb_pep_ppep_view order by sequence ''' PPEP_MELT_SQL = ''' SELECT DISTINCT phospho_peptide AS 'p_peptide', kinase_map AS 'characterization', 'X' AS 'X' FROM ppep_gene_site_view ''' # CREATE TABLE PSP_Regulatory_site ( # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, # domain TEXT, # ON_FUNCTION TEXT, # ON_PROCESS TEXT, # ON_PROT_INTERACT TEXT, # ON_OTHER_INTERACT TEXT, # notes TEXT, # organism TEXT # ); PSP_REGSITE_SQL = ''' SELECT DISTINCT SITE_PLUSMINUS_7AA , DOMAIN , ON_FUNCTION , ON_PROCESS , ON_PROT_INTERACT , ON_OTHER_INTERACT , NOTES , ORGANISM FROM PSP_Regulatory_site ''' PPEP_ID_SQL =''' SELECT id AS 'ppep_id', seq AS 'ppep_seq' FROM ppep ''' MRGFLTR_DDL =''' DROP VIEW IF EXISTS mrgfltr_metadata_view; DROP TABLE IF EXISTS mrgfltr_metadata; CREATE TABLE mrgfltr_metadata ( ppep_id INTEGER REFERENCES ppep(id) , Sequence10 TEXT , Sequence7 TEXT , GeneName TEXT , Phosphoresidue TEXT , UniProtID TEXT , Description TEXT , FunctionPhosphoresidue TEXT , PutativeUpstreamDomains TEXT , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE ) ; CREATE VIEW mrgfltr_metadata_view AS SELECT DISTINCT ppep.seq AS phospho_peptide , Sequence10 , Sequence7 , GeneName , Phosphoresidue , UniProtID , Description , FunctionPhosphoresidue , PutativeUpstreamDomains FROM ppep, mrgfltr_metadata WHERE mrgfltr_metadata.ppep_id = ppep.id ORDER BY ppep.seq ; ''' CITATION_INSERT_STMT = ''' INSERT INTO Citation ( ObjectName, CitationData ) VALUES (?,?) ''' CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' MRGFLTR_METADATA_COLUMNS = [ 'ppep_id', 'Sequence10', 'Sequence7', 'GeneName', 'Phosphoresidue', 'UniProtID', 'Description', 'FunctionPhosphoresidue', 'PutativeUpstreamDomains' ] ########### String Constants (end) ############ class Error(Exception): """Base class for exceptions in this module.""" pass class PreconditionError(Error): """Exception raised for errors in the input. Attributes: expression -- input expression in which the error occurred message -- explanation of the error """ def __init__(self, expression, message): self.expression = expression self.message = message #start_time = time.clock() #timer start_time = time.process_time() #timer #get keys from upstream tabular file using readline() # ref: https://stackoverflow.com/a/16713581/15509512 # answer to "Use codecs to read file with correct encoding" file1_encoded = open(upstream_map_filename_tab, 'rb') file1 = cx_getreader("latin-1")(file1_encoded) count = 0 upstream_map_p_peptide_list = [] re_tab = re.compile('^[^\t]*') while True: count += 1 # Get next line from file line = file1.readline() # if line is empty # end of file is reached if not line: break if count > 1: m = re_tab.match(line) upstream_map_p_peptide_list.append(m[0]) file1.close() file1_encoded.close() # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed re_phos = re.compile('p') dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ] end_time = time.process_time() #timer print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr) ## ----------- Get SwissProt data from SQLite database (start) ----------- # build UniProt sequence LUT and list of unique SwissProt sequences # Open SwissProt SQLite database conn = sql.connect(uniprot_sqlite) cur = conn.cursor() # Set up structures to hold SwissProt data uniprot_Sequence_List = [] UniProtSeqLUT = {} # Execute query for unique seqs without fetching the results yet uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) while batch := uniprot_unique_seq_cur.fetchmany(size=50): if None == batch: # handle case where no records are returned break for row in batch: Sequence = row[0] UniProtSeqLUT[(Sequence,DESCRIPTION)] = [] UniProtSeqLUT[(Sequence,GENE_NAME) ] = [] UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = [] UniProtSeqLUT[ Sequence ] = [] # Execute query for seqs and metadata without fetching the results yet uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) while batch := uniprot_seq_and_meta.fetchmany(size=50): if None == batch: # handle case where no records are returned break for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch: uniprot_Sequence_List.append(Sequence) UniProtSeqLUT[Sequence] = Sequence UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID) UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name) if OS != N_A: Description += ' OS=' + OS if OX != N_A: Description += ' OX=' + str(int(OX)) if Gene_Name != N_A: Description += ' GN=' + Gene_Name if PE != N_A: Description += ' PE=' + PE if SV != N_A: Description += ' SV=' + SV UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description) # Close SwissProt SQLite database; clean up local variables conn.close() Sequence = '' UniProt_ID = '' Description = '' Gene_Name = '' ## ----------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() #timer print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr) ## ----------- Get SwissProt data from SQLite database (start) ----------- # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) ) # Open SwissProt SQLite database conn = sql.connect(uniprot_sqlite) cur = conn.cursor() # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide DephosphoPep_UniProtSeq_LUT = {} # Set up dictionary to accumulate results PhosphoPep_UniProtSeq_LUT = {} # Execute query for tuples without fetching the results yet ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50): if None == batch: # handle case where no records are returned break for (phospho_pep, dephospho_pep, sequence) in batch: #do interesting stuff here... PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = [] DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = [] DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = [] DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = [] DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence)) if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]: DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence) for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: if phospho_pep != phospho_pep: print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep)) if phospho_pep not in PhosphoPep_UniProtSeq_LUT: PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep r = list(zip( [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]], [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]], [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]] )) # Sort by `UniProt_ID` # ref: https://stackoverflow.com/a/4174955/15509512 r = sorted(r, key=operator.itemgetter(0)) # Get one tuple for each `phospho_pep` # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] for (upid, gn, desc) in r: # Append pseudo-tuple per UniProt_ID but only when it is not present if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]: DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid) DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc) DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn) # Close SwissProt SQLite database; clean up local variables conn.close() # wipe local variables phospho_pep = dephospho_pep = sequence = 0 upid = gn = desc = r = '' ## ----------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() #timer print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr) print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list)))) #Read in Upstream tabular file # We are discarding the intensity data; so read it as text upstream_data = pandas.read_table( upstream_map_filename_tab, dtype='str', index_col = 0 ) end_time = time.process_time() #timer print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer upstream_data.index = upstream_map_p_peptide_list end_time = time.process_time() #timer print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer #trim upstream_data to include only the upstream map columns old_cols = upstream_data.columns.tolist() i = 0 first_intensity = -1 last_intensity = -1 intensity_re = re.compile('Intensity.*') for col_name in old_cols: m = intensity_re.match(col_name) if m: last_intensity = i if first_intensity == -1: first_intensity = i i += 1 #print('last intensity = %d' % last_intensity) col_PKCalpha = last_intensity + 2 col_firstIntensity = first_intensity data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1] if upstream_data.empty: print("upstream_data is empty") exit(0) data_in = upstream_data.copy(deep=True)[data_in_cols] # Convert floating-point integers to int64 integers # ref: https://stackoverflow.com/a/68497603/15509512 data_in[list(data_in.columns[1:])] = data_in[ list(data_in.columns[1:])].astype('float64').apply(np.int64) #create another phosphopeptide column that will be used to join later; # MAY need to change depending on Phosphopeptide column position #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index end_time = time.process_time() #timer print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer # Produce a dictionary of metadata for a single phosphopeptide. # This is a replacement of `UniProtInfo_subdict` in the original code. def pseq_to_subdict(phospho_pep): #ACE print("calling pseq_to_subdict, %s" % phospho_pep); # Strip "p" from phosphopeptide sequence dephospho_pep = re_phos.sub('',phospho_pep) # Determine number of phosphoresidues in phosphopeptide numps = len(phospho_pep) - len(dephospho_pep) # Determine location(s) of phosphoresidue(s) in phosphopeptide # (used later for Phosphoresidue, Sequence7, and Sequence10) ploc = [] #list of p locations i = 0 p = phospho_pep while i < numps: ploc.append(p.find("p")) p = p[:p.find("p")] + p[p.find("p")+1:] i +=1 # Establish nested dictionary result = {} result[SEQUENCE] = [] result[UNIPROT_ID] = [] result[DESCRIPTION] = [] result[GENE_NAME] = [] result[PHOSPHORESIDUE] = [] result[SEQUENCE7] = [] result[SEQUENCE10] = [] # Add stripped sequence to dictionary result[SEQUENCE].append(dephospho_pep) # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep] # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT ### Caller may elect to: ## try: ## ... ## except PreconditionError as pe: ## print("'{expression}': {message}".format( ## expression = pe.expression, ## message = pe.message)) ## ) ## ) if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT' ) if phospho_pep not in PhosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT' ) if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]: raise PreconditionError( dephospho_pep, "dephosphorylated phosphopeptide does not match " + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " + PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] ) result[SEQUENCE] = [dephospho_pep] result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT' ) UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] if len (UniProtSeqList) < 1: print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep) # raise PreconditionError( # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", # 'value has zero length' # ) for UniProtSeq in UniProtSeqList: i = 0 phosphoresidues = [] seq7s_set = set() seq7s = [] seq10s_set = set() seq10s = [] while i < len(ploc): start = UniProtSeq.find(dephospho_pep) psite = start+ploc[i] #location of phosphoresidue on protein sequence #add Phosphoresidue phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1) phosphoresidues.append(phosphosite) #Add Sequence7 if psite < 7: #phospho_pep at N terminus seq7 = str(UniProtSeq)[:psite+8] if seq7[psite] == "S": #if phosphosresidue is serine pres = "s" elif seq7[psite] == "T": #if phosphosresidue is threonine pres = "t" elif seq7[psite] == "Y": #if phosphoresidue is tyrosine pres = "y" else: # if not pSTY pres = "?" seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8] while len(seq7) < 15: #add appropriate number of "_" to the front seq7 = "_" + seq7 elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus seq7 = str(UniProtSeq)[psite-7:] if seq7[7] == "S": pres = "s" elif seq7[7] == "T": pres = "t" elif seq7[7] == "Y": pres = "y" else: pres = "?" seq7 = seq7[:7] + pres + seq7[8:] while len(seq7) < 15: #add appropriate number of "_" to the back seq7 = seq7 + "_" else: seq7 = str(UniProtSeq)[psite-7:psite+8] pres = "" #phosphoresidue if seq7[7] == "S": #if phosphosresidue is serine pres = "s" elif seq7[7] == "T": #if phosphosresidue is threonine pres = "t" elif seq7[7] == "Y": #if phosphoresidue is tyrosine pres = "y" else: # if not pSTY pres = "?" seq7 = seq7[:7] + pres + seq7[8:] if seq7 not in seq7s_set: seq7s.append(seq7) seq7s_set.add(seq7) #add Sequence10 if psite < 10: #phospho_pep at N terminus seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11] elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:] else: seq10 = str(UniProtSeq)[psite-10:psite+11] seq10 = seq10[:10] + "p" + seq10[10:] if seq10 not in seq10s_set: seq10s.append(seq10) seq10s_set.add(seq10) i+=1 result[PHOSPHORESIDUE].append(phosphoresidues) result[SEQUENCE7].append(seq7s) # result[SEQUENCE10] is a list of lists of strings result[SEQUENCE10].append(seq10s) r = list(zip( result[UNIPROT_ID], result[GENE_NAME], result[DESCRIPTION], result[PHOSPHORESIDUE] )) # Sort by `UniProt_ID` # ref: https://stackoverflow.com//4174955/15509512 s = sorted(r, key=operator.itemgetter(0)) result[UNIPROT_ID] = [] result[GENE_NAME] = [] result[DESCRIPTION] = [] result[PHOSPHORESIDUE] = [] for r in s: result[UNIPROT_ID].append(r[0]) result[GENE_NAME].append(r[1]) result[DESCRIPTION].append(r[2]) result[PHOSPHORESIDUE].append(r[3]) #convert lists to strings in the dictionary for key,value in result.items(): if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: result[key] = '; '.join(map(str, value)) elif key in [SEQUENCE10]: # result[SEQUENCE10] is a list of lists of strings joined_value = '' joined_set = set() sep = '' for valL in value: # valL is a list of strings for val in valL: # val is a string if val not in joined_set: joined_set.add(val) #joined_value += sep + '; '.join(map(str, val)) joined_value += sep + val sep = '; ' # joined_value is a string result[key] = joined_value newstring = '; '.join( [', '.join(l) for l in result[PHOSPHORESIDUE]] ) ### #separate the isoforms in PHOSPHORESIDUE column with ";" ### oldstring = result[PHOSPHORESIDUE] ### oldlist = list(oldstring) ### newstring = "" ### i = 0 ### for e in oldlist: ### if e == ";": ### if numps > 1: ### if i%numps: ### newstring = newstring + ";" ### else: ### newstring = newstring + "," ### else: ### newstring = newstring + ";" ### i +=1 ### else: ### newstring = newstring + e result[PHOSPHORESIDUE] = newstring #separate sequence7's by | oldstring = result[SEQUENCE7] oldlist = oldstring newstring = "" for l in oldlist: for e in l: if e == ";": newstring = newstring + " |" elif len(newstring) > 0 and 1 > newstring.count(e): newstring = newstring + " | " + e elif 1 > newstring.count(e): newstring = newstring + e result[SEQUENCE7] = newstring return [phospho_pep, result] # Construct list of [string, dictionary] lists # where the dictionary provides the SwissProt metadata for a phosphopeptide result_list = [ catch(pseq_to_subdict,psequence) for psequence in data_in[PHOSPHOPEPTIDE_MATCH] ] end_time = time.process_time() #timer print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer # Construct dictionary from list of lists # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ UniProt_Info = { result[0]:result[1] for result in result_list if result is not None } end_time = time.process_time() #timer print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer #cosmetic: add N_A to phosphopeptide rows with no hits p_peptide_list = [] for key in UniProt_Info: p_peptide_list.append(key) for nestedKey in UniProt_Info[key]: if UniProt_Info[key][nestedKey] == "": UniProt_Info[key][nestedKey] = N_A end_time = time.process_time() #timer print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer #convert UniProt_Info dictionary to dataframe uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info)) #reorder columns to match expected output file uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too cols = uniprot_df.columns.tolist() #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] #uniprot_df = uniprot_df[cols] uniprot_df = uniprot_df[[ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, UNIPROT_ID, DESCRIPTION ]] end_time = time.process_time() #timer print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer #concat to split then groupby to collapse seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | ')) for _, row in uniprot_df.iterrows()]).reset_index() seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE] # --- -------------- begin read PSP_Regulatory_sites --------------------------------- #read in PhosphoSitePlus Regulatory Sites dataset #ACE if (True): ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- conn = sql.connect(uniprot_sqlite) regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) # Close SwissProt SQLite database conn.close() #ACE # Array indexes are zero-based #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT' #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM'] #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also #ACE print(regsites_df) ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) ----------- #ACE else: #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3) #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA #ACE #ACE # Array indexes are zero-based #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID' #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA' #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT' #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES' #ACE #ACE ] #ACE #ACE REGSITE_COL_SITE7AA = 9 #ACE #ACE REGSITE_COL_PROTEIN = 1 #ACE #ACE REGSITE_COL_DOMAIN = 10 #ACE #ACE REGSITE_COL_PMIDs = 15 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ #keep only the human entries in dataframe if len(species) > 0: print('Limit PhosphoSitesPlus records to species "' + species + '"') regsites_df = regsites_df[regsites_df.ORGANISM == species] #merge the seq7 df with the regsites df based off of the sequence7 merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left') #ACE print(merge_df.columns.tolist()) #ACE #after merging df, select only the columns of interest - note that PROTEIN is absent here merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]] #ACE print(merge_df.columns.tolist()) #ACE #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="") merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") #remove the columns that were combined merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]] #ACE print(merge_df) #ACE #ACE print(merge_df.columns.tolist()) #ACE end_time = time.process_time() #timer print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer #cosmetic changes to Function Phosphoresidue column fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) end_time = time.process_time() #timer print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer i = 0 while i < len(fp_series): #remove the extra ";" so that it looks more professional if fp_series[i] == "; ; ; ; ": #remove ; from empty hits fp_series[i] = "" while fp_series[i].endswith("; "): #remove ; from the ends fp_series[i] = fp_series[i][:-2] while fp_series[i].startswith("; "): #remove ; from the beginning fp_series[i] = fp_series[i][2:] fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") fp_series[i] = fp_series[i].replace("; ; ; ", "; ") fp_series[i] = fp_series[i].replace("; ; ", "; ") #turn blanks into N_A to signify the info was searched for but cannot be found if fp_series[i] == "": fp_series[i] = N_A i += 1 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series end_time = time.process_time() #timer print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer #merge uniprot df with merge df uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left") #collapse the merged df uniprot_regsites_collapsed_df = pandas.DataFrame( uniprot_regsites_merged_df .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE] .apply(lambda x: ppep_join(x))) #.apply(lambda x: "%s" % ' | '.join(x))) end_time = time.process_time() #timer print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column #rename columns uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp'] #select columns to be merged to uniprot_df #ACE cols = regsites_df.columns.tolist() #ACE print(cols) #ACE #ACE if len(cols) > 8: #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] #ACE print(cols) #ACE #ACE regsite_merge_df = regsites_df[cols] end_time = time.process_time() #timer print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer #add columns based on Sequence7 matching site_+/-7_AA uniprot_regsite_df = pandas.merge( left=uniprot_df, right=uniprot_regsites_collapsed_df, how='left', left_on=PHOSPHOPEPTIDE, right_on='ppp') end_time = time.process_time() #timer print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer data_in.rename( {'Protein description': PHOSPHOPEPTIDE}, axis='columns', inplace=True ) sort_start_time = time.process_time() #timer #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold()) data_in = data_in.loc[res2] end_time = time.process_time() #timer print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer cols = [old_cols[0]] + old_cols[col_PKCalpha-1:] upstream_data = upstream_data[cols] end_time = time.process_time() #timer print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer #### #rename upstream columns in new list #### new_cols = [] #### for name in cols: #### if "_NetworKIN" in name: #### name = name.split("_")[0] #### if " motif" in name: #### name = name.split(" motif")[0] #### if " sequence " in name: #### name = name.split(" sequence")[0] #### if "_Phosida" in name: #### name = name.split("_")[0] #### if "_PhosphoSite" in name: #### name = name.split("_")[0] #### new_cols.append(name) #rename upstream columns in new list def col_rename(name): if "_NetworKIN" in name: name = name.split("_")[0] if " motif" in name: name = name.split(" motif")[0] if " sequence " in name: name = name.split(" sequence")[0] if "_Phosida" in name: name = name.split("_")[0] if "_PhosphoSite" in name: name = name.split("_")[0] return name new_cols = [col_rename(col) for col in cols] upstream_data.columns = new_cols end_time = time.process_time() #timer print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer # Create upstream_data_cast as a copy of upstream_data # but with first column substituted by the phosphopeptide sequence upstream_data_cast = upstream_data.copy() new_cols_cast = new_cols new_cols_cast[0] = 'p_peptide' upstream_data_cast.columns = new_cols_cast upstream_data_cast['p_peptide'] = upstream_data.index new_cols_cast0 = new_cols_cast[0] # --- -------------- begin read upstream_data_melt ------------------------------------ ## ----------- Get melted kinase mapping data from SQLite database (start) ----------- conn = sql.connect(uniprot_sqlite) upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) # Close SwissProt SQLite database conn.close() upstream_data_melt = upstream_data_melt_df.copy() upstream_data_melt.columns = ['p_peptide', 'characterization', 'X'] upstream_data_melt['characterization'] = [ col_rename(s) for s in upstream_data_melt['characterization'] ] print('%0.6f upstream_data_melt_df initially has %d rows' % (end_time - start_time, len(upstream_data_melt.axes[0])) , file=sys.stderr) # ref: https://stackoverflow.com/a/27360130/15509512 # e.g. df.drop(df[df.score < 50].index, inplace=True) upstream_data_melt.drop( upstream_data_melt[upstream_data_melt.X != 'X'].index, inplace = True ) print('%0.6f upstream_data_melt_df pre-dedup has %d rows' % (end_time - start_time, len(upstream_data_melt.axes[0])) , file=sys.stderr) #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also #ACE print(upstream_data_melt) ## ----------- Get melted kinase mapping data from SQLite database (finish) ----------- # ... -------------- end read upstream_data_melt -------------------------------------- end_time = time.process_time() #timer print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer # ... end read upstream_data_melt upstream_data_melt_index = upstream_data_melt.index upstream_data_melt_p_peptide = upstream_data_melt['p_peptide'] end_time = time.process_time() #timer print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer upstream_delta_melt_LoL = upstream_data_melt.values.tolist() melt_dict = {} for key in upstream_map_p_peptide_list: melt_dict[key] = [] for el in upstream_delta_melt_LoL: (p_peptide, characterization, X) = tuple(el) if p_peptide in melt_dict: melt_dict[p_peptide].append(characterization) else: exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide)) end_time = time.process_time() #timer print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer # for key in upstream_map_p_peptide_list: # melt_dict[key] = ' | '.join(melt_dict[key]) for key in upstream_map_p_peptide_list: melt_dict[key] = melt_join(melt_dict[key]) end_time = time.process_time() #timer print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer # map_dict is a dictionary of dictionaries map_dict = {} for key in upstream_map_p_peptide_list: map_dict[key] = {} map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] end_time = time.process_time() #timer print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer #convert map_dict to dataframe map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict)) map_df["p-peptide"] = map_df.index #make index a column too cols_map_df = map_df.columns.tolist() cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] map_df = map_df[cols_map_df] #join map_df to uniprot_regsite_df output_df = uniprot_regsite_df.merge( map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide") output_df = output_df[ [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE, PUTATIVE_UPSTREAM_DOMAINS ] ] # cols_output_prelim = output_df.columns.tolist() # # print("cols_output_prelim") # print(cols_output_prelim) # # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] # # print("cols_output with p-peptide") # print(cols_output) # # cols_output = [col for col in cols_output if not col == "p-peptide"] # # print("cols_output") # print(cols_output) # # output_df = output_df[cols_output] #join output_df back to quantitative columns in data_in df quant_cols = data_in.columns.tolist() quant_cols = quant_cols[1:] quant_data = data_in[quant_cols] ## ----------- Write merge/filter metadata to SQLite database (start) ----------- # Open SwissProt SQLite database conn = sql.connect(output_sqlite) cur = conn.cursor() cur.executescript(MRGFLTR_DDL) cur.execute( CITATION_INSERT_STMT, ('mrgfltr_metadata_view', CITATION_INSERT_PSP) ) cur.execute( CITATION_INSERT_STMT, ('mrgfltr_metadata', CITATION_INSERT_PSP) ) cur.execute( CITATION_INSERT_STMT, ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF) ) cur.execute( CITATION_INSERT_STMT, ('mrgfltr_metadata', CITATION_INSERT_PSP_REF) ) # Read ppep-to-sequence LUT ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) #ACE ppep_lut_df.info(verbose=True) # write only metadata for merged/filtered records to SQLite mrgfltr_metadata_df = output_df.copy() # replace phosphopeptide seq with ppep.id mrgfltr_metadata_df = ppep_lut_df.merge( mrgfltr_metadata_df, left_on='ppep_seq', right_on=PHOSPHOPEPTIDE, how='inner' ) mrgfltr_metadata_df.drop( columns=[PHOSPHOPEPTIDE, 'ppep_seq'], inplace=True ) #rename columns mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS #ACE mrgfltr_metadata_df.info(verbose=True) mrgfltr_metadata_df.to_sql( 'mrgfltr_metadata', con=conn, if_exists='append', index=False, method='multi' ) # Close SwissProt SQLite database conn.close() ## ----------- Write merge/filter metadata to SQLite database (finish) ----------- output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH) output_cols = output_df.columns.tolist() output_cols = output_cols[:-1] output_df = output_df[output_cols] #cosmetic changes to Upstream column output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) i = 0 while i < len(us_series): #turn blanks into N_A to signify the info was searched for but cannot be found if us_series[i] == "": us_series[i] = N_A i += 1 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series end_time = time.process_time() #timer print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer (output_rows, output_cols) = output_df.shape #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True) output_df = output_df.convert_dtypes(convert_integer=True) #Output onto Final CSV file output_df.to_csv(output_filename_csv, index=False) output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False) end_time = time.process_time() #timer print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer print('{:>10} phosphopeptides written to output'.format(str(output_rows))) end_time = time.process_time() #timer print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer #Rev. 7/1/2016 #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ # read from SwissProt SQLite database #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper ############################################# # copied from Excel Output Script.ipynb END # ############################################# try: catch(mqpep_getswissprot,) exit(0) except Exception as e: exit('Internal error running mqpep_getswissprot(): %s' % (e)) if __name__ == "__main__": __main__()