Mercurial > repos > eschen42 > mqppep_anova

#!/usr/bin/env python

# Import the packages needed
import argparse
import os.path
import sys

import pandas
import re
import time
import sqlite3 as sql
from codecs import getreader as cx_getreader
import sys
import numpy as np

#   for sorting list of lists using operator.itemgetter
import operator

#   for formatting stack-trace
import traceback

#   for Aho-Corasick search for fixed set of substrings
import ahocorasick
import operator
import hashlib

# for shutil.copyfile(src, dest)
import shutil

# global constants
N_A = 'N/A'

# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#       from math import log
#       eggs = [1,3,0,3,2]
#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
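#   producing (when the exception is handled without exiting):
#       for <built-in function log>
#         with args (0,)
#         exception: math domain error
#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
#   NOTE: the `catch` defined below is stricter than this example suggests:
#   after reporting the exception (with a stack trace) it terminates the
#   program with exit code -1, so no list would be printed for `eggs` above.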
def catch(func, *args, handle=lambda e : e, **kwargs):
    """Call ``func(*args, **kwargs)``; report any ``Exception`` and exit.

    Parameters:
        func -- the callable to invoke
        *args -- positional arguments forwarded to ``func``
        handle -- unused; retained only so that existing callers passing
                  ``handle=`` keep working (the original design returned
                  ``handle(e)`` instead of exiting)
        **kwargs -- keyword arguments forwarded to ``func``

    Returns:
        Whatever ``func`` returns when it does not raise.

    Raises:
        SystemExit -- with code -1 when ``func`` raises an ``Exception``;
                      the function, its positional arguments, the exception
                      message and the formatted stack trace are printed to
                      standard output first.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print("  with args %s" % str(args))
        print("  caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # sys.exit is used instead of the `exit` builtin because the latter
        # is injected by the `site` module for interactive use and is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(-1)

def ppep_join(x):
    """Join the members of `x` that are not N_A with ' | '.

    Returns N_A when the joined string is empty (for instance when every
    member of `x` equals N_A, or `x` itself is empty).
    """
    kept = [member for member in x if member != N_A]
    joined = ' | '.join(kept)
    return N_A if joined == "" else joined

def melt_join(x):
    """Join the strings in `x` with ' | ', de-duplicating case-insensitively.

    Strings that differ only by case are collapsed into one entry, which
    keeps the position of the first occurrence and the spelling of the
    last occurrence.
    """
    by_lowercase = {}
    for member in x:
        # Re-assigning an existing key keeps its original insertion position
        # while replacing the stored spelling.
        by_lowercase[member.lower()] = member
    return ' | '.join(by_lowercase.values())

def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(
        description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
        )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        '--phosphopeptides', '-p',
        nargs=1,
        required=True,
        dest='phosphopeptides',
        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
        )
    #   UniProtKB/SwissProt DB input, SQLite
    parser.add_argument(
        '--ppep_mapping_db', '-d',
        nargs=1,
        required=True,
        dest='ppep_mapping_db',
        help='UniProtKB/SwissProt SQLite Database'
        )
    #ACE #   PhosPhositesPlus DB input, csv
    #ACE parser.add_argument(
    #ACE     '--psp_regulatory_sites', '-s',
    #ACE     nargs=1,
    #ACE     required=True,
    #ACE     dest='psp_regulatory_sites_csv',
    #ACE     help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
    #ACE     )
    #   species to limit records chosen from PhosphoSitePlus
    parser.add_argument(
        '--species', '-x',
        nargs=1,
        required=False,
        default=[],
        dest='species',
        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
        )

    # outputs:
    #   tabular output
    parser.add_argument(
        '--mrgfltr_tab', '-o',
        nargs=1,
        required=True,
        dest='mrgfltr_tab',
        help='Tabular output file for results'
        )
    #   CSV output
    parser.add_argument(
        '--mrgfltr_csv', '-c',
        nargs=1,
        required=True,
        dest='mrgfltr_csv',
        help='CSV output file for results'
        )
    #   SQLite output
    parser.add_argument(
        '--mrgfltr_sqlite', '-S',
        nargs=1,
        required=True,
        dest='mrgfltr_sqlite',
        help='SQLite output file for results'
        )

    # "Make it so!" (parse the arguments)
    options = parser.parse_args()
    print("options: " + str(options))

    # determine phosphopeptide ("upstream map") input tabular file access
    if options.phosphopeptides is None:
        exit('Argument "phosphopeptides" is required but not supplied')
    try:
        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
        input_file = open(upstream_map_filename_tab, 'r')
        input_file.close()
    except Exception as e:
        exit('Error parsing phosphopeptides argument: %s' % str(e))

    # determine input SQLite access
    if options.ppep_mapping_db is None:
        exit('Argument "ppep_mapping_db" is required but not supplied')
    try:
        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
        input_file = open(uniprot_sqlite, 'rb')
        input_file.close()
    except Exception as e:
        exit('Error parsing ppep_mapping_db argument: %s' % str(e))

    # copy input SQLite dataset to output SQLite dataset
    if options.mrgfltr_sqlite is None:
        exit('Argument "mrgfltr_sqlite" is required but not supplied')
    try:
        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
        shutil.copyfile(uniprot_sqlite, output_sqlite)
    except Exception as e:
        exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))

    #ACE # determine psp_regulatory_sites CSV access
    #ACE if options.psp_regulatory_sites_csv is None:
    #ACE     exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
    #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
    #ACE try:
    #ACE     phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
    #ACE     input_file = open(phosphosite_filename_csv, 'r')
    #ACE     input_file.close()
    #ACE except Exception as e:
    #ACE     exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
    #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)

    # determine species to limit records from PSP_Regulatory_Sites
    if options.species is None:
        exit('Argument "species" is required (and may be empty) but not supplied')
    try:
        if len(options.species) > 0:
            species = options.species[0]
        else:
            species = ''
    except Exception as e:
        exit('Error parsing species argument: %s' % str(e))

    # determine tabular output destination
    if options.mrgfltr_tab is None:
        exit('Argument "mrgfltr_tab" is required but not supplied')
    try:
        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
        output_file = open(output_filename_tab, 'w')
        output_file.close()
    except Exception as e:
        exit('Error parsing mrgfltr_tab argument: %s' % str(e))

    # determine CSV output destination
    if options.mrgfltr_csv is None:
        exit('Argument "mrgfltr_csv" is required but not supplied')
    try:
        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
        output_file = open(output_filename_csv, 'w')
        output_file.close()
    except Exception as e:
        exit('Error parsing mrgfltr_csv argument: %s' % str(e))


    def mqpep_getswissprot():

        ###############################################
        # copied from Excel Output Script.ipynb BEGIN #
        ###############################################

        ###########  String Constants  #################
        DEPHOSPHOPEP = 'DephosphoPep'
        DESCRIPTION = 'Description'
        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
        GENE_NAME = 'Gene_Name'                     # Gene Name from UniProtKB
        ON_FUNCTION = 'ON_FUNCTION'                 # ON_FUNCTION column from PSP_Regulatory_Sites
        ON_NOTES = 'NOTES'                          # NOTES column from PSP_Regulatory_Sites
        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'     # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
        ON_PROCESS = 'ON_PROCESS'                   # ON_PROCESS column from PSP_Regulatory_Sites
        ON_PROT_INTERACT = 'ON_PROT_INTERACT'       # ON_PROT_INTERACT column from PSP_Regulatory_Sites
        PHOSPHOPEPTIDE = 'Phosphopeptide'
        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
        PHOSPHORESIDUE = 'Phosphoresidue'
        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
        SEQUENCE = 'Sequence'
        SEQUENCE10 = 'Sequence10'
        SEQUENCE7 = 'Sequence7'
        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
        UNIPROT_ID = 'UniProt_ID'
        UNIPROT_SEQ_AND_META_SQL = '''
            select    Uniprot_ID, Description, Gene_Name, Sequence,
                      Organism_Name, Organism_ID, PE, SV
                 from UniProtKB
             order by Sequence, UniProt_ID
        '''
        UNIPROT_UNIQUE_SEQ_SQL = '''
            select distinct Sequence
                       from UniProtKB
                   group by Sequence
        '''
        PPEP_PEP_UNIPROTSEQ_SQL = '''
            select distinct phosphopeptide, peptide, sequence
                       from uniprotkb_pep_ppep_view
                   order by sequence
        '''
        PPEP_MELT_SQL = '''
            SELECT DISTINCT
                phospho_peptide AS 'p_peptide',
                kinase_map AS 'characterization',
                'X' AS 'X'
            FROM ppep_gene_site_view
        '''
        # CREATE TABLE PSP_Regulatory_site (
        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
        #   domain             TEXT,
        #   ON_FUNCTION        TEXT,
        #   ON_PROCESS         TEXT,
        #   ON_PROT_INTERACT   TEXT,
        #   ON_OTHER_INTERACT  TEXT,
        #   notes              TEXT,
        #   organism           TEXT
        # );
        PSP_REGSITE_SQL = '''
            SELECT DISTINCT
              SITE_PLUSMINUS_7AA ,
              DOMAIN             ,
              ON_FUNCTION        ,
              ON_PROCESS         ,
              ON_PROT_INTERACT   ,
              ON_OTHER_INTERACT  ,
              NOTES              ,
              ORGANISM
            FROM PSP_Regulatory_site
        '''
        PPEP_ID_SQL ='''
            SELECT
                id AS 'ppep_id',
                seq AS 'ppep_seq'
            FROM ppep
        '''
        MRGFLTR_DDL ='''
        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
        DROP TABLE IF EXISTS mrgfltr_metadata;
        CREATE TABLE mrgfltr_metadata
          ( ppep_id                 INTEGER REFERENCES ppep(id)
          , Sequence10              TEXT
          , Sequence7               TEXT
          , GeneName                TEXT
          , Phosphoresidue          TEXT
          , UniProtID               TEXT
          , Description             TEXT
          , FunctionPhosphoresidue  TEXT
          , PutativeUpstreamDomains TEXT
          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW mrgfltr_metadata_view AS
          SELECT DISTINCT
              ppep.seq             AS phospho_peptide
            , Sequence10
            , Sequence7
            , GeneName
            , Phosphoresidue
            , UniProtID
            , Description
            , FunctionPhosphoresidue
            , PutativeUpstreamDomains
          FROM
            ppep, mrgfltr_metadata
          WHERE
              mrgfltr_metadata.ppep_id = ppep.id
          ORDER BY
            ppep.seq
            ;
        '''

        CITATION_INSERT_STMT = '''
          INSERT INTO Citation (
            ObjectName,
            CitationData
          ) VALUES (?,?)
          '''
        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'

        MRGFLTR_METADATA_COLUMNS = [
            'ppep_id',
            'Sequence10',
            'Sequence7',
            'GeneName',
            'Phosphoresidue',
            'UniProtID',
            'Description',
            'FunctionPhosphoresidue',
            'PutativeUpstreamDomains'
            ]

        ###########  String Constants (end) ############

        class Error(Exception):
            """Common base class for the exceptions defined in this function."""

        class PreconditionError(Error):
            """Exception raised for errors in the input.

            Attributes:
                expression -- input expression in which the error occurred
                message -- explanation of the error
            """

            def __init__(self, expression, message):
                # Forward both values to the base class so that `args`,
                # `str()` and tracebacks carry them regardless of whether the
                # constructor was called positionally or by keyword.
                super().__init__(expression, message)
                self.expression = expression
                self.message = message

        #start_time = time.clock() #timer
        start_time = time.process_time() #timer

        #get keys from upstream tabular file using readline()
        # ref: https://stackoverflow.com/a/16713581/15509512
        #      answer to "Use codecs to read file with correct encoding"
        file1_encoded = open(upstream_map_filename_tab, 'rb')
        file1 = cx_getreader("latin-1")(file1_encoded)

        count = 0
        upstream_map_p_peptide_list = []
        re_tab = re.compile('^[^\t]*')
        while True:
            count += 1
            # Get next line from file
            line = file1.readline()
            # if line is empty
            # end of file is reached
            if not line:
                break
            if count > 1:
                m = re_tab.match(line)
                upstream_map_p_peptide_list.append(m[0])
        file1.close()
        file1_encoded.close()

        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
        re_phos = re.compile('p')
        dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ]

        end_time = time.process_time() #timer
        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)

        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build UniProt sequence LUT and list of unique SwissProt sequences

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur  = conn.cursor()

        # Set up structures to hold SwissProt data

        uniprot_Sequence_List = []
        UniProtSeqLUT = {}

        # Execute query for unique seqs without fetching the results yet
        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)

        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for row in batch:
                Sequence = row[0]
                UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
                UniProtSeqLUT[(Sequence,GENE_NAME)  ] = []
                UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
                UniProtSeqLUT[ Sequence               ] = []

        # Execute query for seqs and metadata without fetching the results yet
        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)

        while batch := uniprot_seq_and_meta.fetchmany(size=50):
            if None == batch:
                  # handle case where no records are returned
                break
            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
                uniprot_Sequence_List.append(Sequence)
                UniProtSeqLUT[Sequence] = Sequence
                UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
                UniProtSeqLUT[(Sequence,GENE_NAME)  ].append(Gene_Name)
                if OS != N_A:
                    Description += ' OS=' + OS
                if OX != N_A:
                    Description += ' OX=' + str(int(OX))
                if Gene_Name != N_A:
                    Description += ' GN=' + Gene_Name
                if PE != N_A:
                    Description += ' PE=' + PE
                if SV != N_A:
                    Description += ' SV=' + SV
                UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        Sequence = ''
        UniProt_ID = ''
        Description = ''
        Gene_Name = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time() #timer
        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)

        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build DephosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur  = conn.cursor()

        # Set up dictionary to aggregate results for phosphopeptides corresponding to dephosphopeptide
        DephosphoPep_UniProtSeq_LUT = {}

        # Set up dictionary to accumulate results
        PhosphoPep_UniProtSeq_LUT = {}

        # Execute query for tuples without fetching the results yet
        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)

        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for (phospho_pep, dephospho_pep, sequence) in batch:
                #do interesting stuff here...
                PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
                PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]  = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]    = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]   = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]     = []
                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)

                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
                    if phospho_pep != phospho_pep:
                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                        PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
                        PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
                    r = list(zip(
                       [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
                       [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
                       [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
                       ))
                    # Sort by `UniProt_ID`
                    #   ref: https://stackoverflow.com/a/4174955/15509512
                    r = sorted(r, key=operator.itemgetter(0))
                    # Get one tuple for each `phospho_pep`
                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
                    for (upid, gn, desc) in r:
                        # Append pseudo-tuple per UniProt_ID but only when it is not present
                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        # wipe local variables
        phospho_pep = dephospho_pep = sequence = 0
        upid = gn = desc = r = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time() #timer
        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)

        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))

        #Read in Upstream tabular file
        # We are discarding the intensity data; so read it as text
        upstream_data = pandas.read_table(
            upstream_map_filename_tab,
            dtype='str',
            index_col = 0
            )

        end_time = time.process_time() #timer
        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer

        upstream_data.index = upstream_map_p_peptide_list


        end_time = time.process_time() #timer
        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer


        #trim upstream_data to include only the upstream map columns
        old_cols = upstream_data.columns.tolist()
        i = 0
        first_intensity = -1
        last_intensity  = -1
        intensity_re = re.compile('Intensity.*')
        for col_name in old_cols:
            m = intensity_re.match(col_name)
            if m:
                last_intensity = i
                if first_intensity == -1:
                    first_intensity = i
            i += 1
        #print('last intensity = %d' % last_intensity)
        col_PKCalpha = last_intensity + 2
        col_firstIntensity = first_intensity

        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]

        if upstream_data.empty:
            print("upstream_data is empty")
            exit(0)

        data_in = upstream_data.copy(deep=True)[data_in_cols]

        # Convert floating-point integers to int64 integers
        #   ref: https://stackoverflow.com/a/68497603/15509512
        data_in[list(data_in.columns[1:])] = data_in[
            list(data_in.columns[1:])].astype('float64').apply(np.int64)

        #create another phosphopeptide column that will be used to join later;
        #  MAY need to change depending on Phosphopeptide column position
        #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index


        end_time = time.process_time() #timer
        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer

        # Produce a dictionary of metadata for a single phosphopeptide.
        #   This is a replacement of `UniProtInfo_subdict` in the original code.
        def pseq_to_subdict(phospho_pep):
            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
            # Strip "p" from phosphopeptide sequence
            dephospho_pep = re_phos.sub('',phospho_pep)

            # Determine number of phosphoresidues in phosphopeptide
            numps = len(phospho_pep) - len(dephospho_pep)

            # Determine location(s) of phosphoresidue(s) in phosphopeptide
            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
            ploc = [] #list of p locations
            i = 0
            p = phospho_pep
            while i < numps:
                ploc.append(p.find("p"))
                p = p[:p.find("p")] + p[p.find("p")+1:]
                i +=1


            # Establish nested dictionary
            result = {}
            result[SEQUENCE] = []
            result[UNIPROT_ID] = []
            result[DESCRIPTION] = []
            result[GENE_NAME] = []
            result[PHOSPHORESIDUE] = []
            result[SEQUENCE7] = []
            result[SEQUENCE10] = []

            # Add stripped sequence to dictionary
            result[SEQUENCE].append(dephospho_pep)

            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]

            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
            ### Caller may elect to:
            ## try:
            ##     ...
            ## except PreconditionError as pe:
            ##     print("'{expression}': {message}".format(
            ##             expression = pe.expression,
            ##             message = pe.message))
            ##             )
            ##         )
            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError( dephospho_pep,
                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
                    )
            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                raise PreconditionError( dephospho_pep,
                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
                    )
            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
                raise PreconditionError( dephospho_pep,
                    "dephosphorylated phosphopeptide does not match " +
                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
                    PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
                    )
            result[SEQUENCE] = [dephospho_pep]
            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
            if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
               raise PreconditionError( dephospho_pep,
                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
                    )
            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
            if len (UniProtSeqList) < 1:
               print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
               # raise PreconditionError(
               #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
               #      'value has zero length'
               #      )
            for UniProtSeq in UniProtSeqList:
                i = 0
                phosphoresidues = []
                seq7s_set = set()
                seq7s = []
                seq10s_set = set()
                seq10s = []
                while i < len(ploc):
                    start = UniProtSeq.find(dephospho_pep)
                    psite = start+ploc[i] #location of phosphoresidue on protein sequence

                    #add Phosphoresidue
                    phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
                    phosphoresidues.append(phosphosite)

                    #Add Sequence7
                    if psite < 7: #phospho_pep at N terminus
                        seq7 = str(UniProtSeq)[:psite+8]
                        if seq7[psite] == "S": #if phosphosresidue is serine
                            pres = "s"
                        elif seq7[psite] == "T": #if phosphosresidue is threonine
                            pres = "t"
                        elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
                            pres = "y"
                        else: # if not pSTY
                            pres = "?"
                        seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
                        while len(seq7) < 15: #add appropriate number of "_" to the front
                            seq7 = "_" + seq7
                    elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
                        seq7 = str(UniProtSeq)[psite-7:]
                        if seq7[7] == "S":
                            pres = "s"
                        elif seq7[7] == "T":
                            pres = "t"
                        elif seq7[7] == "Y":
                            pres = "y"
                        else:
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                        while len(seq7) < 15: #add appropriate number of "_" to the back
                            seq7 = seq7 + "_"
                    else:
                        seq7 = str(UniProtSeq)[psite-7:psite+8]
                        pres = "" #phosphoresidue
                        if seq7[7] == "S": #if phosphosresidue is serine
                            pres = "s"
                        elif seq7[7] == "T": #if phosphosresidue is threonine
                            pres = "t"
                        elif seq7[7] == "Y": #if phosphoresidue is tyrosine
                            pres = "y"
                        else: # if not pSTY
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                    if seq7 not in seq7s_set:
                        seq7s.append(seq7)
                        seq7s_set.add(seq7)

                    #add Sequence10
                    if psite < 10: #phospho_pep at N terminus
                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
                    elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
                        seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
                    else:
                        seq10 = str(UniProtSeq)[psite-10:psite+11]
                        seq10 = seq10[:10] + "p" + seq10[10:]
                    if seq10 not in seq10s_set:
                        seq10s.append(seq10)
                        seq10s_set.add(seq10)

                    i+=1

                result[PHOSPHORESIDUE].append(phosphoresidues)
                result[SEQUENCE7].append(seq7s)
                # result[SEQUENCE10] is a list of lists of strings
                result[SEQUENCE10].append(seq10s)


            r = list(zip(
               result[UNIPROT_ID],
               result[GENE_NAME],
               result[DESCRIPTION],
               result[PHOSPHORESIDUE]
               ))
            # Sort by `UniProt_ID`
            #   ref: https://stackoverflow.com/a/4174955/15509512
            s = sorted(r, key=operator.itemgetter(0))

            result[UNIPROT_ID] = []
            result[GENE_NAME] = []
            result[DESCRIPTION] = []
            result[PHOSPHORESIDUE] = []

            for r in s:
                result[UNIPROT_ID].append(r[0])
                result[GENE_NAME].append(r[1])
                result[DESCRIPTION].append(r[2])
                result[PHOSPHORESIDUE].append(r[3])


            #convert lists to strings in the dictionary
            for key,value in result.items():
                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
                    result[key] = '; '.join(map(str, value))
                elif key in [SEQUENCE10]:
                    # result[SEQUENCE10] is a list of lists of strings
                    joined_value = ''
                    joined_set = set()
                    sep = ''
                    for valL in value:
                        # valL is a list of strings
                        for val in valL:
                            # val is a string
                            if val not in joined_set:
                                joined_set.add(val)
                                #joined_value += sep + '; '.join(map(str, val))
                                joined_value += sep + val
                                sep = '; '
                    # joined_value is a string
                    result[key] = joined_value


            newstring = '; '.join(
                [', '.join(l) for l in result[PHOSPHORESIDUE]]
                )
            ### #separate the isoforms in PHOSPHORESIDUE column with ";"
            ### oldstring = result[PHOSPHORESIDUE]
            ### oldlist = list(oldstring)
            ### newstring = ""
            ### i = 0
            ### for e in oldlist:
            ###     if e == ";":
            ###         if numps > 1:
            ###             if i%numps:
            ###                 newstring = newstring + ";"
            ###             else:
            ###                 newstring = newstring + ","
            ###         else:
            ###             newstring = newstring + ";"
            ###         i +=1
            ###     else:
            ###         newstring = newstring + e
            result[PHOSPHORESIDUE] = newstring


            #separate sequence7's by |
            oldstring = result[SEQUENCE7]
            oldlist = oldstring
            newstring = ""
            for l in oldlist:
              for e in l:
                if e == ";":
                    newstring = newstring + " |"
                elif len(newstring) > 0 and 1 > newstring.count(e):
                    newstring = newstring + " | " + e
                elif 1 > newstring.count(e):
                    newstring = newstring + e
            result[SEQUENCE7] = newstring


            return [phospho_pep, result]

        # Construct list of [string, dictionary] lists
        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
        result_list = [
            catch(pseq_to_subdict,psequence)
            for psequence
            in data_in[PHOSPHOPEPTIDE_MATCH]
            ]


        end_time = time.process_time() #timer
        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer

        # Construct dictionary from list of lists
        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
        UniProt_Info = {
            result[0]:result[1]
            for result
            in result_list
            if result is not None
            }


        end_time = time.process_time() #timer
        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer

        #cosmetic: add N_A to phosphopeptide rows with no hits
        p_peptide_list = []
        for key in UniProt_Info:
            p_peptide_list.append(key)
            for nestedKey in UniProt_Info[key]:
                if UniProt_Info[key][nestedKey] == "":
                    UniProt_Info[key][nestedKey] = N_A

        end_time = time.process_time() #timer
        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer

        #convert UniProt_Info dictionary to dataframe
        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))

        #reorder columns to match expected output file
        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too


        cols = uniprot_df.columns.tolist()
        #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
        #uniprot_df = uniprot_df[cols]
        uniprot_df = uniprot_df[[
            PHOSPHOPEPTIDE,
            SEQUENCE10,
            SEQUENCE7,
            GENE_NAME,
            PHOSPHORESIDUE,
            UNIPROT_ID,
            DESCRIPTION
            ]]


        end_time = time.process_time() #timer
        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer

        #concat to split then groupby to collapse
        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
                            for _, row in uniprot_df.iterrows()]).reset_index()
        seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]

        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
        #read in PhosphoSitePlus Regulatory Sites dataset
        #ACE if (True):
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        #ACE # Array indexes are zero-based
        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(regsites_df)
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
        #ACE else:
        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
        #ACE     #ACE # Array indexes are zero-based
        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE     #ACE RENAME_COLS = [ 'GENE'          , 'PROTEIN'    , 'PROT_TYPE' , 'ACC_ID'          , 'GENE_ID'
        #ACE     #ACE               , 'HU_CHR_LOC'    , 'ORGANISM'   , 'MOD_RSD'   , 'SITE_GRP_ID'     , 'SITE_+/-7_AA'
        #ACE     #ACE               , 'DOMAIN'        , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
        #ACE     #ACE               , 'PMIDs'         , 'LT_LIT'     , 'MS_LIT'    , 'MS_CST'          , 'NOTES'
        #ACE     #ACE               ]
        #ACE     #ACE REGSITE_COL_SITE7AA = 9
        #ACE     #ACE REGSITE_COL_PROTEIN = 1
        #ACE     #ACE REGSITE_COL_DOMAIN = 10
        #ACE     #ACE REGSITE_COL_PMIDs = 15

        # ... -------------- end read PSP_Regulatory_sites ------------------------------------


        #keep only the entries for the requested species (when one was given) in dataframe
        if len(species) > 0:
            print('Limit PhosphoSitesPlus records to species "' + species + '"')
            regsites_df = regsites_df[regsites_df.ORGANISM == species]

        #merge the seq7 df with the regsites df based off of the sequence7
        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
        #ACE print(merge_df.columns.tolist()) #ACE

        #after merging df, select only the columns of interest - note that PROTEIN is absent here
        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
        #ACE print(merge_df.columns.tolist()) #ACE
        #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")

        #remove the columns that were combined
        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]

        #ACE print(merge_df) #ACE
        #ACE print(merge_df.columns.tolist()) #ACE

        end_time = time.process_time() #timer
        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer

        #cosmetic changes to Function Phosphoresidue column
        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])

        end_time = time.process_time() #timer
        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer

        def _tidy_function_text(text):
            """Return one FUNCTION_PHOSPHORESIDUE value without stray "; " separators.

            The value was built by joining five columns with "; " and
            substituting "" for missing pieces, so it may start with, end
            with, or contain runs of separators that delimit nothing.
            A value that is empty after clean-up is returned as N_A.
            """
            #remove the extra ";" so that it looks more professional
            if text == "; ; ; ; ": #remove ; from empty hits
                text = ""
            while text.endswith("; "): #remove ; from the ends
                text = text[:-2]
            while text.startswith("; "): #remove ; from the beginning
                text = text[2:]
            # collapse interior runs of empty separators, longest run first
            for empty_run in ("; ; ; ; ", "; ; ; ", "; ; "):
                text = text.replace(empty_run, "; ")
            #turn blanks into N_A to signify the info was searched for but cannot be found
            return N_A if text == "" else text

        # Apply the clean-up in a single pass over the values.  Unlike an
        #   index-driven loop over fp_series[i], this does not depend on the
        #   row labels being exactly 0..n-1 and avoids per-element assignment.
        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series.map(_tidy_function_text)

        end_time = time.process_time() #timer
        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer

        #merge uniprot df with merge df
        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")

        #collapse the merged df
        uniprot_regsites_collapsed_df = pandas.DataFrame(
            uniprot_regsites_merged_df
            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
            .apply(lambda x: ppep_join(x)))
            #.apply(lambda x: "%s" % ' | '.join(x)))


        end_time = time.process_time() #timer
        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer

        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column


        #rename columns
        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']


        #select columns to be merged to uniprot_df
        #ACE cols = regsites_df.columns.tolist()
        #ACE print(cols) #ACE
        #ACE if len(cols) > 8:
        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE     print(cols) #ACE
        #ACE regsite_merge_df = regsites_df[cols]

        end_time = time.process_time() #timer
        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer

        #add columns based on Sequence7 matching site_+/-7_AA
        uniprot_regsite_df = pandas.merge(
            left=uniprot_df,
            right=uniprot_regsites_collapsed_df,
            how='left',
            left_on=PHOSPHOPEPTIDE,
            right_on='ppp')

        end_time = time.process_time() #timer
        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer

        data_in.rename(
            {'Protein description': PHOSPHOPEPTIDE},
            axis='columns',
            inplace=True
            )


        sort_start_time = time.process_time() #timer

        #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
        data_in = data_in.loc[res2]

        end_time = time.process_time() #timer
        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer


        cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
        upstream_data = upstream_data[cols]

        end_time = time.process_time() #timer
        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer


        #### #rename upstream columns in new list
        #### new_cols = []
        #### for name in cols:
        ####     if "_NetworKIN" in name:
        ####         name = name.split("_")[0]
        ####     if " motif" in name:
        ####         name = name.split(" motif")[0]
        ####     if " sequence " in name:
        ####         name = name.split(" sequence")[0]
        ####     if "_Phosida" in name:
        ####         name = name.split("_")[0]
        ####     if "_PhosphoSite" in name:
        ####         name = name.split("_")[0]
        ####     new_cols.append(name)

        #rename upstream columns in new list
        def col_rename(name):
            """Return `name` shortened according to the upstream-map suffix rules.

            Each rule is a (marker, delimiter) pair; when the marker occurs in
            the current name, only the text before the first delimiter is
            kept.  Rules are applied in order, each to the result of the
            previous one.
            """
            suffix_rules = (
                ("_NetworKIN", "_"),
                (" motif", " motif"),
                (" sequence ", " sequence"),
                ("_Phosida", "_"),
                ("_PhosphoSite", "_"),
            )
            shortened = name
            for marker, delimiter in suffix_rules:
                if marker in shortened:
                    # text ahead of the first delimiter, same as split(...)[0]
                    shortened = shortened.partition(delimiter)[0]
            return shortened

        new_cols = [col_rename(col) for col in cols]
        upstream_data.columns = new_cols


        end_time = time.process_time() #timer
        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer


        # Create upstream_data_cast as a copy of upstream_data
        #   but with first column substituted by the phosphopeptide sequence
        upstream_data_cast = upstream_data.copy()
        # Take a copy of the column-name list: binding the same list object
        #   would make the rename below silently rewrite new_cols[0] as well.
        new_cols_cast = list(new_cols)
        new_cols_cast[0] = 'p_peptide'
        upstream_data_cast.columns = new_cols_cast
        # The first column now carries the row labels of upstream_data
        upstream_data_cast['p_peptide'] = upstream_data.index
        new_cols_cast0 = new_cols_cast[0]

        # --- -------------- begin read upstream_data_melt ------------------------------------
        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        upstream_data_melt = upstream_data_melt_df.copy()
        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
        upstream_data_melt['characterization'] = [
                col_rename(s)
                for s in upstream_data_melt['characterization']
                ]

        print('%0.6f upstream_data_melt_df initially has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0]))
              , file=sys.stderr)
        # ref: https://stackoverflow.com/a/27360130/15509512
        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
        upstream_data_melt.drop(
            upstream_data_melt[upstream_data_melt.X != 'X'].index,
            inplace = True
            )
        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0]))
              , file=sys.stderr)
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(upstream_data_melt)
        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
        # ... -------------- end read upstream_data_melt --------------------------------------

        end_time = time.process_time() #timer
        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer
        # ... end read upstream_data_melt

        upstream_data_melt_index = upstream_data_melt.index
        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']

        end_time = time.process_time() #timer
        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer

        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()

        melt_dict = {}
        for key in upstream_map_p_peptide_list:
            melt_dict[key] = []

        for el in upstream_delta_melt_LoL:
            (p_peptide, characterization, X) = tuple(el)
            if p_peptide in melt_dict:
                melt_dict[p_peptide].append(characterization)
            else:
                exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))


        end_time = time.process_time() #timer
        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer


        # for key in upstream_map_p_peptide_list:
        #     melt_dict[key] = ' | '.join(melt_dict[key])

        for key in upstream_map_p_peptide_list:
            melt_dict[key] = melt_join(melt_dict[key])

        end_time = time.process_time() #timer
        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer

        # map_dict is a dictionary of dictionaries
        map_dict = {}
        for key in upstream_map_p_peptide_list:
            map_dict[key] = {}
            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]


        end_time = time.process_time() #timer
        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer

        #convert map_dict to dataframe
        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
        map_df["p-peptide"] = map_df.index #make index a column too
        cols_map_df = map_df.columns.tolist()
        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
        map_df = map_df[cols_map_df]

        #join map_df to uniprot_regsite_df
        output_df = uniprot_regsite_df.merge(
            map_df,
            how="left",
            left_on=PHOSPHOPEPTIDE,
            right_on="p-peptide")

        output_df = output_df[
            [  PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
                UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
                PUTATIVE_UPSTREAM_DOMAINS
                ]
            ]


        # cols_output_prelim = output_df.columns.tolist()
        #
        # print("cols_output_prelim")
        # print(cols_output_prelim)
        #
        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
        #
        # print("cols_output with p-peptide")
        # print(cols_output)
        #
        # cols_output = [col for col in cols_output if not col == "p-peptide"]
        #
        # print("cols_output")
        # print(cols_output)
        #
        # output_df = output_df[cols_output]

        #join output_df back to quantitative columns in data_in df
        quant_cols = data_in.columns.tolist()
        quant_cols = quant_cols[1:]
        quant_data = data_in[quant_cols]

        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
        # Open SwissProt SQLite database
        conn = sql.connect(output_sqlite)
        cur  = conn.cursor()

        cur.executescript(MRGFLTR_DDL)

        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
            )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP)
            )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
            )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
            )

        # Read ppep-to-sequence LUT
        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
        #ACE ppep_lut_df.info(verbose=True)
        # write only metadata for merged/filtered records to SQLite
        mrgfltr_metadata_df = output_df.copy()
        # replace phosphopeptide seq with ppep.id
        mrgfltr_metadata_df = ppep_lut_df.merge(
            mrgfltr_metadata_df,
            left_on='ppep_seq',
            right_on=PHOSPHOPEPTIDE,
            how='inner'
            )
        mrgfltr_metadata_df.drop(
            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
            inplace=True
            )
        #rename columns
        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
        #ACE mrgfltr_metadata_df.info(verbose=True)
        mrgfltr_metadata_df.to_sql(
            'mrgfltr_metadata',
            con=conn,
            if_exists='append',
            index=False,
            method='multi'
            )

        # Close SwissProt SQLite database
        conn.close()
        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------

        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
        output_cols = output_df.columns.tolist()
        output_cols = output_cols[:-1]
        output_df = output_df[output_cols]

        #cosmetic changes to Upstream column
        #   fill the NaN with "" for those Phosphopeptides that got a
        #   "WARNING: Failed match for " in the upstream mapping
        us_series = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("")
        #turn blanks into N_A to signify the info was searched for but cannot be found
        #   (one vectorized pass; unlike indexing us_series[i] in a loop it
        #   does not depend on the row labels being exactly 0..n-1)
        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series.mask(us_series == "", N_A)

        end_time = time.process_time() #timer
        print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer

        (output_rows, output_cols) = output_df.shape

        #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
        output_df = output_df.convert_dtypes(convert_integer=True)


        #Output onto Final CSV file
        output_df.to_csv(output_filename_csv, index=False)
        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)

        end_time = time.process_time() #timer
        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer

        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))

        end_time = time.process_time() #timer
        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer


        #Rev. 7/1/2016
        #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
        #Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
        #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
        #                read from SwissProt SQLite database
        #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper

        #############################################
        # copied from Excel Output Script.ipynb END #
        #############################################

    try:
        catch(mqpep_getswissprot,)
        exit(0)
    except Exception as e:
        exit('Internal error running mqpep_getswissprot(): %s' % (e))

# Script entry point: call __main__() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    __main__()
author	eschen42
date	Mon, 07 Mar 2022 19:26:06 +0000
parents	c1403d18c189
children	d4d531006735