changeset 0:492f98d89e26 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
author jjohnson
date Tue, 10 Apr 2018 09:57:49 -0400
parents
children fbf2aea7d456
files mzsqlite_psm_align.py mzsqlite_psm_align.xml profmt.py profmt.pyc
diffstat 4 files changed, 1405 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mzsqlite_psm_align.py	Tue Apr 10 09:57:49 2018 -0400
@@ -0,0 +1,935 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2017, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+from __future__ import print_function
+
+import argparse
+import re
+import sys
+import sqlite3 as sqlite
+from time import time
+
+
+from Bio.Seq import reverse_complement, translate
+
+## from bedutil import bed_from_line
+
+## import digest
+
+## from ensembl_rest import get_cdna
+
+import pysam
+from twobitreader import TwoBitFile
+
+from  profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry
+
+"""
+inputs 
+  proBed 
+  mzIdentML 
+  twobit
+  bam
+
+inputs 
+  mz.sqlite
+  genomic.mapping 
+  bam
+
+CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT);
+CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER);
+CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT);
+CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER);
+
+SELECT 
+FROM spectrum_identification_result_items siri
+ JOIN peptide_evidence pe ON siri.peptide_ref = pe.peptide_ref
+ JOIN db_sequence dbs ON pe.dBSequence_ref = 
+WHERE pe.isDecoy = 'false'
+
+SELECT 
+  psm.spectrumID,
+  psm.spectrumTitle as "QNAME",
+  psm.id,
+  psm.sequence,
+  psm.passThreshold,
+  psm."PeptideShaker PSM confidence",
+  psm."PeptideShaker PSM score",
+  pe.start,
+  pe.end,
+  pe.pre,
+  pe.post,
+  pe.dBSequence_ref
+FROM psm_entries psm 
+JOIN peptide_evidence pe ON psm.id = pe.peptide_ref
+JOIN db_sequence dbs ON pe.dBSequence_ref = dbs.accession
+WHERE pe.isDecoy = 'false'
+AND pe.peptide_ref = 'SFYPEEVSSMVITK_15.99491461956-ATAA-10'
+ORDER BY psm.spectrumID
+  
+
+proBed to SQLite
+or index proBed
+
+for psm in psms:
+  beds = get_bed(protein_acc)
+  cds = ''
+  for bed in beds:
+    bed.seq = twobit[bed.chrom][bed.start,bed.end]
+    cds += bed.get_cds()
+  refprot = translate(cds)
+
+def read_bed(path):
+    pdict = dict()
+    prog = re.compile('^([^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t.*)$')
+    with open(path,'r') as bed:
+        for i,line in enumerate(bed):
+            m = prog.match(line)
+            prot = m.groups()[1]
+            pdict[prot] = m.groups()[0]
+    return pdict
+  
+from pyteomics import mzid
+    with mzid.reader(args.mzid) as mzidrdr:
+        for psm in mzidrdr:
+            SpectrumIdentificationItems = psm['SpectrumIdentificationItem']
+            for SpectrumIdentificationItem in SpectrumIdentificationItems:
+                PeptideEvidenceRef = SpectrumIdentificationItem['PeptideEvidenceRef']
+                PepEvs = [r['peptideEvidence_ref'] for r in PeptideEvidenceRef]
+                for PepEv in PepEvs:
+                    PepRef = mzidrdr[PepEv]
+                    dBSequence_ref = PepRef['dBSequence_ref']
+
+spectrum_peptides =  count(distinct sequence) FROM psm_entries WHERE 
+
+1 QNAME String Query template NAME Spectrum name *                                psm.spectrumTitle
+2 FLAG Int Bitwise FLAG Bitwise FLAG                                              map.strand
+3 RNAME String Reference sequence NAME Reference sequence NAME *                  map.chrom
+4 POS Int 1-based leftmost mapping POSition 1-based leftmost mapping POSition     map.start
+5 -MAPQ Int MAPping Quality (Phred-scaled) - 255
+6 CIGAR String Extended CIGAR string (operations: MIDN) CIGAR string *            map.cigar
+7 -RNEXT String Mate Reference NAME ('=' if same as RNAME) - *
+8 -PNEXT Int 1-Based leftmost Mate POSition - 0
+9 TLEN Int observed Template LENgth - 0                                           
+10 SEQ String segment SEQuence Coding sequence *                                  genomic.seq
+11 -QUAL String Query QUALity (ASCII-33=Phred base quality) - *                   
+
+1 QNAME    psm.spectrumTitle
+2 FLAG     map.strand
+3 RNAME    map.chrom
+4 POS      map.start
+5 -MAPQ 
+6 CIGAR    map.cigar
+7 -RNEXT 
+8 -PNEXT 
+9 -TLEN                
+10 SEQ     genomic.seq
+11 -QUAL              
+                                                                                  
+'NH' : 'i' genomic_locations 
+'XO' : 'Z' 
+'XL' : 'i' spectrum_peptides 
+'XP' : 'Z' psm.sequence 
+'YP' : 'Z' peptide_evidence.dBSequence_ref
+'XF' : 'Z' reading_frame
+'XI' : 'f' 
+'XB' : 'Z' 
+'XR' : 'Z' 
+'YB' : 'Z' 
+'YA' : 'Z' 
+'XS' : 'f' 
+'XQ' : 'f' 
+'XC' : 'i' 
+'XA' : 'i' 
+'XM' : 'Z' 
+'XN' : 'i' 
+'XT' : 'i' 
+'XE' : 'i' 
+'XG' : 'A' 
+'XU' : 'Z' 
+
+'NH' : 'i', #number of genomic locations to which the peptide sequence maps
+'XO' : 'Z', #uniqueness of the peptide mapping
+'XL' : 'i', #number of peptides to which the spectrum maps
+'XP' : 'Z', #peptide sequence
+'YP' : 'Z', #Protein accession ID from the original search result
+'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
+'XI' : 'f', #Peptide intensity
+'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+'XR' : 'Z', #reference peptide sequence
+'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
+'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
+'XS' : 'f', #PSM score
+'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
+'XC' : 'i', #peptide charge
+'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
+'XM' : 'Z', #Modifications
+'XN' : 'i', #Number of missed cleavages in the peptide (XP)
+'XT' : 'i', #Enzyme specificity
+'XE' : 'i', #Enzyme used in the experiment
+'XG' : 'A', #Peptide type
+'XU' : 'Z', #URI
+
+
+Datatype Field name Description Origin
+RNAME	string          chrom                     map.chrom      Reference sequence chromosome 
+POS	uint            chromStart                map            Start position of the first DNA base 
+	uint            chromEnd                  map            End position of the last DNA base 
+QNAME	string          name                      spectrum.title     Unique name 
+	uint            score                           Score 
+	char[1]         strand                          + or - for strand 
+	uint            thickStart                      Coding region start 
+	uint            thickEnd                        Coding region end 
+	uint            reserved                        Always 0 
+	int             blockCount                      Number of blocks 
+	int[blockCount] blockSizes                      Block sizes 
+	int[blockCount] chromStarts                     Block starts 
+YP	string          proteinAccession                Protein accession number 
+XP	string          peptideSequence                 Peptide sequence 
+XO	string          uniqueness                      Peptide uniqueness 
+	string          genomeReferenceVersion          Genome reference version number 
+XS	double          psmScore                        PSM score 
+XQ	double          fdr                             Estimated global false discovery rate 
+XM	string          modifications                   Post-translational modifications 
+XC	int             charge                          Charge value 
+XB	double          expMassToCharge                 Experimental mass to charge value 
+XB	double          calcMassToCharge                Calculated mass to charge value 
+int             psmRank                         Peptide-Spectrum Match rank. 
+string          datasetID                       Dataset Identifier 
+string          uri                             Uniform Resource Identifier 
+
+XG
+    N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
+    V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
+    W  Indel peptide. An insertion or deletion is present as compared to the reference.
+    J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
+    A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
+    M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
+    C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
+    E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
+    B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
+    O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
+    T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
+    R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
+    I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
+    G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
+    D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
+    U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
+    X  Unknown.
+
+
+
+SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - cds_offset - cds_start END as "pos"
+FROM feature_cds_map
+WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end
+
+sqlite> select * from feature_cds_map WHERE name = 'pre_STRG.28813.4_j_5350_5470';
+pre_STRG.28813.4_j_5350_5470|chr7|5074750|5074857|+|0|107
+pre_STRG.28813.4_j_5350_5470|chr7|5075140|5075153|+|107|120
+
+
+
+SELECT 
+  pe.isDecoy,
+  pe.dBSequence_ref,
+  pe.start,
+  pe.end,
+  sr.spectrumTitle,
+  si.rank,
+  si.chargeState,
+  si.calculatedMassToCharge,
+  si.experimentalMassToCharge
+FROM spectrum_identification_results sr 
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+WHERE si.id = 'SII_7389_1' 
+ORDER BY si.rank;
+
+SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge
+FROM spectrum_identification_results sr 
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+WHERE si.id = 'SII_7389_1' 
+ORDER BY si.rank;
+
+
+
+CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT);
+CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER);
+CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT);
+CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER);
+
+{'write_probed': 0.08575654029846191, 'PSM_QUERY': 4.704349040985107, 'get_cds': 0.21015286445617676, 'SPECTRUM_PEPTIDES_QUERY': 32.92655086517334, 'PEPTIDE_ACC_QUERY': 425.11919951438904, 'get_mapping': 1.5911591053009033, 'GENOMIC_POS_QUERY': 10.909647226333618}
+"""
+
+
+
+def regex_match(expr, item):
+    return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+    return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+    return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=True):
+    conn = sqlite.connect(sqlitedb_path)
+    if addfunctions:
+        conn.create_function("re_match", 2, regex_match)
+        conn.create_function("re_search", 2, regex_search)
+        conn.create_function("re_sub", 3, regex_sub)
+    return conn
+
+PSM_QUERY = """\
+SELECT 
+  pe.dBSequence_ref,
+  pe.start,
+  pe.end,
+  pe.pre,
+  pe.post,
+  pep.sequence,
+  sr.id,
+  sr.spectrumTitle,
+  si.rank,
+  si.chargeState,
+  si.calculatedMassToCharge,
+  si.experimentalMassToCharge,
+  si.peptide_ref
+FROM spectrum_identification_results sr 
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+ORDER BY sr.spectrumTitle,si.rank
+"""
+
+PEP_MODS_QUERY = """\
+SELECT location, residue, name, modType, '' as "unimod"
+FROM peptide_modifications
+WHERE peptide_ref = :peptide_ref
+ORDER BY location, modType, name
+"""
+
+#number of peptides to which the spectrum maps
+## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence 
+SPECTRUM_PEPTIDES_QUERY = """\
+SELECT count(distinct pep.sequence)
+FROM spectrum_identification_results sr
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+AND sr.id = :sr_id
+GROUP BY sr.id
+"""
+#number of genomic locations to which the peptide sequence maps
+#uniqueness of the peptide mapping
+## peptides => peptide_evidence -> db_sequence -> location
+## proteins_by_peptide
+PEPTIDE_ACC_QUERY = """\
+SELECT 
+  pe.dBSequence_ref,
+  pe.start,
+  pe.end
+FROM peptide_evidence pe
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+AND pep.sequence = :sequence
+"""
+
+MAP_QUERY = """\
+SELECT distinct * 
+FROM feature_cds_map 
+WHERE name = :acc 
+AND :p_start < cds_end 
+AND :p_end >= cds_start 
+ORDER BY name,cds_start,cds_end
+"""
+
+GENOMIC_POS_QUERY = """\
+SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos"
+FROM feature_cds_map
+WHERE name = :acc 
+AND :cds_offset >= cds_start 
+AND :cds_offset < cds_end
+"""
+
+FEATURE_CONTAIN_QUERY = """\
+SELECT id,seqid,start,end,featuretype,strand,frame
+FROM features
+WHERE seqid = :seqid AND start <= :start AND end >= :end 
+AND strand = :strand AND featuretype = :ftype
+"""
+
+FEATURE_OVERLAP_QUERY = """\
+SELECT id,seqid,start,end,featuretype,strand,frame
+FROM features
+WHERE seqid = :seqid 
+AND :end >= start AND :start <= end 
+AND strand = :strand AND featuretype = :ftype
+"""
+
+FEATURE_ANY_QUERY = """\
+SELECT id,seqid,start,end,featuretype,strand,CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame"
+FROM features
+WHERE seqid = :seqid 
+AND :end >= start AND :start <= end 
+AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript')
+ORDER BY strand == :strand DESC, featuretype, 
+start <= :start AND end >= :end DESC,
+in_frame DESC, end - start, start DESC, end
+"""
+
+def __main__():
+    parser = argparse.ArgumentParser(
+        description='Generate proBED and proBAM from mz.sqlite')
+    parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
+    parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table")
+    parser.add_argument(
+        '-R', '--genomeReference', default='Unknown',
+        help='Genome reference sequence in 2bit format')
+    parser.add_argument(
+        '-t', '--twobit', default=None,
+        help='Genome reference sequence in 2bit format')
+    parser.add_argument(
+        '-r', '--reads_bam', default=None,
+        help='reads alignment bam path')
+    parser.add_argument(
+        '-g', '--gffutils_file', default=None,
+        help='gffutils GTF sqlite DB')
+    parser.add_argument(
+        '-B', '--probed', default=None,
+        help='proBed path')
+    parser.add_argument(
+        '-s', '--prosam', default=None,
+        help='proSAM path')
+    parser.add_argument(
+        '-b', '--probam', default=None,
+        help='proBAM path')
+    parser.add_argument(
+        '-l', '--limit', type=int, default=None,
+        help='limit numbers of PSMs for testing')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
+    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
+    args = parser.parse_args()
+
+    def get_sequence(chrom, start, end):
+        if twobit:
+            if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
+                return twobit[chrom][start:end]
+            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
+            if contig in twobit and 0 <= start < end < len(twobit[contig]):
+                return twobit[contig][start:end]
+            return ''
+        return None
+
+    twobit = TwoBitFile(args.twobit) if args.twobit else None
+    samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None
+    seqlens = twobit.sequence_sizes()
+
+    probed = open(args.probed,'w') if args.probed else sys.stdout
+    
+    gff_cursor = get_connection(args.gffutils_file).cursor() if args.gffutils_file else None
+    map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
+    mz_cursor = get_connection(args.mzsqlite_file).cursor()
+
+    unmapped_accs = set()
+    timings = dict()
+    def add_time(name,elapsed):
+        if name in timings:
+            timings[name] += elapsed
+        else:
+            timings[name] = elapsed
+
+    XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*']
+    FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript']
+    def get_peptide_type(exons):
+        ## XG classify peptide
+        ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
+        ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
+        ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
+        ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
+        ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
+        ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
+        ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
+        ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
+        ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
+        ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
+        ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
+        ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
+        ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
+        ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
+        ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
+        ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
+        ##     X  Unknown.
+
+        peptide_type = '*'
+        if gff_cursor:
+            ts = time()
+            etypes = ['*'] * len(exons)
+            efeatures = [None] * len(exons)
+            if args.debug:
+                print('exons:%d\t%s'% (len(exons),etypes),file=sys.stderr)
+            for i,exon in enumerate(exons):
+                (acc,gc,gs,ge,st,cs,ce) = exon
+                fr = cs % 3
+                if args.debug:
+                    print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr)
+                ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs, "end" : ge, 'strand' : st, 'frame' : fr, 'ftype' : 'CDS'}
+                features = [f for f in gff_cursor.execute(FEATURE_ANY_QUERY,ft_params)]
+                efeatures[i] = features
+            for i,exon in enumerate(exons):
+                (acc,gc,gs,ge,st,cs,ce) = exon
+                for f in efeatures[i]:
+                    (id,seqid,start,end,featuretype,strand,frame,in_frame) = f
+                    if args.debug:
+                        print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id,seqid,start,end,featuretype,strand,frame,in_frame),file=sys.stderr)
+                    if strand == st:
+                        if start <= gs and ge <= end:
+                            if in_frame:
+                               etypes[i] = 'N'
+                               break
+                            elif XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
+                               etypes[i] = 'O'
+                        break
+                    else:
+                        if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
+                            etypes[i] = 'O'
+                peptide_type = etypes[i]
+            te = time()   
+            add_time('pep_type',te - ts)
+        return peptide_type
+    def classify_exon(exon,exons,features):
+            ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
+            # 1 exon, contained, in_frame
+            # 2+ exons, contained, in_frame, on_exon_boundary
+            ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
+            # 1 exon, contained, in_frame, AA_mismatch
+            # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch
+            ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
+            # 1 exon, contained, in_frame, AA_mismatch
+            # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch
+            ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
+            # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
+            ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
+            # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
+            ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
+            ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
+            # 1 exon overlaps but not contained 
+            ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
+            ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
+            # exon overlaps a three_prime_utr
+            ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
+            # exon contained but not in_frame
+            ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
+            ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
+            ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
+            # exon contained in transcript, not not overlapping any exon
+            ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
+            # exonis from different seqs, strand, or transcripts
+            ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
+            ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
+            ##     X  Unknown.
+        return '*'
+
+    def get_variant_cds(exons,ref_prot,peptide,pep_cds):
+        if ref_prot != peptide and samfile:
+            try:
+                if args.debug:
+                    print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
+                ts = time()
+                for exon in exons:
+                   (acc,chrom,start,end,strand,c_start,c_end) = exon
+                   a_start = c_start / 3 * 3
+                   a_end = c_end / 3 * 3
+                   if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
+                       pileup = get_exon_pileup(chrom,start,end)
+                       for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]):
+                           if ao == 0 or i == 0:
+                               if ref_prot[ai] != peptide[ai]:
+                                   codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
+                                   if args.debug:
+                                       print('%d %d %d   %s :  %s %s %s' % (bi,ai,ao,  peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
+                                   if codon:
+                                       pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
+                te = time()   
+                add_time('var_cds',te - ts)
+            except Exception as e:
+                print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
+        return pep_cds
+
+    def get_mapping(acc,pep_start,pep_end):
+        ts = time()
+        p_start = (pep_start - 1) * 3
+        p_end = pep_end * 3
+        map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end}
+        if args.debug:
+            print('%s' % map_params, file=sys.stderr)
+        locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)]
+        exons = []
+        ##       =========	pep
+        ##  ---			continue
+        ##      ---		trim
+        ##          ---		copy
+        ##              ---	trim
+        ##                 ---  break
+        c_end = 0
+        for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs):
+            if args.debug:
+                print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
+            c_start = c_end
+            if cds_end < p_start:
+                continue
+            if cds_start >= p_end:
+                break
+            if strand == '+':
+                if cds_start < p_start:
+                    start += p_start - cds_start
+                if cds_end > p_end:
+                    end -= cds_end - p_end
+            else:
+                if cds_start < p_start:
+                    end -= p_start - cds_start
+                if cds_end > p_end:
+                    start += cds_end - p_end
+            c_end = c_start + abs(end - start)
+            if args.debug:
+                print('Pep:  %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
+            exons.append([acc,chrom,start,end,strand,c_start,c_end])
+        te = time()   
+        add_time('get_mapping',te - ts)
+        return exons 
+
+    def get_cds(exons):
+        ts = time()
+        seqs = []
+        for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons):
+            seq = get_sequence(chrom, min(start,end), max(start,end))
+            if strand == '-': 
+                seq = reverse_complement(seq)
+            seqs.append(seq)
+        te = time()   
+        add_time('get_cds',te - ts)
+        if args.debug:
+            print('CDS:  %s' % str(seqs),file=sys.stderr)
+        return ''.join(seqs) if seqs else ''
+
+    def genomic_mapping_count(peptide):
+        ts = time()
+        params = {"sequence" : peptide}
+        acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)]
+        te = time()   
+        add_time('PEPTIDE_ACC_QUERY',te - ts)
+        if acc_locs:
+            if len(acc_locs)  == 1:
+                return 1
+            locations = set()
+            for i,acc_loc in enumerate(acc_locs):
+                (acc,pep_start,pep_end) = acc_loc            
+                if acc in unmapped_accs:
+                    continue
+                try:
+                    add_time('GENOMIC_POS_QUERY_COUNT',1)
+                    ts = time()
+                    p_start = pep_start * 3
+                    p_end = pep_end * 3
+                    params = {"acc" : acc, "cds_offset" : p_start}
+                    (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
+                    params = {"acc" : acc, "cds_offset" : p_end}
+                    (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
+                    locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos))
+                    te = time()   
+                    add_time('GENOMIC_POS_QUERY',te - ts)
+                except:
+                    unmapped_accs.add(acc)
+                    print('Unmapped: %s' % acc, file=sys.stderr)
+            return len(locations)
+        return -1
+        
+    def spectrum_peptide_count(spectrum_id):
+        ts = time()
+        params = {"sr_id" : spectrum_id}
+        pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0]
+        te = time()   
+        add_time('SPECTRUM_PEPTIDES_QUERY',te - ts)
+        return pep_count
+
+    def get_exon_pileup(chrom,chromStart,chromEnd):
+        cols = []
+        for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd):
+            if chromStart <= pileupcolumn.reference_pos <= chromEnd:
+                bases = dict()
+                col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases}
+                for pileupread in pileupcolumn.pileups:
+                    if not pileupread.is_del and not pileupread.is_refskip:
+                        col['depth'] += 1
+                        base = pileupread.alignment.query_sequence[pileupread.query_position]
+                        if base not in bases:
+                            bases[base] = 1
+                        else:
+                            bases[base] += 1
+                cols.append(col)
+        return cols
+
+    codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L",
+                 "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S",
+                 "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*",
+                 "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W",
+                 "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
+                 "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
+                 "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
+                 "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R",
+                 "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M",
+                 "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
+                 "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K",
+                 "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R",
+                 "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
+                 "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
+                 "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E",
+                 "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
+
+    aa_codon_map = dict()
+    for c,a in codon_map.items():
+        aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c]
+
+    aa_na_map =  dict()   # m[aa]{bo : {b1 : [b3]
+    for c,a in codon_map.items():
+        if a not in aa_na_map:
+            aa_na_map[a] = dict()
+        d = aa_na_map[a]
+        for i in range(3):
+            b = c[i]
+            if i < 2:
+                if b not in d:
+                    d[b] = dict() if i < 1 else set()
+                d = d[b]
+            else:
+                d.add(b)
+
+    def get_pep_codon(pileup, idx, aa, ao):
+        try:
+            ts = time()
+            bases = []
+            for i in range(3):
+                if i < ao:
+                    bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
+                else:
+                    bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))])
+                print('%s' % bases)
+            for b0 in bases[0]:
+                if b0 not in aa_na_map[aa]:
+                    continue
+                for b1 in bases[1]:
+                    if b1 not in aa_na_map[aa][b0]:
+                        continue
+                    for b2 in bases[2]:
+                        if b2 in aa_na_map[aa][b0][b1]:
+                            return '%s%s%s' % (b0,b1,b2)
+            te = time()   
+            add_time('pep_codon',te - ts)
+        except Exception as e:
+            print("get_pep_codon: %s %s %s %s"
+                      % (aa, ao, idx, pileup), file=sys.stderr)
+            raise e
+        return None  
+
+    def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts,
+                     spectrum,protacc,peptide,uniqueness,genomeReference,score=1000,
+                     psmScore='.', fdr='.', mods='.', charge='.',
+                     expMassToCharge='.', calcMassToCharge='.',
+                     psmRank='.', datasetID='.', uri='.'):
+        probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
+            (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount,
+             ','.join([str(v) for v in blockSizes]),
+             ','.join([str(v) for v in blockStarts]),
+             protacc,peptide,uniqueness, genomeReference,
+             psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri))
+
+    def get_genomic_location(exons):
+        chrom = exons[0][1]
+        strand = exons[0][4]
+        pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
+        chromStart = min(pos)
+        chromEnd = max(pos)
+        blockCount = len(exons)
+        blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
+        blockStarts = [min(exon[2],exon[3])  - chromStart for exon in exons]
+        return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts)
+
+    def get_psm_modifications(peptide_ref):
+        mods = []
+        ts = time()
+        params = {"peptide_ref" : peptide_ref}
+        pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
+        if pepmods:
+            for (location, residue, name, modType, unimod) in pepmods:
+                mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue)))
+        te = time()
+        add_time('PEP_MODS_QUERY',te - ts)
+        return ';'.join(mods)
+
+
+    """
+    QNAME
+    FLAG
+    RNAME
+    POS
+    CIGAR
+    SEQ
+    'NH' : 'i', #number of genomic locations to which the peptide sequence maps
+    'XO' : 'Z', #uniqueness of the peptide mapping
+    'XL' : 'i', #number of peptides to which the spectrum maps
+    'XP' : 'Z', #peptide sequence
+    'YP' : 'Z', #Protein accession ID from the original search result
+    'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
+    'XI' : 'f', #Peptide intensity
+    'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+    'XR' : 'Z', #reference peptide sequence
+    'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
+    'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
+    'XS' : 'f', #PSM score
+    'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
+    'XC' : 'i', #peptide charge
+    'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
+    'XM' : 'Z', #Modifications
+    'XN' : 'i', #Number of missed cleavages in the peptide (XP)
+    'XT' : 'i', #Enzyme specificity
+    'XE' : 'i', #Enzyme used in the experiment
+    'XG' : 'A', #Peptide type
+    'XU' : 'Z', #URI
+    """
+    psm_cursor = get_connection(args.mzsqlite_file).cursor()
+    ts = time()
+    psms = psm_cursor.execute(PSM_QUERY)
+    te = time()   
+    add_time('PSM_QUERY',te - ts)
+    proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[])
+    proBED = ProBED(species=None,assembly=args.genomeReference,comments=[])
+    for i, psm in enumerate(psms):
+        probam_dict = PROBAM_DEFAULTS.copy()
+        (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm
+        scan_name = spectrum_title if spectrum_title else spectrum_id
+        if args.debug:
+            print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr)
+        exons = get_mapping(acc,pep_start,pep_end)
+        if args.debug:
+            print('%s' % exons, file=sys.stderr)
+        if not exons:
+            continue
+        mods = get_psm_modifications(pepref)
+        (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons)
+        ref_cds = get_cds(exons)
+        if args.debug:
+            print('%s' % ref_cds, file=sys.stderr)
+        ref_prot = translate(ref_cds)
+        if args.debug:
+            print('%s' % ref_prot, file=sys.stderr)
+            print('%s' % peptide, file=sys.stderr)
+        spectrum_peptides = spectrum_peptide_count(spectrum_id)
+        peptide_locations = genomic_mapping_count(peptide)
+        if args.debug:
+            print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr)
+        uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
+        ts = time()
+        proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd,
+                                  '%s_%s' % (acc,scan_name),
+                                  1000,strand,
+                                  blockCount,blockSizes,blockStarts,
+                                  acc,peptide,uniqueness,args.genomeReference,
+                                  charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass,
+                                  mods=mods if mods else '.', psmRank=rank)
+        proBED.add_entry(proBEDEntry)
+        te = time()   
+        add_time('add_probed',te - ts)
+        if len(ref_prot) != len(peptide):
+            continue
+        ts = time()
+        probam_dict['NH'] = peptide_locations
+        probam_dict['XO'] = uniqueness 
+        probam_dict['XL'] = peptide_locations 
+        probam_dict['XP'] = peptide
+        probam_dict['YP'] = acc
+        probam_dict['XC'] = charge
+        probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
+        probam_dict['XR'] = ref_prot  # ? dbSequence
+        probam_dict['YA'] = aa_post
+        probam_dict['YB'] = aa_pre
+        probam_dict['XM'] = mods if mods else '*'
+        flag = 16 if strand == '-' else 0
+        if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="":
+            flag += 256
+        probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
+        ## check for variation from ref_cds
+        pep_cds = get_variant_cds(exons,ref_prot,peptide,ref_cds)
+        peptide_type = '*'
+        ## XG classify peptide
+        probam_dict['XG'] = get_peptide_type(exons)
+        ## probam_dict['MD'] = peptide
+
+        ## FIX  SAM sequence is forward strand
+        seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
+        ## cigar based on plus strand
+        cigar = ''
+        if strand == '+':
+            blkStarts = blockStarts
+            blkSizes = blockSizes
+        else:
+            blkStarts = [x for x in reversed(blockStarts)]
+            blkSizes = [x for x in reversed(blockSizes)]
+        for j in range(blockCount):
+            if j > 0:
+                 intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1])
+                 if intron > 0:
+                     cigar += '%dN' % intron
+            cigar += '%dM' % blkSizes[j]
+        ## Mods TODO
+        proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1,
+                                  cigar=cigar,seq=seq,optional=probam_dict)
+        proBAM.add_entry(proBAMEntry)
+        te = time()   
+        add_time('add_probam',te - ts)
+        
+        if args.debug:
+            print('%s' % probam_dict, file=sys.stderr)
+
+        if args.limit and i >= args.limit: 
+            break
+    if args.probed:
+        ts = time()
+        with open(args.probed,'w') as fh:
+            proBED.write(fh)
+        te = time()   
+        add_time('write_probed',te - ts)
+    if args.prosam or args.probam:
+        samfile = args.prosam if args.prosam else 'temp.sam'
+        ts = time()
+        with open(samfile,'w') as fh:
+            proBAM.write(fh)
+        te = time()   
+        add_time('write_prosam',te - ts)
+        if args.probam:
+            ts = time()
+            bamfile = args.prosam.replace('.sam','.bam')
+            pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
+            te = time()   
+            add_time('write_probam',te - ts)
+            pysam.index(args.probam)
+
+    print('\n%s\n' % str(timings), file=sys.stderr)
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mzsqlite_psm_align.xml	Tue Apr 10 09:57:49 2018 -0400
@@ -0,0 +1,108 @@
+<tool id="mzsqlite_psm_align" name="MzSQLite ProBED ProBAM" version="0.1.0">
+    <description>from mz.sqlite aand genomic mapping</description>
+    <requirements>
+        <requirement type="package">biopython</requirement>
+        <requirement type="package">twobitreader</requirement>
+        <requirement type="package">pysam</requirement>
+        <requirement type="package">gffutils</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/mzsqlite_psm_align.py'  
+            #if $ref.ref_source == 'cached':
+                --twobit='$ref.ref_loc.fields.path'
+            #elif $ref.ref_source == 'history':
+                --twobit='$ref.ref_file'
+            #end if
+            #if $gffutilsdb:
+                --gffutils_file '$gffutilsdb'
+            #end if
+            #if $readlignments:
+                --reads_bam '$readlignments'
+            #end if
+            #if 'probed' in $output_formats:
+               --probed '$probed'
+            #end if
+            #if 'prosam' in $output_formats:
+               --prosam '$prosam'
+            #end if
+            #if 'probam' in $output_formats:
+               --probam '$probam'
+            #end if
+            #if $genomicref:
+               --genomeReference $genomicref
+            #else
+               --genomeReference $genomicdb.metadata.dbkey
+            #end if
+            '$mzsqlitedb' '$genomicdb'
+    ]]></command>
+    <inputs>
+        <param name="mzsqlitedb" type="data" format="mz.sqlite" label="mz.sqlite databse"/>
+        <param name="genomicdb" type="data" format="mz.sqlite" label="mz.sqlite databse"/>
+        <conditional name="ref">
+            <param name="ref_source" type="select" label="Source for Genomic Sequence Data">
+                <option value="cached">Locally cached twobit</option>
+                <option value="history">History dataset twobit</option>
+            </param>
+            <when value="cached">
+                <param name="ref_loc" type="select" label="Select reference 2bit file">
+                    <options from_data_table="twobit" />
+                </param>
+            </when>
+            <when value="history">
+                <param name="ref_file" type="data" format="twobit" label="reference 2bit file" />
+            </when>
+        </conditional>
+        <param name="gffutilsdb" type="data" format="sqlite" label="gffutils sqlite database" optional="true"/>
+        <param name="readlignments" type="data" format="bam" label="read alignments bam" optional="true"/>
+        <param name="genomicref" type="text" value="" label="Genome Reference name" optional="true"/>
+        <param name="output_formats" type="select" display="checkboxes" label="outputs" multiple="true">
+            <option value="probam">pro.bam</option>
+            <option value="prosam">pro.sam</option>
+            <option value="probed">pro.bed</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="prosam" format="pro.sam">
+            <filter>'prosam' in output_formats</filter>
+        </data>
+        <data name="probam" format="pro.bam">
+            <filter>'probam' in output_formats</filter>
+        </data>
+        <data name="probed" format="pro.bed">
+            <filter>'probed' in output_formats</filter>
+        </data>
+    </outputs>
+    <help><![CDATA[
+
+Generates proBAM or proBED feature alignment files for peptides identified from a mass spectrometry protein search analysis.
+
+The tool mz_to_sqlite generates the a SQLite database for a mzIdentML file, 
+along with the fasta search database and the spectrum files used in the search.
+
+The genomic mapping sqlite database has this schema:
+
+    CREATE TABLE feature_cds_map (	/* One row for each exon in the search protein */
+        name TEXT, 		/* Accession name of search protein in mzIdentML */
+        chrom TEXT, 		/* Reference genome chromosome for this exon */
+        start INTEGER, 		/* genomic start of the exon (zero-based like BED) */
+        end INTEGER, 		/* genomic end of the exon (non-incluse like BED) */
+        strand TEXT, 		/* genomic strand: '+' or '-' */
+        cds_start INTEGER, 	/* The CDS coding start for this exon (zero-based) */
+        cds_end INTEGER		/* The CDS coding start end this exon (non-inclusive) */
+    );
+
+Example:
+    sqlite> select * from feature_cds_map WHERE name like 'ENSMUSP00000000001%';
+    ENSMUSP00000000001      chr3    108145887       108146005       -       0       118
+    ENSMUSP00000000001      chr3    108123794       108123837       -       118     161
+    ENSMUSP00000000001      chr3    108123541       108123683       -       161     303
+    ENSMUSP00000000001      chr3    108118300       108118458       -       303     461
+    ENSMUSP00000000001      chr3    108115762       108115891       -       461     590
+    ENSMUSP00000000001      chr3    108112472       108112602       -       590     720
+    ENSMUSP00000000001      chr3    108111934       108112088       -       720     874
+    ENSMUSP00000000001      chr3    108109421       108109612       -       874     1065
+
+This schema can describe structural variants as well as canonical transcripts.
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/profmt.py	Tue Apr 10 09:57:49 2018 -0400
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2016, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+import sys,re
+from operator import itemgetter, attrgetter
+from twobitreader import TwoBitFile
+
+"""
+1	QNAME	string	spectrum name	*
+2	FLAG	int	bitwise FLAG (see further)	*
+3	RNAME	string	reference sequence name	*
+4	POS	int	1-based lefmost mapping position	0
+5	MAPQ	int	unused in proBAM	255
+6	CIGAR	string	extended cigar string (see further)	*
+7	RNEXT	string	unused in proBAM	*
+8	PNEXT	int	unused in proBAM	0
+9	TLEN	int	unused in proBAM	0
+10	SEQ	string	coding sequence	*
+11	QUAL	string	unused in proBAM	*
+"""
+"""
+bit	description	FLAG
+0x00	peptide maps to the forward strand	0
+0x10	peptide maps to the reverse strand	16
+0x100	peptide is not the rank=1 peptide for the spectrum	256
+0x400	decoy peptide	1024
+0x4	unmapped peptide	4
+"""
+
+"""
+tag	type	description
+---     ----    -----------
+NH	int	number of genomic locations to which the peptide sequence maps
+XL	int	number of peptides to which the spectrum maps
+XP	string	peptide sequence
+XR	string	reference peptide sequence
+XS	float	PSM score
+XQ	float	PSM q-value PSM FDR (i.e. q-value or 1-PEP).
+XC	int	peptide charge
+XA	int	Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
+XM	string	Modification(s): semicolon seperated list of position,modName
+XN	int	number of missed cleavages
+XT	int	tryptic state: 0:non-tryptic 1:semi-tryptic 2:tryptic
+XG	string	peptide type: N:normal peptide V:variant peptide J:novel junction peptide D:decoy peptide U:unmappped
+XB	string	Semicolon-separated list of mass in the following format: massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+XE	int	
+XF	string	Reading frame of the peptide (0, 1, 2) (See section 4.4.6).
+XG	char	Peptide type (see Table 6 and Figure 1)
+XI	float	Peptide intensity
+XO	string	This field indicates the uniqueness of the peptide mapping
+XU	string	
+
+
+NH         i     NH:i:1
+XO         Z     XO:Z:unique
+XL         i     XL:i:1
+XP         Z     XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR 
+YP         Z     YP:Z:ENSP00000362463_rs4746:E111A
+XF         Z     XF:Z:1,1
+XI         f     XI:f:*
+XB         f     XB:f:0.70082940064
+XR         Z     XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR 
+YB         Z     YB:Z:RK
+YA         Z     YA:Z:GF
+XS         f     XS:f:73.1426
+XQ         f     XQ:f:0
+XC         i     XC:i:3
+XA         i     XA:i:0
+XM         Z     XM:Z:*
+XN         i     XN:i:0
+XT         i     XT:i:2
+XE         i     XE:i:1
+XG         Z     XG:Z:V
+XU         Z     klc_070108x_PH_P7_COLO_205_D13.pepXML
+
+NH:i:1
+XO:Z:unique
+XL:i:1
+XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR 
+YP:Z:ENSP00000362463_rs4746:E111A
+XF:Z:1,1
+XI:f:*
+XB:f:0.70082940064
+XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR 
+YB:Z:RK
+YA:Z:GF
+XS:f:73.1426
+XQ:f:0
+XC:i:3
+XA:i:0
+XM:Z:*
+XN:i:0
+XT:i:2
+XE:i:1
+XG:Z:V
+XU:Z:klc_070108x_PH_P7_COLO_205_D13.pepXML
+
+
+NH:i:*
+XO:Z:unique
+XL:i:1
+XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR 
+YP:Z:ENSP00000362463_rs4746:E111A
+XF:Z:1,1
+XI:f:*
+XB:f:0.70082940064
+XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR 
+YB:Z:RK
+YA:Z:GF
+XS:f:73.1426
+XQ:f:0
+XC:i:3
+XA:i:0
+XM:Z:*
+XN:i:0
+XT:i:2
+XE:i:1
+XG:Z:V
+XU:Z:klc_070108x_PH_P7_COLO_205_D13.pepXML
+"""
+
+
+PROBAM_TAGS = ['NH', 'XO', 'XL', 'XP', 'YP', 'XF', 'XI', 'XB', 'XR', 'YB', 'YA', 'XS', 'XQ', 'XC', 'XA', 'XM', 'XN', 'XT', 'XE', 'XG', 'XU']
+
+
+PROBAM_TYTPES = {
+    'NH' : 'i', #number of genomic locations to which the peptide sequence maps
+    'XO' : 'Z', #uniqueness of the peptide mapping
+    'XL' : 'i', #number of peptides to which the spectrum maps
+    'XP' : 'Z', #peptide sequence
+    'YP' : 'Z', #Protein accession ID from the original search result
+    'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
+    'XI' : 'f', #Peptide intensity
+    'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+    'XR' : 'Z', #reference peptide sequence
+    'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
+    'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
+    'XS' : 'f', #PSM score
+    'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
+    'XC' : 'i', #peptide charge
+    'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
+    'XM' : 'Z', #Modifications
+    'XN' : 'i', #Number of missed cleavages in the peptide (XP)
+    'XT' : 'i', #Enzyme specificity
+    'XE' : 'i', #Enzyme used in the experiment
+    'XG' : 'A', #Peptide type
+    'XU' : 'Z', #URI
+}
+
+
+PROBAM_DEFAULTS = {
+    'NH' : -1, #number of genomic locations to which the peptide sequence maps
+    'XO' : '*', #uniqueness of the peptide mapping
+    'XL' : -1, #number of peptides to which the spectrum maps
+    'XP' : '*', #peptide sequence
+    'YP' : '*', #Protein accession ID from the original search result
+    'XF' : '*', #Reading frame of the peptide (0, 1, 2)
+    'XI' : -1, #Peptide intensity
+    'XB' : '*', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+    'XR' : '*', #reference peptide sequence
+    'YB' : '*', #Preceding amino acids (2 AA, B stands for before).
+    'YA' : '*', #Following amino acids (2 AA, A stands for after).
+    'XS' : -1, #PSM score
+    'XQ' : -1, #PSM FDR (i.e. q-value or 1-PEP).
+    'XC' : -1, #peptide charge
+    'XA' : -1, #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
+    'XM' : '*', #Modifications
+    'XN' : -1, #Number of missed cleavages in the peptide (XP)
+    'XT' : -1, #Enzyme specificity
+    'XE' : -1, #Enzyme used in the experiment
+    'XG' : '*', #Peptide type
+    'XU' : '*', #URI
+}
+
+def cmp_alphanumeric(s1,s2):
+  if s1 == s2:
+    return 0
+  a1 = re.findall("\d+|[a-zA-Z]+",s1)
+  a2 = re.findall("\d+|[a-zA-Z]+",s2)
+  for i in range(min(len(a1),len(a2))):
+    if a1[i] == a2[i]:
+      continue
+    if a1[i].isdigit() and a2[i].isdigit():
+      return int(a1[i]) - int(a2[i])
+    return 1 if a1[i] >  a2[i] else -1
+  return len(a1) - len(a2)
+
+
+def sort_chrom_names(names):
+    rnames = sorted(names,cmp=cmp_alphanumeric)
+    if 'chrM' in rnames:
+        rnames.remove('chrM')
+        rnames.insert(0,'chrM')
+    if 'MT' in rnames:
+        rnames.remove('MT')
+        rnames.append('MT')
+    return rnames
+
+def as_int_list(obj):
+    if obj is None:
+        return None
+    if isinstance(obj, list):
+        return [int(x) for x in obj]
+    elif isinstance(obj, str):
+        return [int(x) for x in obj.split(',')]
+    else:  # python2 unicode?
+        return [int(x) for x in str(obj).split(',')]
+
+
+class ProBEDEntry (object): 
+    def __init__(self, chrom, chromStart, chromEnd, name, score, strand, 
+                 blockCount, blockSizes, blockStarts, 
+                 protacc, peptide, uniqueness, genomeReference,
+                 psmScore='.',  fdr='.',  mods='.',  charge='.', 
+                 expMassToCharge='.',  calcMassToCharge='.', 
+                 psmRank='.',  datasetID='.',  uri='.'):
+        self.chrom = chrom
+        self.chromStart = int(chromStart)
+        self.chromEnd = int(chromEnd)
+        self.name = name
+        self.score = int(score) if score is not None else 0
+        self.strand = '-' if str(strand).startswith('-') else '+'
+        self.thickStart = self.chromStart
+        self.thickEnd = self.chromEnd
+        self.itemRgb = '0'
+        self.blockCount = int(blockCount)
+        self.blockSizes = as_int_list(blockSizes)
+        self.blockStarts = as_int_list(blockStarts)
+        self.protacc = protacc
+        self.peptide = peptide
+        self.uniqueness = uniqueness
+        self.genomeReference = genomeReference
+        self.psmScore = psmScore
+        self.fdr = fdr
+        self.mods = mods
+        self.charge = charge
+        self.expMassToCharge = expMassToCharge
+        self.calcMassToCharge = calcMassToCharge
+        self.psmRank = psmRank
+        self.datasetID = datasetID
+        self.uri = uri
+
+    def __str__(self):
+        return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
+            (self.chrom, self.chromStart, self.chromEnd,
+             self.name, self.score, self.strand,
+             self.thickStart, self.thickEnd, self.itemRgb,
+             self.blockCount, self.blockSizes, self.blockStarts,
+             self.protacc, self.peptide, self.uniqueness,
+             self.genomeReference,
+             self.psmScore, self.fdr, self.mods,
+             self.charge, self.expMassToCharge, self.calcMassToCharge,
+             self.psmRank, self.datasetID, self.uri)
+
+
+class ProBED ( object ): 
+    def __init__(self,species=None,assembly=None,comments=[]):
+        self.species = species
+        self.assembly = assembly
+        self.comments = comments
+        self.entries = dict()
+    
+    def add_entry(self,entry):
+        if not entry.chrom in self.entries:
+            self.entries[entry.chrom] = []
+        self.entries[entry.chrom].append(entry)
+
+    def write(self,fh):
+        rnames = sort_chrom_names(self.entries.keys())
+        for sn in rnames:
+            if sn not in self.entries:
+                continue
+            ##for pbe in sorted(self.entries[sn], key=lambda probam_entry: probam_entry.pos):
+            for pbe in sorted(self.entries[sn], key=attrgetter('chromStart','chromEnd')):
+                fh.write('%s\n' % str(pbe))
+
+
+class ProBAMEntry (object): 
+    def __init__(self, qname='', flag=0, rname='', pos=0, mapq=255, cigar='', rnext='*', pnext='0', tlen='0', seq='*', qual='*', optional=PROBAM_DEFAULTS):
+        self.qname = qname
+        self.flag = flag
+        self.rname = rname
+        self.pos = pos
+        self.mapq = mapq 
+        self.cigar = cigar
+        self.rnext = rnext
+        self.pnext = pnext
+        self.tlen = tlen
+        self.seq = seq
+        self.qual = qual 
+        self.optional = optional
+    def __str__(self):
+        opt_cols = '\t%s' % '\t'.join(['%s:%s:%s' % (t,PROBAM_TYTPES[t],self.optional[t]) for t in PROBAM_TAGS]) if self.optional else ''
+        return '%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
+            self.qname,self.flag,self.rname,self.pos,self.mapq,self.cigar,
+            str(self.rnext) if self.rnext else '',
+            str(self.pnext) if self.pnext else '',
+            str(self.tlen) if self.tlen else '',
+            self.seq,
+            self.qual, opt_cols)
+    def add_optional(self,tag,value):
+        self.optional[tag] = value
+
+    
+class ProBAM ( object ): 
+    def __init__(self,species=None,assembly=None,seqlens={},comments=[]):
+        self.species = species
+        self.assembly = assembly
+        self.seqlens = seqlens    
+        self.comments = comments
+        self.entries = dict()
+        self.opt_columns = set()
+        self.rg = []
+    
+    def add_entry(self,pb_entry):
+        if not pb_entry.rname in self.entries:
+            self.entries[pb_entry.rname] = []
+        self.entries[pb_entry.rname].append(pb_entry)
+        if pb_entry.optional:
+            self.opt_columns | set(pb_entry.optional.keys())
+
+    def add_entry_from_bed(self,bed_entry,optional=dict()):
+        if bed_entry.pep:
+            optional['XP:Z'] = bed_entry.pep    
+        qname=bed_entry.name
+        flag = 0 if bed_entry.strand == '+' else 16
+        rname = bed_entry.chrom
+        pos = bed_entry.chromStart + 1
+        cigar = bed_entry.get_cigar()
+        seq = bed_entry.get_spliced_seq(strand='+') if bed_entry.seq else '*'
+        pb_entry = ProBAMEntry(qname=qname, flag=flag, rname=rname, pos=pos,cigar=cigar,seq=seq,optional=optional)
+        self.add_entry(pb_entry)
+        ## print >> sys.stderr,('add_entry_from_bed:%s\n  %s\n  %s' % (self.entries.keys(),bed_entry,pb_entry))
+
+    def write(self,fh):
+        fh.write('@HD	VN:1.0	SO:coordinate\n')
+        rnames = sort_chrom_names(self.seqlens.keys())
+        for sn in rnames:
+            fh.write('@SQ\tSN:%s\tLN:%d\n' % (sn,self.seqlens[sn]))
+        for rg in self.rg:
+            fh.write('@RG\tID:%s\n' % (rg))
+        fh.write('@PG\tID:SampleSpecificGenerator\n')
+        for comment in self.comments:
+            fh.write('@CO\t%s\n' % comment)
+        for sn in rnames:
+            if sn not in self.entries:
+                continue
+            ##for pbe in sorted(self.entries[sn], key=lambda probam_entry: probam_entry.pos):
+            for pbe in sorted(self.entries[sn], key=attrgetter('pos')):
+                fh.write('%s\n' % str(pbe))
+
Binary file profmt.pyc has changed