# HG changeset patch
# User galaxyp
# Date 1523916053 14400
# Node ID f2dc9805107ac7015716c882b0cab9260b39f988
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mzsqlite_psm_align commit 464e05be1084ed9a65b542c8eabb18147d425666
diff -r 000000000000 -r f2dc9805107a mzsqlite_psm_align.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mzsqlite_psm_align.py Mon Apr 16 18:00:53 2018 -0400
@@ -0,0 +1,753 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+# University of Minnesota
+# Copyright 2017, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+# James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+from __future__ import print_function
+
+import argparse
+import re
+import sys
+import sqlite3 as sqlite
+from time import time
+
+
+from Bio.Seq import reverse_complement, translate
+
+
+import pysam
+from twobitreader import TwoBitFile
+
+from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry
+
+
+def regex_match(expr, item):
+ return re.match(expr, item) is not None
+
+
+def regex_search(expr, item):
+ return re.search(expr, item) is not None
+
+
+def regex_sub(expr, replace, item):
+ return re.sub(expr, replace, item)
+
+
+def get_connection(sqlitedb_path, addfunctions=True):
+ conn = sqlite.connect(sqlitedb_path)
+ if addfunctions:
+ conn.create_function("re_match", 2, regex_match)
+ conn.create_function("re_search", 2, regex_search)
+ conn.create_function("re_sub", 3, regex_sub)
+ return conn
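+
+## Illustrative sketch only (table and column names below are assumptions, not
+## part of this tool): the UDFs registered above become callable from SQL, e.g.
+##   conn = get_connection('mz.sqlite')
+##   rows = conn.execute(
+##       "SELECT accession FROM db_sequence WHERE re_match('^ENSMUSP', accession)"
+##   ).fetchall()
+## so re_match/re_search/re_sub are available to every query on that connection.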
+
+
+## Peptide Spectrum Matches (PSMs)
+PSM_QUERY = """\
+SELECT
+ pe.dBSequence_ref,
+ pe.start,
+ pe.end,
+ pe.pre,
+ pe.post,
+ pep.sequence,
+ sr.id,
+ sr.spectrumTitle,
+ si.rank,
+ si.chargeState,
+ si.calculatedMassToCharge,
+ si.experimentalMassToCharge,
+ si.peptide_ref
+FROM spectrum_identification_results sr
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+ORDER BY sr.spectrumTitle,si.rank
+"""
+
+
+## Peptide Post Translational Modifications
+PEP_MODS_QUERY = """\
+SELECT location, residue, name, modType, '' as "unimod"
+FROM peptide_modifications
+WHERE peptide_ref = :peptide_ref
+ORDER BY location, modType, name
+"""
+
+
+## number of peptides to which the spectrum maps
+## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence
+SPECTRUM_PEPTIDES_QUERY = """\
+SELECT count(distinct pep.sequence)
+FROM spectrum_identification_results sr
+JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
+JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+AND sr.id = :sr_id
+GROUP BY sr.id
+"""
+
+
+## number of genomic locations to which the peptide sequence maps
+## uniqueness of the peptide mapping
+## peptides => peptide_evidence -> db_sequence -> location
+## proteins_by_peptide
+PEPTIDE_ACC_QUERY = """\
+SELECT
+ pe.dBSequence_ref,
+ pe.start,
+ pe.end
+FROM peptide_evidence pe
+JOIN peptides pep ON pe.peptide_ref = pep.id
+WHERE pe.isDecoy = 'false'
+AND pep.sequence = :sequence
+"""
+
+
+MAP_QUERY = """\
+SELECT distinct *
+FROM feature_cds_map
+WHERE name = :acc
+AND :p_start < cds_end
+AND :p_end >= cds_start
+ORDER BY name,cds_start,cds_end
+"""
+
+
+GENOMIC_POS_QUERY = """\
+SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos"
+FROM feature_cds_map
+WHERE name = :acc
+AND :cds_offset >= cds_start
+AND :cds_offset < cds_end
+"""
+
+
+FEATURE_QUERY = """\
+SELECT distinct seqid,start,end,featuretype,strand,
+CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame",
+CASE WHEN :strand = '+' THEN start - :start ELSE end - :end END as "start_pos",
+CASE WHEN :strand = '+' THEN end - :end ELSE start - :start END as "end_pos",
+parent as "name"
+FROM features JOIN relations ON features.id = relations.child
+WHERE seqid = :seqid
+AND parent in (
+SELECT id
+FROM features
+WHERE seqid = :seqid
+AND featuretype = 'transcript'
+AND start <= :tstart AND :tend <= end
+)
+AND :end >= start AND :start <= end
+AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript')
+ORDER BY strand = :strand DESC, featuretype
+"""
+
+## Use order by to get best matches
+## one_exon strand, featuretype, contained, inframe,
+## first_exon strand, featuretype, contained, end_pos,
+## middle_exon strand, featuretype, contained, start_pos, end_pos,
+## last_exon strand, featuretype, contained, start_pos,
+
+ONLY_EXON_QUERY = FEATURE_QUERY + """,\
+start <= :start AND end >= :end DESC,
+in_frame DESC, end - start DESC, start DESC, end
+"""
+
+FIRST_EXON_QUERY = FEATURE_QUERY + """,\
+start <= :start AND end >= :end DESC,
+in_frame DESC, abs(end_pos), start DESC, end
+"""
+
+MIDDLE_EXON_QUERY = FEATURE_QUERY + """,\
+start <= :start AND end >= :end DESC,
+in_frame DESC, abs(start_pos), abs(end_pos), start DESC, end
+"""
+
+LAST_EXON_QUERY = FEATURE_QUERY + """,\
+start <= :start AND end >= :end DESC,
+in_frame DESC, abs(start_pos), start DESC, end
+"""
+
+
+def __main__():
+ parser = argparse.ArgumentParser(
+ description='Generate proBED and proBAM from mz.sqlite')
+ parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
+ parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table")
+ parser.add_argument(
+ '-R', '--genomeReference', default='Unknown',
+ help='Genome reference sequence in 2bit format')
+ parser.add_argument(
+ '-t', '--twobit', default=None,
+ help='Genome reference sequence in 2bit format')
+ parser.add_argument(
+ '-r', '--reads_bam', default=None,
+ help='reads alignment bam path')
+ parser.add_argument(
+ '-g', '--gffutils_sqlite', default=None,
+ help='gffutils GTF sqlite DB')
+ parser.add_argument(
+ '-B', '--probed', default=None,
+ help='proBed path')
+ parser.add_argument(
+ '-s', '--prosam', default=None,
+ help='proSAM path')
+ parser.add_argument(
+ '-b', '--probam', default=None,
+ help='proBAM path')
+ parser.add_argument(
+ '-l', '--limit', type=int, default=None,
+ help='limit numbers of PSMs for testing')
+ parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
+ parser.add_argument('-d', '--debug', action='store_true', help='Debug')
+ args = parser.parse_args()
+
+ def get_sequence(chrom, start, end):
+ if twobit:
+ if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
+ return twobit[chrom][start:end]
+ contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
+ if contig in twobit and 0 <= start < end < len(twobit[contig]):
+ return twobit[contig][start:end]
+ return ''
+ return None
+
+ twobit = TwoBitFile(args.twobit) if args.twobit else None
+ samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None
+ seqlens = twobit.sequence_sizes() if twobit else {}
+
+ probed = open(args.probed,'w') if args.probed else sys.stdout
+
+ gff_cursor = get_connection(args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None
+ map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
+ mz_cursor = get_connection(args.mzsqlite).cursor()
+
+ unmapped_accs = set()
+ timings = dict()
+ def add_time(name,elapsed):
+ if name in timings:
+ timings[name] += elapsed
+ else:
+ timings[name] = elapsed
+
+ XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*']
+ FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript']
+ def get_exon_features(exons):
+ efeatures = None
+ transcript_features = dict()
+ t_start = min(exons[0][2],exons[-1][2])
+ t_end = max(exons[0][3],exons[-1][3])
+ if gff_cursor:
+ ts = time()
+ (acc,gc,gs,ge,st,cs,ce) = exons[0]
+ ft_params = {"seqid" : str(gc).replace('chr',''), 'strand' : st, 'tstart': t_start, 'tend': t_end}
+ efeatures = [None] * len(exons)
+ for i,exon in enumerate(exons):
+ (acc,gc,gs,ge,st,cs,ce) = exon
+ fr = cs % 3
+ if args.debug:
+ print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr)
+ ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs + 1, "end" : ge, 'strand' : st, 'frame' : fr, 'tstart': t_start, 'tend': t_end}
+ if len(exons) == 1:
+ q = ONLY_EXON_QUERY
+ elif i == 0:
+ q = FIRST_EXON_QUERY
+ elif len(exons) - i == 1:
+ q = LAST_EXON_QUERY
+ else:
+ q = MIDDLE_EXON_QUERY
+ features = [f for f in gff_cursor.execute(q,ft_params)]
+ transcripts = []
+ efeatures[i] = []
+ for j,f in enumerate(features):
+ transcript = f[-1]
+ ftype = f[3]
+ if ftype == 'transcript':
+ if i > 0 or efeatures[i]:
+ continue
+ if i == 0:
+ if transcript not in transcript_features:
+ transcript_features[transcript] = [[] for _ in range(len(exons))]
+ transcript_features[transcript][i].append(f)
+ efeatures[i].append(f)
+ else:
+ if transcript in transcript_features:
+ transcript_features[transcript][i].append(f)
+ efeatures[i].append(f)
+ else:
+ del transcript_features[transcript]
+ if not efeatures[i]:
+ efeatures[i] = transcripts
+ for f in efeatures[i]:
+ (seqid,start,end,featuretype,strand,frame,in_frame,start_offset,end_offset,parent) = f
+ if args.debug:
+ print('feat%d:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
+ (i,seqid,start,end,featuretype,strand,frame,str(in_frame) == '1',start_offset,end_offset,parent),file=sys.stderr)
+ if args.debug:
+ print('fmap:\t%s' % transcript_features,file=sys.stderr)
+ return (efeatures,transcript_features)
+
+ def is_structural_variant(exons):
+ if len(exons) > 1:
+ (acc,gc,gs,ge,st,cs,ce) = exons[0]
+ for i in range(1,len(exons)):
+ if gc != exons[i][1] or st != exons[i][4]:
+ return True
+ return False
+
+ def classify_transcript(exons,transcript_features,ref_prot,peptide):
+ etypes = ['*'] * len(exons)
+ features = transcript_features[0]
+ (acc,gc,gs,ge,st,cs,ce) = exons[0]
+ if len(features) == 1:
+ (seqid,start,end,featuretype,strand,frame,in_frame,start_offset,end_offset,parent) = features[0]
+ if strand == st:
+ if featuretype == 'CDS':
+ if start <= gs and ge <= end:
+ if in_frame:
+ if ref_prot == peptide:
+ if len(exons) > 1:
+ pass
+ return 'N'
+ elif len(ref_prot) != len(peptide):
+ return 'W'
+ else:
+ return 'V'
+ else:
+ return 'O'
+ elif strand == '+' and start <= gs or ge > end:
+ return 'C'
+ elif strand == '-' and start <= gs and ge <= end:
+ return 'C'
+ elif featuretype == 'five_prime_utr':
+ return 'E'
+ elif featuretype == 'three_prime_utr':
+ return 'B'
+ elif featuretype == 'transcript':
+ return 'I'
+ else:
+ return 'R'
+ elif len(features) > 1:
+ ftypes = [f[3] for f in features]
+ if 'five_prime_utr' in ftypes:
+ return 'E'
+ elif 'three_prime_utr' in ftypes:
+ return 'B'
+ else:
+ return 'C'
+ return '*'
+
+ def classify_peptide(exons,ref_prot,peptide,pep_cds,probam_dict):
+ if ref_prot != peptide and samfile:
+ try:
+ if args.debug:
+ print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
+ ts = time()
+ for exon in exons:
+ (acc,chrom,start,end,strand,c_start,c_end) = exon
+ a_start = c_start // 3 * 3
+ a_end = c_end // 3 * 3
+ if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
+ pileup = get_exon_pileup(chrom,start,end)
+ for i, (bi,ai,ao) in enumerate([(i, i // 3, i % 3) for i in range(c_start, c_end)]):
+ if ao == 0 or i == 0:
+ if ref_prot[ai] != peptide[ai]:
+ codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
+ if args.debug:
+ print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
+ if codon:
+ pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
+ te = time()
+ add_time('var_cds',te - ts)
+ except Exception as e:
+ print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
+ probam_dict['XG'] = '*'
+ if is_structural_variant(exons):
+ probam_dict['XG'] = 'G'
+ else:
+ (efeatures,transcript_features) = get_exon_features(exons)
+ n = len(exons)
+ if efeatures and efeatures[0]:
+ for f in efeatures[0]:
+ transcript = f[9]
+ features = transcript_features[transcript]
+ xg = classify_transcript(exons,features,ref_prot,peptide)
+ if xg != '*':
+ probam_dict['XG'] = xg
+ break
+ return pep_cds
+
+ def get_variant_cds(exons,ref_prot,peptide,pep_cds):
+ if ref_prot != peptide and samfile:
+ try:
+ if args.debug:
+ print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
+ ts = time()
+ for exon in exons:
+ (acc,chrom,start,end,strand,c_start,c_end) = exon
+ a_start = c_start // 3 * 3
+ a_end = c_end // 3 * 3
+ if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
+ pileup = get_exon_pileup(chrom,start,end)
+ for i, (bi,ai,ao) in enumerate([(i, i // 3, i % 3) for i in range(c_start, c_end)]):
+ if ao == 0 or i == 0:
+ if ref_prot[ai] != peptide[ai]:
+ codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
+ if args.debug:
+ print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
+ if codon:
+ pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
+ te = time()
+ add_time('var_cds',te - ts)
+ except Exception as e:
+ print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
+ return pep_cds
+
+ def get_mapping(acc,pep_start,pep_end):
+ ts = time()
+ p_start = (pep_start - 1) * 3
+ p_end = pep_end * 3
+ map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end}
+ if args.debug:
+ print('%s' % map_params, file=sys.stderr)
+ locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)]
+ exons = []
+ ## relation of each mapped CDS block to the peptide span (p_start..p_end):
+ ## block entirely before the peptide -> continue
+ ## block overlapping the peptide start -> trim
+ ## block fully inside the peptide -> copy
+ ## block overlapping the peptide end -> trim
+ ## block entirely after the peptide -> break
+ c_end = 0
+ for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs):
+ if args.debug:
+ print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
+ c_start = c_end
+ if cds_end < p_start:
+ continue
+ if cds_start >= p_end:
+ break
+ if strand == '+':
+ if cds_start < p_start:
+ start += p_start - cds_start
+ if cds_end > p_end:
+ end -= cds_end - p_end
+ else:
+ if cds_start < p_start:
+ end -= p_start - cds_start
+ if cds_end > p_end:
+ start += cds_end - p_end
+ c_end = c_start + abs(end - start)
+ if args.debug:
+ print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
+ exons.append([acc,chrom,start,end,strand,c_start,c_end])
+ te = time()
+ add_time('get_mapping',te - ts)
+ return exons
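+
+ # Worked example (coordinates from the feature_cds_map sample in the tool help,
+ # protein ENSMUSP00000000001, '-' strand): a peptide at protein residues 38-45
+ # gives p_start = 37*3 = 111 and p_end = 45*3 = 135, so it crosses the boundary
+ # between the first two CDS blocks (0-118 and 118-161) and get_mapping() returns
+ # two trimmed exon segments of 7 nt and 17 nt (24 nt = 8 codons in total).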
+
+ def get_cds(exons):
+ ts = time()
+ seqs = []
+ for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons):
+ seq = get_sequence(chrom, min(start,end), max(start,end))
+ if strand == '-':
+ seq = reverse_complement(seq)
+ seqs.append(seq)
+ te = time()
+ add_time('get_cds',te - ts)
+ if args.debug:
+ print('CDS: %s' % str(seqs),file=sys.stderr)
+ return ''.join(seqs) if seqs else ''
+
+ def genomic_mapping_count(peptide):
+ ts = time()
+ params = {"sequence" : peptide}
+ acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)]
+ te = time()
+ add_time('PEPTIDE_ACC_QUERY',te - ts)
+ if acc_locs:
+ if len(acc_locs) == 1:
+ return 1
+ locations = set()
+ for i,acc_loc in enumerate(acc_locs):
+ (acc,pep_start,pep_end) = acc_loc
+ if acc in unmapped_accs:
+ continue
+ try:
+ add_time('GENOMIC_POS_QUERY_COUNT',1)
+ ts = time()
+ p_start = pep_start * 3
+ p_end = pep_end * 3
+ params = {"acc" : acc, "cds_offset" : p_start}
+ (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
+ params = {"acc" : acc, "cds_offset" : p_end}
+ (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
+ locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos))
+ te = time()
+ add_time('GENOMIC_POS_QUERY',te - ts)
+ except:
+ unmapped_accs.add(acc)
+ if args.debug:
+ print('Unmapped: %s' % acc, file=sys.stderr)
+ return len(locations)
+ return -1
+
+ def spectrum_peptide_count(spectrum_id):
+ ts = time()
+ params = {"sr_id" : spectrum_id}
+ pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0]
+ te = time()
+ add_time('SPECTRUM_PEPTIDES_QUERY',te - ts)
+ return pep_count
+
+ def get_exon_pileup(chrom,chromStart,chromEnd):
+ cols = []
+ for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd):
+ if chromStart <= pileupcolumn.reference_pos <= chromEnd:
+ bases = dict()
+ col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases}
+ for pileupread in pileupcolumn.pileups:
+ if not pileupread.is_del and not pileupread.is_refskip:
+ col['depth'] += 1
+ base = pileupread.alignment.query_sequence[pileupread.query_position]
+ if base not in bases:
+ bases[base] = 1
+ else:
+ bases[base] += 1
+ cols.append(col)
+ return cols
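+
+ # Each pileup column returned above is a dict of the form (hypothetical counts):
+ # {'pos': 108145900, 'cov': 14, 'depth': 12, 'bases': {'A': 10, 'G': 2}};
+ # get_pep_codon() below walks the 'bases' counts, most frequent base first, to
+ # pick a codon consistent with the observed amino acid.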
+
+ codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L",
+ "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S",
+ "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*",
+ "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W",
+ "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
+ "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
+ "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
+ "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R",
+ "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M",
+ "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
+ "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K",
+ "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R",
+ "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
+ "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
+ "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E",
+ "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
+
+ aa_codon_map = dict()
+ for c,a in codon_map.items():
+ aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c]
+
+ aa_na_map = dict() # aa_na_map[aa] = {base0: {base1: {base2, ...}}}
+ for c,a in codon_map.items():
+ if a not in aa_na_map:
+ aa_na_map[a] = dict()
+ d = aa_na_map[a]
+ for i in range(3):
+ b = c[i]
+ if i < 2:
+ if b not in d:
+ d[b] = dict() if i < 1 else set()
+ d = d[b]
+ else:
+ d.add(b)
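+
+ # For example, aa_codon_map['M'] == ['ATG'] and aa_na_map['M'] == {'A': {'T': set(['G'])}};
+ # for lysine, aa_na_map['K'] == {'A': {'A': set(['A', 'G'])}} (codons AAA and AAG).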
+
+ def get_pep_codon(pileup, idx, aa, ao):
+ try:
+ ts = time()
+ bases = []
+ for i in range(3):
+ if i < ao:
+ bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
+ else:
+ bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].items(), key=lambda kv: (kv[1], kv[0])))])
+ if args.debug:
+ print('%s' % bases,file=sys.stderr)
+ for b0 in bases[0]:
+ if b0 not in aa_na_map[aa]:
+ continue
+ for b1 in bases[1]:
+ if b1 not in aa_na_map[aa][b0]:
+ continue
+ for b2 in bases[2]:
+ if b2 in aa_na_map[aa][b0][b1]:
+ return '%s%s%s' % (b0,b1,b2)
+ te = time()
+ add_time('pep_codon',te - ts)
+ except Exception as e:
+ print("get_pep_codon: %s %s %s %s"
+ % (aa, ao, idx, pileup), file=sys.stderr)
+ raise e
+ return None
+
+ def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts,
+ spectrum,protacc,peptide,uniqueness,genomeReference,score=1000,
+ psmScore='.', fdr='.', mods='.', charge='.',
+ expMassToCharge='.', calcMassToCharge='.',
+ psmRank='.', datasetID='.', uri='.'):
+ probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
+ (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount,
+ ','.join([str(v) for v in blockSizes]),
+ ','.join([str(v) for v in blockStarts]),
+ protacc,peptide,uniqueness, genomeReference,
+ psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri))
+
+ def get_genomic_location(exons):
+ chrom = exons[0][1]
+ strand = exons[0][4]
+ pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
+ chromStart = min(pos)
+ chromEnd = max(pos)
+ blockCount = len(exons)
+ blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
+ blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons]
+ return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts)
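+
+ # For example (hypothetical coordinates), two exon segments
+ # chr3:108145887-108145894 and chr3:108123820-108123837 on the '-' strand give
+ # chromStart=108123820, chromEnd=108145894, blockCount=2, blockSizes=[7, 17]
+ # and blockStarts=[22067, 0]; blocks stay in exon order here and are reversed
+ # for '-' strand entries when the CIGAR string is built further below.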
+
+ def get_psm_modifications(peptide_ref):
+ mods = []
+ ts = time()
+ params = {"peptide_ref" : peptide_ref}
+ pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
+ if pepmods:
+ for (location, residue, name, modType, unimod) in pepmods:
+ mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue)))
+ te = time()
+ add_time('PEP_MODS_QUERY',te - ts)
+ return ';'.join(mods)
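+
+ # With PEP_MODS_QUERY selecting '' for unimod, each modification renders as
+ # '<location>-<name><residue>' (e.g. '3-OxidationM', hypothetical values) and
+ # multiple modifications are joined with ';'.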
+
+ ## iterate through PSMs
+ psm_cursor = get_connection(args.mzsqlite).cursor()
+ ts = time()
+ psms = psm_cursor.execute(PSM_QUERY)
+ te = time()
+ add_time('PSM_QUERY',te - ts)
+ proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[]) if args.prosam or args.probam else None
+ proBED = ProBED(species=None,assembly=args.genomeReference,comments=[]) if args.probed else None
+ for i, psm in enumerate(psms):
+ probam_dict = PROBAM_DEFAULTS.copy()
+ (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm
+ scan_name = spectrum_title if spectrum_title else spectrum_id
+ if args.debug:
+ print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr)
+ exons = get_mapping(acc,pep_start,pep_end)
+ if args.debug:
+ print('%s' % exons, file=sys.stderr)
+ if not exons:
+ continue
+ mods = get_psm_modifications(pepref)
+ (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons)
+ ref_cds = get_cds(exons)
+ if args.debug:
+ print('%s' % ref_cds, file=sys.stderr)
+ ref_prot = translate(ref_cds)
+ if args.debug:
+ print('%s' % ref_prot, file=sys.stderr)
+ print('%s' % peptide, file=sys.stderr)
+ spectrum_peptides = spectrum_peptide_count(spectrum_id)
+ peptide_locations = genomic_mapping_count(peptide)
+ if args.debug:
+ print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr)
+ uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
+ if proBED is not None:
+ ts = time()
+ proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd,
+ '%s_%s' % (acc,scan_name),
+ 1000,strand,
+ blockCount,blockSizes,blockStarts,
+ acc,peptide,uniqueness,args.genomeReference,
+ charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass,
+ mods=mods if mods else '.', psmRank=rank)
+ proBED.add_entry(proBEDEntry)
+ te = time()
+ add_time('add_probed',te - ts)
+ if proBAM is not None:
+ if len(ref_prot) != len(peptide):
+ pass
+ # continue
+ ts = time()
+ probam_dict['NH'] = peptide_locations
+ probam_dict['XO'] = uniqueness
+ probam_dict['XL'] = peptide_locations
+ probam_dict['XP'] = peptide
+ probam_dict['YP'] = acc
+ probam_dict['XC'] = charge
+ probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
+ probam_dict['XR'] = ref_prot # ? dbSequence
+ probam_dict['YA'] = aa_post
+ probam_dict['YB'] = aa_pre
+ probam_dict['XM'] = mods if mods else '*'
+ flag = 16 if strand == '-' else 0
+ if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="":
+ flag += 256
+ probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
+ ## what if structural variant, need to split into multiple entries
+ ## probam_dict['XG'] = peptide_type
+ pep_cds = classify_peptide(exons,ref_prot,peptide,ref_cds,probam_dict)
+ ## probam_dict['MD'] = peptide
+
+ ## FIX SAM sequence is forward strand
+ seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
+ ## cigar based on plus strand
+ cigar = ''
+ if strand == '+':
+ blkStarts = blockStarts
+ blkSizes = blockSizes
+ else:
+ blkStarts = [x for x in reversed(blockStarts)]
+ blkSizes = [x for x in reversed(blockSizes)]
+ for j in range(blockCount):
+ if j > 0:
+ intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1])
+ if intron > 0:
+ cigar += '%dN' % intron
+ cigar += '%dM' % blkSizes[j]
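+ # e.g. (illustrative numbers) plus-strand blocks with blkStarts=[0, 500] and
+ # blkSizes=[30, 45] give an intron of 500 - 30 = 470 and the CIGAR '30M470N45M'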
+ proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1,
+ cigar=cigar,seq=seq,optional=probam_dict)
+ proBAM.add_entry(proBAMEntry)
+ te = time()
+ add_time('add_probam',te - ts)
+ if args.debug:
+ print('%s' % probam_dict, file=sys.stderr)
+ if args.limit and i >= args.limit:
+ break
+ if args.probed:
+ ts = time()
+ with open(args.probed,'w') as fh:
+ proBED.write(fh)
+ te = time()
+ add_time('write_probed',te - ts)
+ if args.prosam or args.probam:
+ samfile = args.prosam if args.prosam else 'temp.sam'
+ ts = time()
+ with open(samfile,'w') as fh:
+ proBAM.write(fh)
+ te = time()
+ add_time('write_prosam',te - ts)
+ if args.probam:
+ ts = time()
+ pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
+ te = time()
+ add_time('write_probam',te - ts)
+ pysam.index(args.probam)
+
+ if args.verbose:
+ print('\n%s\n' % str(timings), file=sys.stderr)
+
+if __name__ == "__main__":
+ __main__()
diff -r 000000000000 -r f2dc9805107a mzsqlite_psm_align.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mzsqlite_psm_align.xml Mon Apr 16 18:00:53 2018 -0400
@@ -0,0 +1,113 @@
+
+ from mz.sqlite and genomic mapping
+
+ biopython
+ twobitreader
+ pysam
+ gffutils
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 'prosam' in output_formats
+
+
+ 'probam' in output_formats
+
+
+ 'probed' in output_formats
+
+
+ select * from feature_cds_map WHERE name like 'ENSMUSP00000000001%';
+ ENSMUSP00000000001 chr3 108145887 108146005 - 0 118
+ ENSMUSP00000000001 chr3 108123794 108123837 - 118 161
+ ENSMUSP00000000001 chr3 108123541 108123683 - 161 303
+ ENSMUSP00000000001 chr3 108118300 108118458 - 303 461
+ ENSMUSP00000000001 chr3 108115762 108115891 - 461 590
+ ENSMUSP00000000001 chr3 108112472 108112602 - 590 720
+ ENSMUSP00000000001 chr3 108111934 108112088 - 720 874
+ ENSMUSP00000000001 chr3 108109421 108109612 - 874 1065
+
+This schema can describe structural variants as well as canonical transcripts.
+
+ ]]>
+
diff -r 000000000000 -r f2dc9805107a profmt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profmt.py Mon Apr 16 18:00:53 2018 -0400
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+# University of Minnesota
+# Copyright 2018, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+# James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+import re
+import sys
+from functools import cmp_to_key
+from operator import itemgetter, attrgetter
+from twobitreader import TwoBitFile
+
+
+PROBAM_TAGS = ['NH', 'XO', 'XL', 'XP', 'YP', 'XF', 'XI', 'XB', 'XR', 'YB', 'YA', 'XS', 'XQ', 'XC', 'XA', 'XM', 'XN', 'XT', 'XE', 'XG', 'XU']
+
+
+PROBAM_TYPES = {
+ 'NH' : 'i', #number of genomic locations to which the peptide sequence maps
+ 'XO' : 'Z', #uniqueness of the peptide mapping
+ 'XL' : 'i', #number of peptides to which the spectrum maps
+ 'XP' : 'Z', #peptide sequence
+ 'YP' : 'Z', #Protein accession ID from the original search result
+ 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
+ 'XI' : 'f', #Peptide intensity
+ 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+ 'XR' : 'Z', #reference peptide sequence
+ 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
+ 'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
+ 'XS' : 'f', #PSM score
+ 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
+ 'XC' : 'i', #peptide charge
+ 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:partially unknown; 2:totally unknown;
+ 'XM' : 'Z', #Modifications
+ 'XN' : 'i', #Number of missed cleavages in the peptide (XP)
+ 'XT' : 'i', #Enzyme specificity
+ 'XE' : 'i', #Enzyme used in the experiment
+ 'XG' : 'A', #Peptide type
+ 'XU' : 'Z', #URI
+}
+
+
+PROBAM_DEFAULTS = {
+ 'NH' : -1, #number of genomic locations to which the peptide sequence maps
+ 'XO' : '*', #uniqueness of the peptide mapping
+ 'XL' : -1, #number of peptides to which the spectrum maps
+ 'XP' : '*', #peptide sequence
+ 'YP' : '*', #Protein accession ID from the original search result
+ 'XF' : '*', #Reading frame of the peptide (0, 1, 2)
+ 'XI' : -1, #Peptide intensity
+ 'XB' : '*', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
+ 'XR' : '*', #reference peptide sequence
+ 'YB' : '*', #Preceding amino acids (2 AA, B stands for before).
+ 'YA' : '*', #Following amino acids (2 AA, A stands for after).
+ 'XS' : -1, #PSM score
+ 'XQ' : -1, #PSM FDR (i.e. q-value or 1-PEP).
+ 'XC' : -1, #peptide charge
+ 'XA' : -1, #Whether the peptide is annotated 0:yes; 1:partially unknown; 2:totally unknown;
+ 'XM' : '*', #Modifications
+ 'XN' : -1, #Number of missed cleavages in the peptide (XP)
+ 'XT' : -1, #Enzyme specificity
+ 'XE' : -1, #Enzyme used in the experiment
+ 'XG' : '*', #Peptide type
+ 'XU' : '*', #URI
+}
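+
+# ProBAMEntry.__str__() renders every tag above as TAG:TYPE:VALUE using these
+# type codes, so a filled-in entry might carry fields such as (illustrative
+# values): NH:i:1  XO:Z:unique  XP:Z:DLATVYVDVLK  XC:i:2  XG:A:N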
+
+def cmp_alphanumeric(s1,s2):
+ if s1 == s2:
+ return 0
+ a1 = re.findall("\d+|[a-zA-Z]+",s1)
+ a2 = re.findall("\d+|[a-zA-Z]+",s2)
+ for i in range(min(len(a1),len(a2))):
+ if a1[i] == a2[i]:
+ continue
+ if a1[i].isdigit() and a2[i].isdigit():
+ return int(a1[i]) - int(a2[i])
+ return 1 if a1[i] > a2[i] else -1
+ return len(a1) - len(a2)
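+
+# cmp_alphanumeric gives a natural ordering, e.g. 'chr2' sorts before 'chr10'
+# and 'chr10' before 'chrX', so sort_chrom_names() below yields
+# chrM, chr1, chr2, ..., chr10, ..., chrX rather than plain lexicographic order.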
+
+
+def sort_chrom_names(names):
+ rnames = sorted(names, key=cmp_to_key(cmp_alphanumeric))
+ if 'chrM' in rnames:
+ rnames.remove('chrM')
+ rnames.insert(0,'chrM')
+ if 'MT' in rnames:
+ rnames.remove('MT')
+ rnames.append('MT')
+ return rnames
+
+
+def as_int_list(obj):
+ if obj is None:
+ return None
+ if isinstance(obj, list):
+ return [int(x) for x in obj]
+ elif isinstance(obj, str):
+ return [int(x) for x in obj.split(',')]
+ else: # python2 unicode?
+ return [int(x) for x in str(obj).split(',')]
+
+
+class ProBEDEntry (object):
+ def __init__(self, chrom, chromStart, chromEnd, name, score, strand,
+ blockCount, blockSizes, blockStarts,
+ protacc, peptide, uniqueness, genomeReference,
+ psmScore='.', fdr='.', mods='.', charge='.',
+ expMassToCharge='.', calcMassToCharge='.',
+ psmRank='.', datasetID='.', uri='.'):
+ self.chrom = chrom
+ self.chromStart = int(chromStart)
+ self.chromEnd = int(chromEnd)
+ self.name = name
+ self.score = int(score) if score is not None else 0
+ self.strand = '-' if str(strand).startswith('-') else '+'
+ self.thickStart = self.chromStart
+ self.thickEnd = self.chromEnd
+ self.itemRgb = '0'
+ self.blockCount = int(blockCount)
+ self.blockSizes = as_int_list(blockSizes)
+ self.blockStarts = as_int_list(blockStarts)
+ self.protacc = protacc
+ self.peptide = peptide
+ self.uniqueness = uniqueness
+ self.genomeReference = genomeReference
+ self.psmScore = psmScore
+ self.fdr = fdr
+ self.mods = mods
+ self.charge = charge
+ self.expMassToCharge = expMassToCharge
+ self.calcMassToCharge = calcMassToCharge
+ self.psmRank = psmRank
+ self.datasetID = datasetID
+ self.uri = uri
+
+ def __str__(self):
+ return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
+ (self.chrom, self.chromStart, self.chromEnd,
+ self.name, self.score, self.strand,
+ self.thickStart, self.thickEnd, self.itemRgb,
+ self.blockCount,
+ ','.join([str(x) for x in self.blockSizes]),
+ ','.join([str(x) for x in self.blockStarts]),
+ self.protacc, self.peptide, self.uniqueness,
+ self.genomeReference,
+ self.psmScore, self.fdr, self.mods,
+ self.charge, self.expMassToCharge, self.calcMassToCharge,
+ self.psmRank, self.datasetID, self.uri)
+
+
+class ProBED ( object ):
+ def __init__(self,species=None,assembly=None,comments=[]):
+ self.species = species
+ self.assembly = assembly
+ self.comments = comments
+ self.entries = dict()
+
+ def add_entry(self,entry):
+ if not entry.chrom in self.entries:
+ self.entries[entry.chrom] = []
+ self.entries[entry.chrom].append(entry)
+
+ def write(self,fh):
+ rnames = sort_chrom_names(self.entries.keys())
+ for sn in rnames:
+ if sn not in self.entries:
+ continue
+ for pbe in sorted(self.entries[sn], key=attrgetter('chromStart','chromEnd')):
+ fh.write(str(pbe))
+
+
+class ProBAMEntry (object):
+ def __init__(self, qname='', flag=0, rname='', pos=0, mapq=255, cigar='', rnext='*', pnext='0', tlen='0', seq='*', qual='*', optional=PROBAM_DEFAULTS):
+ self.qname = qname
+ self.flag = flag
+ self.rname = rname
+ self.pos = pos
+ self.mapq = mapq
+ self.cigar = cigar
+ self.rnext = rnext
+ self.pnext = pnext
+ self.tlen = tlen
+ self.seq = seq
+ self.qual = qual
+ self.optional = optional
+ def __str__(self):
+ opt_cols = '\t%s' % '\t'.join(['%s:%s:%s' % (t, PROBAM_TYPES[t], self.optional[t]) for t in PROBAM_TAGS]) if self.optional else ''
+ return '%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
+ self.qname,self.flag,self.rname,self.pos,self.mapq,self.cigar,
+ str(self.rnext) if self.rnext else '',
+ str(self.pnext) if self.pnext else '',
+ str(self.tlen) if self.tlen else '',
+ self.seq,
+ self.qual, opt_cols)
+ def add_optional(self,tag,value):
+ self.optional[tag] = value
+
+
+class ProBAM ( object ):
+ def __init__(self,species=None,assembly=None,seqlens={},comments=[]):
+ self.species = species
+ self.assembly = assembly
+ self.seqlens = seqlens
+ self.comments = comments
+ self.entries = dict()
+ self.opt_columns = set()
+ self.rg = []
+
+ def add_entry(self,pb_entry):
+ if not pb_entry.rname in self.entries:
+ self.entries[pb_entry.rname] = []
+ self.entries[pb_entry.rname].append(pb_entry)
+ if pb_entry.optional:
+ self.opt_columns |= set(pb_entry.optional.keys())
+
+ def add_entry_from_bed(self,bed_entry,optional=dict()):
+ if bed_entry.pep:
+ optional['XP:Z'] = bed_entry.pep
+ qname=bed_entry.name
+ flag = 0 if bed_entry.strand == '+' else 16
+ rname = bed_entry.chrom
+ pos = bed_entry.chromStart + 1
+ cigar = bed_entry.get_cigar()
+ seq = bed_entry.get_spliced_seq(strand='+') if bed_entry.seq else '*'
+ pb_entry = ProBAMEntry(qname=qname, flag=flag, rname=rname, pos=pos,cigar=cigar,seq=seq,optional=optional)
+ self.add_entry(pb_entry)
+
+ def write(self,fh):
+ fh.write('@HD VN:1.0 SO:coordinate\n')
+ rnames = sort_chrom_names(self.seqlens.keys())
+ for sn in rnames:
+ fh.write('@SQ\tSN:%s\tLN:%d\n' % (sn,self.seqlens[sn]))
+ for rg in self.rg:
+ fh.write('@RG\tID:%s\n' % (rg))
+ fh.write('@PG\tID:SampleSpecificGenerator\n')
+ for comment in self.comments:
+ fh.write('@CO\t%s\n' % comment)
+ for sn in rnames:
+ if sn not in self.entries:
+ continue
+ for pbe in sorted(self.entries[sn], key=attrgetter('pos')):
+ fh.write('%s\n' % str(pbe))
diff -r 000000000000 -r f2dc9805107a tool-data/twobit.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/twobit.loc.sample Mon Apr 16 18:00:53 2018 -0400
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that is used by some
+#tools. The twobit.loc file has this format (white space characters
+#are TAB characters):
+#
+#<Build>   <FullPathToFile>
+#
+#So, for example, if you had droPer1 twobit files stored in
+#/depot/data2/galaxy/droPer1/, then the twobit.loc entry
+#would look like this:
+#
+#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
+#
+#and your /depot/data2/galaxy/droPer1/ directory would
+#contain all of your twobit files (e.g.):
+#
+#-rw-rw-r-- 1 nate galaxy 48972650 2007-05-04 11:27 droPer1.2bit
+#...etc...
+#
+#Your twobit.loc file should include an entry per line for each twobit
+#file you have stored. For example:
+#
+#droPer1 /depot/data2/galaxy/droPer1/droPer1.2bit
+#apiMel2 /depot/data2/galaxy/apiMel2/apiMel2.2bit
+#droAna1 /depot/data2/galaxy/droAna1/droAna1.2bit
+#droAna2 /depot/data2/galaxy/droAna2/droAna2.2bit
+#...etc...
diff -r 000000000000 -r f2dc9805107a tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Mon Apr 16 18:00:53 2018 -0400
@@ -0,0 +1,6 @@
+
+
+