Mercurial > repos > jjohnson > mzsqlite_psm_align
view mzsqlite_psm_align.py @ 0:492f98d89e26 draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
author | jjohnson |
---|---|
date | Tue, 10 Apr 2018 09:57:49 -0400 |
parents | |
children | af5f22779a8e |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2017, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ from __future__ import print_function import argparse import re import sys import sqlite3 as sqlite from time import time from Bio.Seq import reverse_complement, translate ## from bedutil import bed_from_line ## import digest ## from ensembl_rest import get_cdna import pysam from twobitreader import TwoBitFile from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry """ inputs proBed mzIdentML twobit bam inputs mz.sqlite genomic.mapping bam CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); SELECT FROM spectrum_identification_result_items siri JOIN peptide_evidence pe ON siri.peptide_ref = pe.peptide_ref JOIN db_sequence dbs ON pe.dBSequence_ref = WHERE pe.isDecoy = 'false' SELECT psm.spectrumID, psm.spectrumTitle as "QNAME", psm.id, psm.sequence, psm.passThreshold, psm."PeptideShaker PSM confidence", psm."PeptideShaker PSM score", pe.start, pe.end, pe.pre, pe.post, pe.dBSequence_ref FROM psm_entries psm JOIN peptide_evidence pe ON psm.id = pe.peptide_ref JOIN db_sequence dbs ON pe.dBSequence_ref = dbs.accession WHERE pe.isDecoy = 'false' AND pe.peptide_ref = 'SFYPEEVSSMVITK_15.99491461956-ATAA-10' ORDER BY psm.spectrumID proBed to SQLite or index proBed for psm in psms: beds = get_bed(protein_acc) cds = '' for bed in beds: bed.seq = twobit[bed.chrom][bed.start,bed.end] cds += bed.get_cds() refprot = translate(cds) def read_bed(path): pdict = dict() prog = re.compile('^([^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t.*)$') with open(path,'r') as bed: for i,line in enumerate(bed): m = prog.match(line) prot = m.groups()[1] pdict[prot] = m.groups()[0] return pdict from pyteomics import mzid with mzid.reader(args.mzid) as mzidrdr: for psm in mzidrdr: SpectrumIdentificationItems = psm['SpectrumIdentificationItem'] for SpectrumIdentificationItem in SpectrumIdentificationItems: PeptideEvidenceRef = SpectrumIdentificationItem['PeptideEvidenceRef'] PepEvs = [r['peptideEvidence_ref'] for r in PeptideEvidenceRef] for PepEv in PepEvs: PepRef = mzidrdr[PepEv] dBSequence_ref = PepRef['dBSequence_ref'] spectrum_peptides = count(distinct sequence) FROM psm_entries WHERE 1 QNAME String Query template NAME Spectrum name * psm.spectrumTitle 2 FLAG Int Bitwise FLAG Bitwise FLAG map.strand 3 RNAME String Reference sequence NAME Reference sequence NAME * map.chrom 4 POS Int 1-based leftmost mapping POSition 1-based leftmost mapping POSition map.start 5 -MAPQ Int MAPping Quality (Phred-scaled) - 255 6 CIGAR String Extended CIGAR string (operations: MIDN) CIGAR string * map.cigar 7 -RNEXT String Mate Reference NAME ('=' if same as RNAME) - * 8 -PNEXT Int 1-Based leftmost Mate POSition - 0 9 TLEN Int observed Template LENgth - 0 10 SEQ String segment SEQuence Coding sequence * genomic.seq 11 -QUAL String Query QUALity (ASCII-33=Phred base quality) - * 1 QNAME psm.spectrumTitle 2 FLAG map.strand 3 RNAME map.chrom 4 POS map.start 5 -MAPQ 6 CIGAR map.cigar 7 -RNEXT 8 -PNEXT 9 -TLEN 10 SEQ genomic.seq 11 -QUAL 'NH' : 'i' genomic_locations 'XO' : 'Z' 'XL' : 'i' spectrum_peptides 'XP' : 'Z' psm.sequence 'YP' : 'Z' peptide_evidence.dBSequence_ref 'XF' : 'Z' reading_frame 'XI' : 'f' 'XB' : 'Z' 'XR' : 'Z' 'YB' : 'Z' 'YA' : 'Z' 'XS' : 'f' 'XQ' : 'f' 'XC' : 'i' 'XA' : 'i' 'XM' : 'Z' 'XN' : 'i' 'XT' : 'i' 'XE' : 'i' 'XG' : 'A' 'XU' : 'Z' 'NH' : 'i', #number of genomic locations to which the peptide sequence maps 'XO' : 'Z', #uniqueness of the peptide mapping 'XL' : 'i', #number of peptides to which the spectrum maps 'XP' : 'Z', #peptide sequence 'YP' : 'Z', #Protein accession ID from the original search result 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) 'XI' : 'f', #Peptide intensity 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). 'XR' : 'Z', #reference peptide sequence 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). 'XS' : 'f', #PSM score 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). 'XC' : 'i', #peptide charge 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; 'XM' : 'Z', #Modifications 'XN' : 'i', #Number of missed cleavages in the peptide (XP) 'XT' : 'i', #Enzyme specificity 'XE' : 'i', #Enzyme used in the experiment 'XG' : 'A', #Peptide type 'XU' : 'Z', #URI Datatype Field name Description Origin RNAME string chrom map.chrom Reference sequence chromosome POS uint chromStart map Start position of the first DNA base uint chromEnd map End position of the last DNA base QNAME string name spectrum.title Unique name uint score Score char[1] strand + or - for strand uint thickStart Coding region start uint thickEnd Coding region end uint reserved Always 0 int blockCount Number of blocks int[blockCount] blockSizes Block sizes int[blockCount] chromStarts Block starts YP string proteinAccession Protein accession number XP string peptideSequence Peptide sequence XO string uniqueness Peptide uniqueness string genomeReferenceVersion Genome reference version number XS double psmScore PSM score XQ double fdr Estimated global false discovery rate XM string modifications Post-translational modifications XC int charge Charge value XB double expMassToCharge Experimental mass to charge value XB double calcMassToCharge Calculated mass to charge value int psmRank Peptide-Spectrum Match rank. string datasetID Dataset Identifier string uri Uniform Resource Identifier XG N Normal peptide. The peptide sequence is contained in the reference protein sequence. V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. W Indel peptide. An insertion or deletion is present as compared to the reference. J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. I Intron peptide. A peptide that is located in an intronic region of the reference isoform. G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. U Unmapped peptide. A peptide that could not be mapped to a reference sequence. X Unknown. SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - cds_offset - cds_start END as "pos" FROM feature_cds_map WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end sqlite> select * from feature_cds_map WHERE name = 'pre_STRG.28813.4_j_5350_5470'; pre_STRG.28813.4_j_5350_5470|chr7|5074750|5074857|+|0|107 pre_STRG.28813.4_j_5350_5470|chr7|5075140|5075153|+|107|120 SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge FROM spectrum_identification_results sr JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref WHERE si.id = 'SII_7389_1' ORDER BY si.rank; SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge FROM spectrum_identification_results sr JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref WHERE si.id = 'SII_7389_1' ORDER BY si.rank; CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); {'write_probed': 0.08575654029846191, 'PSM_QUERY': 4.704349040985107, 'get_cds': 0.21015286445617676, 'SPECTRUM_PEPTIDES_QUERY': 32.92655086517334, 'PEPTIDE_ACC_QUERY': 425.11919951438904, 'get_mapping': 1.5911591053009033, 'GENOMIC_POS_QUERY': 10.909647226333618} """ def regex_match(expr, item): return re.match(expr, item) is not None def regex_search(expr, item): return re.search(expr, item) is not None def regex_sub(expr, replace, item): return re.sub(expr, replace, item) def get_connection(sqlitedb_path, addfunctions=True): conn = sqlite.connect(sqlitedb_path) if addfunctions: conn.create_function("re_match", 2, regex_match) conn.create_function("re_search", 2, regex_search) conn.create_function("re_sub", 3, regex_sub) return conn PSM_QUERY = """\ SELECT pe.dBSequence_ref, pe.start, pe.end, pe.pre, pe.post, pep.sequence, sr.id, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge, si.peptide_ref FROM spectrum_identification_results sr JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref JOIN peptides pep ON pe.peptide_ref = pep.id WHERE pe.isDecoy = 'false' ORDER BY sr.spectrumTitle,si.rank """ PEP_MODS_QUERY = """\ SELECT location, residue, name, modType, '' as "unimod" FROM peptide_modifications WHERE peptide_ref = :peptide_ref ORDER BY location, modType, name """ #number of peptides to which the spectrum maps ## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence SPECTRUM_PEPTIDES_QUERY = """\ SELECT count(distinct pep.sequence) FROM spectrum_identification_results sr JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref JOIN peptides pep ON pe.peptide_ref = pep.id WHERE pe.isDecoy = 'false' AND sr.id = :sr_id GROUP BY sr.id """ #number of genomic locations to which the peptide sequence maps #uniqueness of the peptide mapping ## peptides => peptide_evidence -> db_sequence -> location ## proteins_by_peptide PEPTIDE_ACC_QUERY = """\ SELECT pe.dBSequence_ref, pe.start, pe.end FROM peptide_evidence pe JOIN peptides pep ON pe.peptide_ref = pep.id WHERE pe.isDecoy = 'false' AND pep.sequence = :sequence """ MAP_QUERY = """\ SELECT distinct * FROM feature_cds_map WHERE name = :acc AND :p_start < cds_end AND :p_end >= cds_start ORDER BY name,cds_start,cds_end """ GENOMIC_POS_QUERY = """\ SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos" FROM feature_cds_map WHERE name = :acc AND :cds_offset >= cds_start AND :cds_offset < cds_end """ FEATURE_CONTAIN_QUERY = """\ SELECT id,seqid,start,end,featuretype,strand,frame FROM features WHERE seqid = :seqid AND start <= :start AND end >= :end AND strand = :strand AND featuretype = :ftype """ FEATURE_OVERLAP_QUERY = """\ SELECT id,seqid,start,end,featuretype,strand,frame FROM features WHERE seqid = :seqid AND :end >= start AND :start <= end AND strand = :strand AND featuretype = :ftype """ FEATURE_ANY_QUERY = """\ SELECT id,seqid,start,end,featuretype,strand,CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame" FROM features WHERE seqid = :seqid AND :end >= start AND :start <= end AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript') ORDER BY strand == :strand DESC, featuretype, start <= :start AND end >= :end DESC, in_frame DESC, end - start, start DESC, end """ def __main__(): parser = argparse.ArgumentParser( description='Generate proBED and proBAM from mz.sqlite') parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML") parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table") parser.add_argument( '-R', '--genomeReference', default='Unknown', help='Genome reference sequence in 2bit format') parser.add_argument( '-t', '--twobit', default=None, help='Genome reference sequence in 2bit format') parser.add_argument( '-r', '--reads_bam', default=None, help='reads alignment bam path') parser.add_argument( '-g', '--gffutils_file', default=None, help='gffutils GTF sqlite DB') parser.add_argument( '-B', '--probed', default=None, help='proBed path') parser.add_argument( '-s', '--prosam', default=None, help='proSAM path') parser.add_argument( '-b', '--probam', default=None, help='proBAM path') parser.add_argument( '-l', '--limit', type=int, default=None, help='limit numbers of PSMs for testing') parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') parser.add_argument('-d', '--debug', action='store_true', help='Debug') args = parser.parse_args() def get_sequence(chrom, start, end): if twobit: if chrom in twobit and 0 <= start < end < len(twobit[chrom]): return twobit[chrom][start:end] contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom if contig in twobit and 0 <= start < end < len(twobit[contig]): return twobit[contig][start:end] return '' return None twobit = TwoBitFile(args.twobit) if args.twobit else None samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None seqlens = twobit.sequence_sizes() probed = open(args.probed,'w') if args.probed else sys.stdout gff_cursor = get_connection(args.gffutils_file).cursor() if args.gffutils_file else None map_cursor = get_connection(args.genomic_mapping_sqlite).cursor() mz_cursor = get_connection(args.mzsqlite_file).cursor() unmapped_accs = set() timings = dict() def add_time(name,elapsed): if name in timings: timings[name] += elapsed else: timings[name] = elapsed XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*'] FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript'] def get_peptide_type(exons): ## XG classify peptide ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. ## W Indel peptide. An insertion or deletion is present as compared to the reference. ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. peptide_type = '*' if gff_cursor: ts = time() etypes = ['*'] * len(exons) efeatures = [None] * len(exons) if args.debug: print('exons:%d\t%s'% (len(exons),etypes),file=sys.stderr) for i,exon in enumerate(exons): (acc,gc,gs,ge,st,cs,ce) = exon fr = cs % 3 if args.debug: print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr) ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs, "end" : ge, 'strand' : st, 'frame' : fr, 'ftype' : 'CDS'} features = [f for f in gff_cursor.execute(FEATURE_ANY_QUERY,ft_params)] efeatures[i] = features for i,exon in enumerate(exons): (acc,gc,gs,ge,st,cs,ce) = exon for f in efeatures[i]: (id,seqid,start,end,featuretype,strand,frame,in_frame) = f if args.debug: print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id,seqid,start,end,featuretype,strand,frame,in_frame),file=sys.stderr) if strand == st: if start <= gs and ge <= end: if in_frame: etypes[i] = 'N' break elif XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): etypes[i] = 'O' break else: if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): etypes[i] = 'O' peptide_type = etypes[i] te = time() add_time('pep_type',te - ts) return peptide_type def classify_exon(exon,exons,features): ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. # 1 exon, contained, in_frame # 2+ exons, contained, in_frame, on_exon_boundary ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch ## W Indel peptide. An insertion or deletion is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). # 1 exon overlaps but not contained ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. # exon overlaps a three_prime_utr ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. # exon contained but not in_frame ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. # exon contained in transcript, not not overlapping any exon ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. # exonis from different seqs, strand, or transcripts ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. return '*' def get_variant_cds(exons,ref_prot,peptide,pep_cds): if ref_prot != peptide and samfile: try: if args.debug: print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr) ts = time() for exon in exons: (acc,chrom,start,end,strand,c_start,c_end) = exon a_start = c_start / 3 * 3 a_end = c_end / 3 * 3 if ref_prot[a_start:a_end] != peptide[a_start:a_end]: pileup = get_exon_pileup(chrom,start,end) for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]): if ao == 0 or i == 0: if ref_prot[ai] != peptide[ai]: codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao) if args.debug: print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr) if codon: pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:] te = time() add_time('var_cds',te - ts) except Exception as e: print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr) return pep_cds def get_mapping(acc,pep_start,pep_end): ts = time() p_start = (pep_start - 1) * 3 p_end = pep_end * 3 map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end} if args.debug: print('%s' % map_params, file=sys.stderr) locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)] exons = [] ## ========= pep ## --- continue ## --- trim ## --- copy ## --- trim ## --- break c_end = 0 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs): if args.debug: print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) c_start = c_end if cds_end < p_start: continue if cds_start >= p_end: break if strand == '+': if cds_start < p_start: start += p_start - cds_start if cds_end > p_end: end -= cds_end - p_end else: if cds_start < p_start: end -= p_start - cds_start if cds_end > p_end: start += cds_end - p_end c_end = c_start + abs(end - start) if args.debug: print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) exons.append([acc,chrom,start,end,strand,c_start,c_end]) te = time() add_time('get_mapping',te - ts) return exons def get_cds(exons): ts = time() seqs = [] for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons): seq = get_sequence(chrom, min(start,end), max(start,end)) if strand == '-': seq = reverse_complement(seq) seqs.append(seq) te = time() add_time('get_cds',te - ts) if args.debug: print('CDS: %s' % str(seqs),file=sys.stderr) return ''.join(seqs) if seqs else '' def genomic_mapping_count(peptide): ts = time() params = {"sequence" : peptide} acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)] te = time() add_time('PEPTIDE_ACC_QUERY',te - ts) if acc_locs: if len(acc_locs) == 1: return 1 locations = set() for i,acc_loc in enumerate(acc_locs): (acc,pep_start,pep_end) = acc_loc if acc in unmapped_accs: continue try: add_time('GENOMIC_POS_QUERY_COUNT',1) ts = time() p_start = pep_start * 3 p_end = pep_end * 3 params = {"acc" : acc, "cds_offset" : p_start} (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() params = {"acc" : acc, "cds_offset" : p_end} (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos)) te = time() add_time('GENOMIC_POS_QUERY',te - ts) except: unmapped_accs.add(acc) print('Unmapped: %s' % acc, file=sys.stderr) return len(locations) return -1 def spectrum_peptide_count(spectrum_id): ts = time() params = {"sr_id" : spectrum_id} pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0] te = time() add_time('SPECTRUM_PEPTIDES_QUERY',te - ts) return pep_count def get_exon_pileup(chrom,chromStart,chromEnd): cols = [] for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd): if chromStart <= pileupcolumn.reference_pos <= chromEnd: bases = dict() col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases} for pileupread in pileupcolumn.pileups: if not pileupread.is_del and not pileupread.is_refskip: col['depth'] += 1 base = pileupread.alignment.query_sequence[pileupread.query_position] if base not in bases: bases[base] = 1 else: bases[base] += 1 cols.append(col) return cols codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L", "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*", "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W", "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L", "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P", "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M", "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T", "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K", "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R", "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V", "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A", "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E", "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",} aa_codon_map = dict() for c,a in codon_map.items(): aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c] aa_na_map = dict() # m[aa]{bo : {b1 : [b3] for c,a in codon_map.items(): if a not in aa_na_map: aa_na_map[a] = dict() d = aa_na_map[a] for i in range(3): b = c[i] if i < 2: if b not in d: d[b] = dict() if i < 1 else set() d = d[b] else: d.add(b) def get_pep_codon(pileup, idx, aa, ao): try: ts = time() bases = [] for i in range(3): if i < ao: bases.append(list(set([c[i] for c in aa_codon_map[aa]]))) else: bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))]) print('%s' % bases) for b0 in bases[0]: if b0 not in aa_na_map[aa]: continue for b1 in bases[1]: if b1 not in aa_na_map[aa][b0]: continue for b2 in bases[2]: if b2 in aa_na_map[aa][b0][b1]: return '%s%s%s' % (b0,b1,b2) te = time() add_time('pep_codon',te - ts) except Exception as e: print("get_pep_codon: %s %s %s %s" % (aa, ao, idx, pileup), file=sys.stderr) raise e return None def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts, spectrum,protacc,peptide,uniqueness,genomeReference,score=1000, psmScore='.', fdr='.', mods='.', charge='.', expMassToCharge='.', calcMassToCharge='.', psmRank='.', datasetID='.', uri='.'): probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \ (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount, ','.join([str(v) for v in blockSizes]), ','.join([str(v) for v in blockStarts]), protacc,peptide,uniqueness, genomeReference, psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri)) def get_genomic_location(exons): chrom = exons[0][1] strand = exons[0][4] pos = [exon[2] for exon in exons] + [exon[3] for exon in exons] chromStart = min(pos) chromEnd = max(pos) blockCount = len(exons) blockSizes = [abs(exon[3] - exon[2]) for exon in exons] blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons] return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) def get_psm_modifications(peptide_ref): mods = [] ts = time() params = {"peptide_ref" : peptide_ref} pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)] if pepmods: for (location, residue, name, modType, unimod) in pepmods: mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue))) te = time() add_time('PEP_MODS_QUERY',te - ts) return ';'.join(mods) """ QNAME FLAG RNAME POS CIGAR SEQ 'NH' : 'i', #number of genomic locations to which the peptide sequence maps 'XO' : 'Z', #uniqueness of the peptide mapping 'XL' : 'i', #number of peptides to which the spectrum maps 'XP' : 'Z', #peptide sequence 'YP' : 'Z', #Protein accession ID from the original search result 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) 'XI' : 'f', #Peptide intensity 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). 'XR' : 'Z', #reference peptide sequence 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). 'XS' : 'f', #PSM score 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). 'XC' : 'i', #peptide charge 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; 'XM' : 'Z', #Modifications 'XN' : 'i', #Number of missed cleavages in the peptide (XP) 'XT' : 'i', #Enzyme specificity 'XE' : 'i', #Enzyme used in the experiment 'XG' : 'A', #Peptide type 'XU' : 'Z', #URI """ psm_cursor = get_connection(args.mzsqlite_file).cursor() ts = time() psms = psm_cursor.execute(PSM_QUERY) te = time() add_time('PSM_QUERY',te - ts) proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[]) proBED = ProBED(species=None,assembly=args.genomeReference,comments=[]) for i, psm in enumerate(psms): probam_dict = PROBAM_DEFAULTS.copy() (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm scan_name = spectrum_title if spectrum_title else spectrum_id if args.debug: print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr) exons = get_mapping(acc,pep_start,pep_end) if args.debug: print('%s' % exons, file=sys.stderr) if not exons: continue mods = get_psm_modifications(pepref) (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons) ref_cds = get_cds(exons) if args.debug: print('%s' % ref_cds, file=sys.stderr) ref_prot = translate(ref_cds) if args.debug: print('%s' % ref_prot, file=sys.stderr) print('%s' % peptide, file=sys.stderr) spectrum_peptides = spectrum_peptide_count(spectrum_id) peptide_locations = genomic_mapping_count(peptide) if args.debug: print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr) uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]' ts = time() proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd, '%s_%s' % (acc,scan_name), 1000,strand, blockCount,blockSizes,blockStarts, acc,peptide,uniqueness,args.genomeReference, charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass, mods=mods if mods else '.', psmRank=rank) proBED.add_entry(proBEDEntry) te = time() add_time('add_probed',te - ts) if len(ref_prot) != len(peptide): continue ts = time() probam_dict['NH'] = peptide_locations probam_dict['XO'] = uniqueness probam_dict['XL'] = peptide_locations probam_dict['XP'] = peptide probam_dict['YP'] = acc probam_dict['XC'] = charge probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass) probam_dict['XR'] = ref_prot # ? dbSequence probam_dict['YA'] = aa_post probam_dict['YB'] = aa_pre probam_dict['XM'] = mods if mods else '*' flag = 16 if strand == '-' else 0 if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="": flag += 256 probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons]) ## check for variation from ref_cds pep_cds = get_variant_cds(exons,ref_prot,peptide,ref_cds) peptide_type = '*' ## XG classify peptide probam_dict['XG'] = get_peptide_type(exons) ## probam_dict['MD'] = peptide ## FIX SAM sequence is forward strand seq = pep_cds if strand == '+' else reverse_complement(pep_cds) ## cigar based on plus strand cigar = '' if strand == '+': blkStarts = blockStarts blkSizes = blockSizes else: blkStarts = [x for x in reversed(blockStarts)] blkSizes = [x for x in reversed(blockSizes)] for j in range(blockCount): if j > 0: intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1]) if intron > 0: cigar += '%dN' % intron cigar += '%dM' % blkSizes[j] ## Mods TODO proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1, cigar=cigar,seq=seq,optional=probam_dict) proBAM.add_entry(proBAMEntry) te = time() add_time('add_probam',te - ts) if args.debug: print('%s' % probam_dict, file=sys.stderr) if args.limit and i >= args.limit: break if args.probed: ts = time() with open(args.probed,'w') as fh: proBED.write(fh) te = time() add_time('write_probed',te - ts) if args.prosam or args.probam: samfile = args.prosam if args.prosam else 'temp.sam' ts = time() with open(samfile,'w') as fh: proBAM.write(fh) te = time() add_time('write_prosam',te - ts) if args.probam: ts = time() bamfile = args.prosam.replace('.sam','.bam') pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False) te = time() add_time('write_probam',te - ts) pysam.index(args.probam) print('\n%s\n' % str(timings), file=sys.stderr) if __name__ == "__main__": __main__()