comparison mzsqlite_psm_align.py @ 0:492f98d89e26 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
author jjohnson
date Tue, 10 Apr 2018 09:57:49 -0400
parents
children af5f22779a8e
comparison
equal deleted inserted replaced
-1:000000000000 0:492f98d89e26
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2017, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 from __future__ import print_function
16
17 import argparse
18 import re
19 import sys
20 import sqlite3 as sqlite
21 from time import time
22
23
24 from Bio.Seq import reverse_complement, translate
25
26 ## from bedutil import bed_from_line
27
28 ## import digest
29
30 ## from ensembl_rest import get_cdna
31
32 import pysam
33 from twobitreader import TwoBitFile
34
35 from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry
36
37 """
38 inputs
39 proBed
40 mzIdentML
41 twobit
42 bam
43
44 inputs
45 mz.sqlite
46 genomic.mapping
47 bam
48
49 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT);
50 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER);
51 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT);
52 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER);
53
54 SELECT
55 FROM spectrum_identification_result_items siri
56 JOIN peptide_evidence pe ON siri.peptide_ref = pe.peptide_ref
57 JOIN db_sequence dbs ON pe.dBSequence_ref =
58 WHERE pe.isDecoy = 'false'
59
60 SELECT
61 psm.spectrumID,
62 psm.spectrumTitle as "QNAME",
63 psm.id,
64 psm.sequence,
65 psm.passThreshold,
66 psm."PeptideShaker PSM confidence",
67 psm."PeptideShaker PSM score",
68 pe.start,
69 pe.end,
70 pe.pre,
71 pe.post,
72 pe.dBSequence_ref
73 FROM psm_entries psm
74 JOIN peptide_evidence pe ON psm.id = pe.peptide_ref
75 JOIN db_sequence dbs ON pe.dBSequence_ref = dbs.accession
76 WHERE pe.isDecoy = 'false'
77 AND pe.peptide_ref = 'SFYPEEVSSMVITK_15.99491461956-ATAA-10'
78 ORDER BY psm.spectrumID
79
80
81 proBed to SQLite
82 or index proBed
83
84 for psm in psms:
85 beds = get_bed(protein_acc)
86 cds = ''
87 for bed in beds:
88 bed.seq = twobit[bed.chrom][bed.start,bed.end]
89 cds += bed.get_cds()
90 refprot = translate(cds)
91
92 def read_bed(path):
93 pdict = dict()
94 prog = re.compile('^([^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t.*)$')
95 with open(path,'r') as bed:
96 for i,line in enumerate(bed):
97 m = prog.match(line)
98 prot = m.groups()[1]
99 pdict[prot] = m.groups()[0]
100 return pdict
101
102 from pyteomics import mzid
103 with mzid.reader(args.mzid) as mzidrdr:
104 for psm in mzidrdr:
105 SpectrumIdentificationItems = psm['SpectrumIdentificationItem']
106 for SpectrumIdentificationItem in SpectrumIdentificationItems:
107 PeptideEvidenceRef = SpectrumIdentificationItem['PeptideEvidenceRef']
108 PepEvs = [r['peptideEvidence_ref'] for r in PeptideEvidenceRef]
109 for PepEv in PepEvs:
110 PepRef = mzidrdr[PepEv]
111 dBSequence_ref = PepRef['dBSequence_ref']
112
113 spectrum_peptides = count(distinct sequence) FROM psm_entries WHERE
114
115 1 QNAME String Query template NAME Spectrum name * psm.spectrumTitle
116 2 FLAG Int Bitwise FLAG Bitwise FLAG map.strand
117 3 RNAME String Reference sequence NAME Reference sequence NAME * map.chrom
118 4 POS Int 1-based leftmost mapping POSition 1-based leftmost mapping POSition map.start
119 5 -MAPQ Int MAPping Quality (Phred-scaled) - 255
120 6 CIGAR String Extended CIGAR string (operations: MIDN) CIGAR string * map.cigar
121 7 -RNEXT String Mate Reference NAME ('=' if same as RNAME) - *
122 8 -PNEXT Int 1-Based leftmost Mate POSition - 0
123 9 TLEN Int observed Template LENgth - 0
124 10 SEQ String segment SEQuence Coding sequence * genomic.seq
125 11 -QUAL String Query QUALity (ASCII-33=Phred base quality) - *
126
127 1 QNAME psm.spectrumTitle
128 2 FLAG map.strand
129 3 RNAME map.chrom
130 4 POS map.start
131 5 -MAPQ
132 6 CIGAR map.cigar
133 7 -RNEXT
134 8 -PNEXT
135 9 -TLEN
136 10 SEQ genomic.seq
137 11 -QUAL
138
139 'NH' : 'i' genomic_locations
140 'XO' : 'Z'
141 'XL' : 'i' spectrum_peptides
142 'XP' : 'Z' psm.sequence
143 'YP' : 'Z' peptide_evidence.dBSequence_ref
144 'XF' : 'Z' reading_frame
145 'XI' : 'f'
146 'XB' : 'Z'
147 'XR' : 'Z'
148 'YB' : 'Z'
149 'YA' : 'Z'
150 'XS' : 'f'
151 'XQ' : 'f'
152 'XC' : 'i'
153 'XA' : 'i'
154 'XM' : 'Z'
155 'XN' : 'i'
156 'XT' : 'i'
157 'XE' : 'i'
158 'XG' : 'A'
159 'XU' : 'Z'
160
161 'NH' : 'i', #number of genomic locations to which the peptide sequence maps
162 'XO' : 'Z', #uniqueness of the peptide mapping
163 'XL' : 'i', #number of peptides to which the spectrum maps
164 'XP' : 'Z', #peptide sequence
165 'YP' : 'Z', #Protein accession ID from the original search result
166 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
167 'XI' : 'f', #Peptide intensity
168 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
169 'XR' : 'Z', #reference peptide sequence
170 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
171 'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
172 'XS' : 'f', #PSM score
173 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
174 'XC' : 'i', #peptide charge
175 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
176 'XM' : 'Z', #Modifications
177 'XN' : 'i', #Number of missed cleavages in the peptide (XP)
178 'XT' : 'i', #Enzyme specificity
179 'XE' : 'i', #Enzyme used in the experiment
180 'XG' : 'A', #Peptide type
181 'XU' : 'Z', #URI
182
183
184 Datatype Field name Description Origin
185 RNAME string chrom map.chrom Reference sequence chromosome
186 POS uint chromStart map Start position of the first DNA base
187 uint chromEnd map End position of the last DNA base
188 QNAME string name spectrum.title Unique name
189 uint score Score
190 char[1] strand + or - for strand
191 uint thickStart Coding region start
192 uint thickEnd Coding region end
193 uint reserved Always 0
194 int blockCount Number of blocks
195 int[blockCount] blockSizes Block sizes
196 int[blockCount] chromStarts Block starts
197 YP string proteinAccession Protein accession number
198 XP string peptideSequence Peptide sequence
199 XO string uniqueness Peptide uniqueness
200 string genomeReferenceVersion Genome reference version number
201 XS double psmScore PSM score
202 XQ double fdr Estimated global false discovery rate
203 XM string modifications Post-translational modifications
204 XC int charge Charge value
205 XB double expMassToCharge Experimental mass to charge value
206 XB double calcMassToCharge Calculated mass to charge value
207 int psmRank Peptide-Spectrum Match rank.
208 string datasetID Dataset Identifier
209 string uri Uniform Resource Identifier
210
211 XG
212 N Normal peptide. The peptide sequence is contained in the reference protein sequence.
213 V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
214 W Indel peptide. An insertion or deletion is present as compared to the reference.
215 J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
216 A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
217 M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
218 C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
219 E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
220 B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
221 O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
222 T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
223 R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
224 I Intron peptide. A peptide that is located in an intronic region of the reference isoform.
225 G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
226 D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
227 U Unmapped peptide. A peptide that could not be mapped to a reference sequence.
228 X Unknown.
229
230
231
232 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - cds_offset - cds_start END as "pos"
233 FROM feature_cds_map
234 WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end
235
236 sqlite> select * from feature_cds_map WHERE name = 'pre_STRG.28813.4_j_5350_5470';
237 pre_STRG.28813.4_j_5350_5470|chr7|5074750|5074857|+|0|107
238 pre_STRG.28813.4_j_5350_5470|chr7|5075140|5075153|+|107|120
239
240
241
242 SELECT
243 pe.isDecoy,
244 pe.dBSequence_ref,
245 pe.start,
246 pe.end,
247 sr.spectrumTitle,
248 si.rank,
249 si.chargeState,
250 si.calculatedMassToCharge,
251 si.experimentalMassToCharge
252 FROM spectrum_identification_results sr
253 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
254 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
255 WHERE si.id = 'SII_7389_1'
256 ORDER BY si.rank;
257
258 SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge
259 FROM spectrum_identification_results sr
260 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
261 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
262 WHERE si.id = 'SII_7389_1'
263 ORDER BY si.rank;
264
265
266
267 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT);
268 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER);
269 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT);
270 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER);
271
272 {'write_probed': 0.08575654029846191, 'PSM_QUERY': 4.704349040985107, 'get_cds': 0.21015286445617676, 'SPECTRUM_PEPTIDES_QUERY': 32.92655086517334, 'PEPTIDE_ACC_QUERY': 425.11919951438904, 'get_mapping': 1.5911591053009033, 'GENOMIC_POS_QUERY': 10.909647226333618}
273 """
274
275
276
277 def regex_match(expr, item):
278 return re.match(expr, item) is not None
279
280
281 def regex_search(expr, item):
282 return re.search(expr, item) is not None
283
284
285 def regex_sub(expr, replace, item):
286 return re.sub(expr, replace, item)
287
288
289 def get_connection(sqlitedb_path, addfunctions=True):
290 conn = sqlite.connect(sqlitedb_path)
291 if addfunctions:
292 conn.create_function("re_match", 2, regex_match)
293 conn.create_function("re_search", 2, regex_search)
294 conn.create_function("re_sub", 3, regex_sub)
295 return conn
296
297 PSM_QUERY = """\
298 SELECT
299 pe.dBSequence_ref,
300 pe.start,
301 pe.end,
302 pe.pre,
303 pe.post,
304 pep.sequence,
305 sr.id,
306 sr.spectrumTitle,
307 si.rank,
308 si.chargeState,
309 si.calculatedMassToCharge,
310 si.experimentalMassToCharge,
311 si.peptide_ref
312 FROM spectrum_identification_results sr
313 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
314 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
315 JOIN peptides pep ON pe.peptide_ref = pep.id
316 WHERE pe.isDecoy = 'false'
317 ORDER BY sr.spectrumTitle,si.rank
318 """
319
320 PEP_MODS_QUERY = """\
321 SELECT location, residue, name, modType, '' as "unimod"
322 FROM peptide_modifications
323 WHERE peptide_ref = :peptide_ref
324 ORDER BY location, modType, name
325 """
326
327 #number of peptides to which the spectrum maps
328 ## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence
329 SPECTRUM_PEPTIDES_QUERY = """\
330 SELECT count(distinct pep.sequence)
331 FROM spectrum_identification_results sr
332 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
333 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
334 JOIN peptides pep ON pe.peptide_ref = pep.id
335 WHERE pe.isDecoy = 'false'
336 AND sr.id = :sr_id
337 GROUP BY sr.id
338 """
339 #number of genomic locations to which the peptide sequence maps
340 #uniqueness of the peptide mapping
341 ## peptides => peptide_evidence -> db_sequence -> location
342 ## proteins_by_peptide
343 PEPTIDE_ACC_QUERY = """\
344 SELECT
345 pe.dBSequence_ref,
346 pe.start,
347 pe.end
348 FROM peptide_evidence pe
349 JOIN peptides pep ON pe.peptide_ref = pep.id
350 WHERE pe.isDecoy = 'false'
351 AND pep.sequence = :sequence
352 """
353
354 MAP_QUERY = """\
355 SELECT distinct *
356 FROM feature_cds_map
357 WHERE name = :acc
358 AND :p_start < cds_end
359 AND :p_end >= cds_start
360 ORDER BY name,cds_start,cds_end
361 """
362
363 GENOMIC_POS_QUERY = """\
364 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos"
365 FROM feature_cds_map
366 WHERE name = :acc
367 AND :cds_offset >= cds_start
368 AND :cds_offset < cds_end
369 """
370
371 FEATURE_CONTAIN_QUERY = """\
372 SELECT id,seqid,start,end,featuretype,strand,frame
373 FROM features
374 WHERE seqid = :seqid AND start <= :start AND end >= :end
375 AND strand = :strand AND featuretype = :ftype
376 """
377
378 FEATURE_OVERLAP_QUERY = """\
379 SELECT id,seqid,start,end,featuretype,strand,frame
380 FROM features
381 WHERE seqid = :seqid
382 AND :end >= start AND :start <= end
383 AND strand = :strand AND featuretype = :ftype
384 """
385
386 FEATURE_ANY_QUERY = """\
387 SELECT id,seqid,start,end,featuretype,strand,CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame"
388 FROM features
389 WHERE seqid = :seqid
390 AND :end >= start AND :start <= end
391 AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript')
392 ORDER BY strand == :strand DESC, featuretype,
393 start <= :start AND end >= :end DESC,
394 in_frame DESC, end - start, start DESC, end
395 """
396
397 def __main__():
398 parser = argparse.ArgumentParser(
399 description='Generate proBED and proBAM from mz.sqlite')
400 parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
401 parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table")
402 parser.add_argument(
403 '-R', '--genomeReference', default='Unknown',
404 help='Genome reference sequence in 2bit format')
405 parser.add_argument(
406 '-t', '--twobit', default=None,
407 help='Genome reference sequence in 2bit format')
408 parser.add_argument(
409 '-r', '--reads_bam', default=None,
410 help='reads alignment bam path')
411 parser.add_argument(
412 '-g', '--gffutils_file', default=None,
413 help='gffutils GTF sqlite DB')
414 parser.add_argument(
415 '-B', '--probed', default=None,
416 help='proBed path')
417 parser.add_argument(
418 '-s', '--prosam', default=None,
419 help='proSAM path')
420 parser.add_argument(
421 '-b', '--probam', default=None,
422 help='proBAM path')
423 parser.add_argument(
424 '-l', '--limit', type=int, default=None,
425 help='limit numbers of PSMs for testing')
426 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
427 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
428 args = parser.parse_args()
429
430 def get_sequence(chrom, start, end):
431 if twobit:
432 if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
433 return twobit[chrom][start:end]
434 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
435 if contig in twobit and 0 <= start < end < len(twobit[contig]):
436 return twobit[contig][start:end]
437 return ''
438 return None
439
440 twobit = TwoBitFile(args.twobit) if args.twobit else None
441 samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None
442 seqlens = twobit.sequence_sizes()
443
444 probed = open(args.probed,'w') if args.probed else sys.stdout
445
446 gff_cursor = get_connection(args.gffutils_file).cursor() if args.gffutils_file else None
447 map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
448 mz_cursor = get_connection(args.mzsqlite_file).cursor()
449
450 unmapped_accs = set()
451 timings = dict()
452 def add_time(name,elapsed):
453 if name in timings:
454 timings[name] += elapsed
455 else:
456 timings[name] = elapsed
457
458 XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*']
459 FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript']
460 def get_peptide_type(exons):
461 ## XG classify peptide
462 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence.
463 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
464 ## W Indel peptide. An insertion or deletion is present as compared to the reference.
465 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
466 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
467 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
468 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
469 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
470 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
471 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
472 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
473 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
474 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform.
475 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
476 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
477 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence.
478 ## X Unknown.
479
480 peptide_type = '*'
481 if gff_cursor:
482 ts = time()
483 etypes = ['*'] * len(exons)
484 efeatures = [None] * len(exons)
485 if args.debug:
486 print('exons:%d\t%s'% (len(exons),etypes),file=sys.stderr)
487 for i,exon in enumerate(exons):
488 (acc,gc,gs,ge,st,cs,ce) = exon
489 fr = cs % 3
490 if args.debug:
491 print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr)
492 ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs, "end" : ge, 'strand' : st, 'frame' : fr, 'ftype' : 'CDS'}
493 features = [f for f in gff_cursor.execute(FEATURE_ANY_QUERY,ft_params)]
494 efeatures[i] = features
495 for i,exon in enumerate(exons):
496 (acc,gc,gs,ge,st,cs,ce) = exon
497 for f in efeatures[i]:
498 (id,seqid,start,end,featuretype,strand,frame,in_frame) = f
499 if args.debug:
500 print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id,seqid,start,end,featuretype,strand,frame,in_frame),file=sys.stderr)
501 if strand == st:
502 if start <= gs and ge <= end:
503 if in_frame:
504 etypes[i] = 'N'
505 break
506 elif XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
507 etypes[i] = 'O'
508 break
509 else:
510 if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
511 etypes[i] = 'O'
512 peptide_type = etypes[i]
513 te = time()
514 add_time('pep_type',te - ts)
515 return peptide_type
516 def classify_exon(exon,exons,features):
517 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence.
518 # 1 exon, contained, in_frame
519 # 2+ exons, contained, in_frame, on_exon_boundary
520 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
521 # 1 exon, contained, in_frame, AA_mismatch
522 # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch
523 ## W Indel peptide. An insertion or deletion is present as compared to the reference.
524 # 1 exon, contained, in_frame, AA_mismatch
525 # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch
526 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
527 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
528 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
529 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
530 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
531 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
532 # 1 exon overlaps but not contained
533 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
534 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
535 # exon overlaps a three_prime_utr
536 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
537 # exon contained but not in_frame
538 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
539 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
540 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform.
541 # exon contained in transcript, not not overlapping any exon
542 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
543 # exonis from different seqs, strand, or transcripts
544 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
545 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence.
546 ## X Unknown.
547 return '*'
548
549 def get_variant_cds(exons,ref_prot,peptide,pep_cds):
550 if ref_prot != peptide and samfile:
551 try:
552 if args.debug:
553 print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
554 ts = time()
555 for exon in exons:
556 (acc,chrom,start,end,strand,c_start,c_end) = exon
557 a_start = c_start / 3 * 3
558 a_end = c_end / 3 * 3
559 if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
560 pileup = get_exon_pileup(chrom,start,end)
561 for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]):
562 if ao == 0 or i == 0:
563 if ref_prot[ai] != peptide[ai]:
564 codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
565 if args.debug:
566 print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
567 if codon:
568 pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
569 te = time()
570 add_time('var_cds',te - ts)
571 except Exception as e:
572 print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
573 return pep_cds
574
575 def get_mapping(acc,pep_start,pep_end):
576 ts = time()
577 p_start = (pep_start - 1) * 3
578 p_end = pep_end * 3
579 map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end}
580 if args.debug:
581 print('%s' % map_params, file=sys.stderr)
582 locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)]
583 exons = []
584 ## ========= pep
585 ## --- continue
586 ## --- trim
587 ## --- copy
588 ## --- trim
589 ## --- break
590 c_end = 0
591 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs):
592 if args.debug:
593 print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
594 c_start = c_end
595 if cds_end < p_start:
596 continue
597 if cds_start >= p_end:
598 break
599 if strand == '+':
600 if cds_start < p_start:
601 start += p_start - cds_start
602 if cds_end > p_end:
603 end -= cds_end - p_end
604 else:
605 if cds_start < p_start:
606 end -= p_start - cds_start
607 if cds_end > p_end:
608 start += cds_end - p_end
609 c_end = c_start + abs(end - start)
610 if args.debug:
611 print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
612 exons.append([acc,chrom,start,end,strand,c_start,c_end])
613 te = time()
614 add_time('get_mapping',te - ts)
615 return exons
616
617 def get_cds(exons):
618 ts = time()
619 seqs = []
620 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons):
621 seq = get_sequence(chrom, min(start,end), max(start,end))
622 if strand == '-':
623 seq = reverse_complement(seq)
624 seqs.append(seq)
625 te = time()
626 add_time('get_cds',te - ts)
627 if args.debug:
628 print('CDS: %s' % str(seqs),file=sys.stderr)
629 return ''.join(seqs) if seqs else ''
630
631 def genomic_mapping_count(peptide):
632 ts = time()
633 params = {"sequence" : peptide}
634 acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)]
635 te = time()
636 add_time('PEPTIDE_ACC_QUERY',te - ts)
637 if acc_locs:
638 if len(acc_locs) == 1:
639 return 1
640 locations = set()
641 for i,acc_loc in enumerate(acc_locs):
642 (acc,pep_start,pep_end) = acc_loc
643 if acc in unmapped_accs:
644 continue
645 try:
646 add_time('GENOMIC_POS_QUERY_COUNT',1)
647 ts = time()
648 p_start = pep_start * 3
649 p_end = pep_end * 3
650 params = {"acc" : acc, "cds_offset" : p_start}
651 (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
652 params = {"acc" : acc, "cds_offset" : p_end}
653 (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
654 locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos))
655 te = time()
656 add_time('GENOMIC_POS_QUERY',te - ts)
657 except:
658 unmapped_accs.add(acc)
659 print('Unmapped: %s' % acc, file=sys.stderr)
660 return len(locations)
661 return -1
662
663 def spectrum_peptide_count(spectrum_id):
664 ts = time()
665 params = {"sr_id" : spectrum_id}
666 pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0]
667 te = time()
668 add_time('SPECTRUM_PEPTIDES_QUERY',te - ts)
669 return pep_count
670
671 def get_exon_pileup(chrom,chromStart,chromEnd):
672 cols = []
673 for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd):
674 if chromStart <= pileupcolumn.reference_pos <= chromEnd:
675 bases = dict()
676 col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases}
677 for pileupread in pileupcolumn.pileups:
678 if not pileupread.is_del and not pileupread.is_refskip:
679 col['depth'] += 1
680 base = pileupread.alignment.query_sequence[pileupread.query_position]
681 if base not in bases:
682 bases[base] = 1
683 else:
684 bases[base] += 1
685 cols.append(col)
686 return cols
687
688 codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L",
689 "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S",
690 "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*",
691 "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W",
692 "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
693 "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
694 "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
695 "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R",
696 "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M",
697 "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
698 "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K",
699 "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R",
700 "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
701 "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
702 "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E",
703 "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
704
705 aa_codon_map = dict()
706 for c,a in codon_map.items():
707 aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c]
708
709 aa_na_map = dict() # m[aa]{bo : {b1 : [b3]
710 for c,a in codon_map.items():
711 if a not in aa_na_map:
712 aa_na_map[a] = dict()
713 d = aa_na_map[a]
714 for i in range(3):
715 b = c[i]
716 if i < 2:
717 if b not in d:
718 d[b] = dict() if i < 1 else set()
719 d = d[b]
720 else:
721 d.add(b)
722
723 def get_pep_codon(pileup, idx, aa, ao):
724 try:
725 ts = time()
726 bases = []
727 for i in range(3):
728 if i < ao:
729 bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
730 else:
731 bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))])
732 print('%s' % bases)
733 for b0 in bases[0]:
734 if b0 not in aa_na_map[aa]:
735 continue
736 for b1 in bases[1]:
737 if b1 not in aa_na_map[aa][b0]:
738 continue
739 for b2 in bases[2]:
740 if b2 in aa_na_map[aa][b0][b1]:
741 return '%s%s%s' % (b0,b1,b2)
742 te = time()
743 add_time('pep_codon',te - ts)
744 except Exception as e:
745 print("get_pep_codon: %s %s %s %s"
746 % (aa, ao, idx, pileup), file=sys.stderr)
747 raise e
748 return None
749
750 def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts,
751 spectrum,protacc,peptide,uniqueness,genomeReference,score=1000,
752 psmScore='.', fdr='.', mods='.', charge='.',
753 expMassToCharge='.', calcMassToCharge='.',
754 psmRank='.', datasetID='.', uri='.'):
755 probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
756 (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount,
757 ','.join([str(v) for v in blockSizes]),
758 ','.join([str(v) for v in blockStarts]),
759 protacc,peptide,uniqueness, genomeReference,
760 psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri))
761
762 def get_genomic_location(exons):
763 chrom = exons[0][1]
764 strand = exons[0][4]
765 pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
766 chromStart = min(pos)
767 chromEnd = max(pos)
768 blockCount = len(exons)
769 blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
770 blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons]
771 return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts)
772
773 def get_psm_modifications(peptide_ref):
774 mods = []
775 ts = time()
776 params = {"peptide_ref" : peptide_ref}
777 pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
778 if pepmods:
779 for (location, residue, name, modType, unimod) in pepmods:
780 mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue)))
781 te = time()
782 add_time('PEP_MODS_QUERY',te - ts)
783 return ';'.join(mods)
784
785
786 """
787 QNAME
788 FLAG
789 RNAME
790 POS
791 CIGAR
792 SEQ
793 'NH' : 'i', #number of genomic locations to which the peptide sequence maps
794 'XO' : 'Z', #uniqueness of the peptide mapping
795 'XL' : 'i', #number of peptides to which the spectrum maps
796 'XP' : 'Z', #peptide sequence
797 'YP' : 'Z', #Protein accession ID from the original search result
798 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
799 'XI' : 'f', #Peptide intensity
800 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
801 'XR' : 'Z', #reference peptide sequence
802 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
803 'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
804 'XS' : 'f', #PSM score
805 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
806 'XC' : 'i', #peptide charge
807 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
808 'XM' : 'Z', #Modifications
809 'XN' : 'i', #Number of missed cleavages in the peptide (XP)
810 'XT' : 'i', #Enzyme specificity
811 'XE' : 'i', #Enzyme used in the experiment
812 'XG' : 'A', #Peptide type
813 'XU' : 'Z', #URI
814 """
815 psm_cursor = get_connection(args.mzsqlite_file).cursor()
816 ts = time()
817 psms = psm_cursor.execute(PSM_QUERY)
818 te = time()
819 add_time('PSM_QUERY',te - ts)
820 proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[])
821 proBED = ProBED(species=None,assembly=args.genomeReference,comments=[])
822 for i, psm in enumerate(psms):
823 probam_dict = PROBAM_DEFAULTS.copy()
824 (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm
825 scan_name = spectrum_title if spectrum_title else spectrum_id
826 if args.debug:
827 print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr)
828 exons = get_mapping(acc,pep_start,pep_end)
829 if args.debug:
830 print('%s' % exons, file=sys.stderr)
831 if not exons:
832 continue
833 mods = get_psm_modifications(pepref)
834 (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons)
835 ref_cds = get_cds(exons)
836 if args.debug:
837 print('%s' % ref_cds, file=sys.stderr)
838 ref_prot = translate(ref_cds)
839 if args.debug:
840 print('%s' % ref_prot, file=sys.stderr)
841 print('%s' % peptide, file=sys.stderr)
842 spectrum_peptides = spectrum_peptide_count(spectrum_id)
843 peptide_locations = genomic_mapping_count(peptide)
844 if args.debug:
845 print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr)
846 uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
847 ts = time()
848 proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd,
849 '%s_%s' % (acc,scan_name),
850 1000,strand,
851 blockCount,blockSizes,blockStarts,
852 acc,peptide,uniqueness,args.genomeReference,
853 charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass,
854 mods=mods if mods else '.', psmRank=rank)
855 proBED.add_entry(proBEDEntry)
856 te = time()
857 add_time('add_probed',te - ts)
858 if len(ref_prot) != len(peptide):
859 continue
860 ts = time()
861 probam_dict['NH'] = peptide_locations
862 probam_dict['XO'] = uniqueness
863 probam_dict['XL'] = peptide_locations
864 probam_dict['XP'] = peptide
865 probam_dict['YP'] = acc
866 probam_dict['XC'] = charge
867 probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
868 probam_dict['XR'] = ref_prot # ? dbSequence
869 probam_dict['YA'] = aa_post
870 probam_dict['YB'] = aa_pre
871 probam_dict['XM'] = mods if mods else '*'
872 flag = 16 if strand == '-' else 0
873 if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="":
874 flag += 256
875 probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
876 ## check for variation from ref_cds
877 pep_cds = get_variant_cds(exons,ref_prot,peptide,ref_cds)
878 peptide_type = '*'
879 ## XG classify peptide
880 probam_dict['XG'] = get_peptide_type(exons)
881 ## probam_dict['MD'] = peptide
882
883 ## FIX SAM sequence is forward strand
884 seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
885 ## cigar based on plus strand
886 cigar = ''
887 if strand == '+':
888 blkStarts = blockStarts
889 blkSizes = blockSizes
890 else:
891 blkStarts = [x for x in reversed(blockStarts)]
892 blkSizes = [x for x in reversed(blockSizes)]
893 for j in range(blockCount):
894 if j > 0:
895 intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1])
896 if intron > 0:
897 cigar += '%dN' % intron
898 cigar += '%dM' % blkSizes[j]
899 ## Mods TODO
900 proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1,
901 cigar=cigar,seq=seq,optional=probam_dict)
902 proBAM.add_entry(proBAMEntry)
903 te = time()
904 add_time('add_probam',te - ts)
905
906 if args.debug:
907 print('%s' % probam_dict, file=sys.stderr)
908
909 if args.limit and i >= args.limit:
910 break
911 if args.probed:
912 ts = time()
913 with open(args.probed,'w') as fh:
914 proBED.write(fh)
915 te = time()
916 add_time('write_probed',te - ts)
917 if args.prosam or args.probam:
918 samfile = args.prosam if args.prosam else 'temp.sam'
919 ts = time()
920 with open(samfile,'w') as fh:
921 proBAM.write(fh)
922 te = time()
923 add_time('write_prosam',te - ts)
924 if args.probam:
925 ts = time()
926 bamfile = args.prosam.replace('.sam','.bam')
927 pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
928 te = time()
929 add_time('write_probam',te - ts)
930 pysam.index(args.probam)
931
932 print('\n%s\n' % str(timings), file=sys.stderr)
933
934 if __name__ == "__main__":
935 __main__()