Mercurial > repos > jjohnson > mzsqlite_psm_align
comparison mzsqlite_psm_align.py @ 0:492f98d89e26 draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
author | jjohnson |
---|---|
date | Tue, 10 Apr 2018 09:57:49 -0400 |
parents | |
children | af5f22779a8e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:492f98d89e26 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 # | |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2017, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | |
9 # | |
10 # James E Johnson | |
11 # | |
12 #------------------------------------------------------------------------------ | |
13 """ | |
14 | |
15 from __future__ import print_function | |
16 | |
17 import argparse | |
18 import re | |
19 import sys | |
20 import sqlite3 as sqlite | |
21 from time import time | |
22 | |
23 | |
24 from Bio.Seq import reverse_complement, translate | |
25 | |
26 ## from bedutil import bed_from_line | |
27 | |
28 ## import digest | |
29 | |
30 ## from ensembl_rest import get_cdna | |
31 | |
32 import pysam | |
33 from twobitreader import TwoBitFile | |
34 | |
35 from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry | |
36 | |
37 """ | |
38 inputs | |
39 proBed | |
40 mzIdentML | |
41 twobit | |
42 bam | |
43 | |
44 inputs | |
45 mz.sqlite | |
46 genomic.mapping | |
47 bam | |
48 | |
49 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); | |
50 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); | |
51 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); | |
52 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); | |
53 | |
54 SELECT | |
55 FROM spectrum_identification_result_items siri | |
56 JOIN peptide_evidence pe ON siri.peptide_ref = pe.peptide_ref | |
57 JOIN db_sequence dbs ON pe.dBSequence_ref = | |
58 WHERE pe.isDecoy = 'false' | |
59 | |
60 SELECT | |
61 psm.spectrumID, | |
62 psm.spectrumTitle as "QNAME", | |
63 psm.id, | |
64 psm.sequence, | |
65 psm.passThreshold, | |
66 psm."PeptideShaker PSM confidence", | |
67 psm."PeptideShaker PSM score", | |
68 pe.start, | |
69 pe.end, | |
70 pe.pre, | |
71 pe.post, | |
72 pe.dBSequence_ref | |
73 FROM psm_entries psm | |
74 JOIN peptide_evidence pe ON psm.id = pe.peptide_ref | |
75 JOIN db_sequence dbs ON pe.dBSequence_ref = dbs.accession | |
76 WHERE pe.isDecoy = 'false' | |
77 AND pe.peptide_ref = 'SFYPEEVSSMVITK_15.99491461956-ATAA-10' | |
78 ORDER BY psm.spectrumID | |
79 | |
80 | |
81 proBed to SQLite | |
82 or index proBed | |
83 | |
84 for psm in psms: | |
85 beds = get_bed(protein_acc) | |
86 cds = '' | |
87 for bed in beds: | |
88 bed.seq = twobit[bed.chrom][bed.start,bed.end] | |
89 cds += bed.get_cds() | |
90 refprot = translate(cds) | |
91 | |
92 def read_bed(path): | |
93 pdict = dict() | |
94 prog = re.compile('^([^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t.*)$') | |
95 with open(path,'r') as bed: | |
96 for i,line in enumerate(bed): | |
97 m = prog.match(line) | |
98 prot = m.groups()[1] | |
99 pdict[prot] = m.groups()[0] | |
100 return pdict | |
101 | |
102 from pyteomics import mzid | |
103 with mzid.reader(args.mzid) as mzidrdr: | |
104 for psm in mzidrdr: | |
105 SpectrumIdentificationItems = psm['SpectrumIdentificationItem'] | |
106 for SpectrumIdentificationItem in SpectrumIdentificationItems: | |
107 PeptideEvidenceRef = SpectrumIdentificationItem['PeptideEvidenceRef'] | |
108 PepEvs = [r['peptideEvidence_ref'] for r in PeptideEvidenceRef] | |
109 for PepEv in PepEvs: | |
110 PepRef = mzidrdr[PepEv] | |
111 dBSequence_ref = PepRef['dBSequence_ref'] | |
112 | |
113 spectrum_peptides = count(distinct sequence) FROM psm_entries WHERE | |
114 | |
115 1 QNAME String Query template NAME Spectrum name * psm.spectrumTitle | |
116 2 FLAG Int Bitwise FLAG Bitwise FLAG map.strand | |
117 3 RNAME String Reference sequence NAME Reference sequence NAME * map.chrom | |
118 4 POS Int 1-based leftmost mapping POSition 1-based leftmost mapping POSition map.start | |
119 5 -MAPQ Int MAPping Quality (Phred-scaled) - 255 | |
120 6 CIGAR String Extended CIGAR string (operations: MIDN) CIGAR string * map.cigar | |
121 7 -RNEXT String Mate Reference NAME ('=' if same as RNAME) - * | |
122 8 -PNEXT Int 1-Based leftmost Mate POSition - 0 | |
123 9 TLEN Int observed Template LENgth - 0 | |
124 10 SEQ String segment SEQuence Coding sequence * genomic.seq | |
125 11 -QUAL String Query QUALity (ASCII-33=Phred base quality) - * | |
126 | |
127 1 QNAME psm.spectrumTitle | |
128 2 FLAG map.strand | |
129 3 RNAME map.chrom | |
130 4 POS map.start | |
131 5 -MAPQ | |
132 6 CIGAR map.cigar | |
133 7 -RNEXT | |
134 8 -PNEXT | |
135 9 -TLEN | |
136 10 SEQ genomic.seq | |
137 11 -QUAL | |
138 | |
139 'NH' : 'i' genomic_locations | |
140 'XO' : 'Z' | |
141 'XL' : 'i' spectrum_peptides | |
142 'XP' : 'Z' psm.sequence | |
143 'YP' : 'Z' peptide_evidence.dBSequence_ref | |
144 'XF' : 'Z' reading_frame | |
145 'XI' : 'f' | |
146 'XB' : 'Z' | |
147 'XR' : 'Z' | |
148 'YB' : 'Z' | |
149 'YA' : 'Z' | |
150 'XS' : 'f' | |
151 'XQ' : 'f' | |
152 'XC' : 'i' | |
153 'XA' : 'i' | |
154 'XM' : 'Z' | |
155 'XN' : 'i' | |
156 'XT' : 'i' | |
157 'XE' : 'i' | |
158 'XG' : 'A' | |
159 'XU' : 'Z' | |
160 | |
161 'NH' : 'i', #number of genomic locations to which the peptide sequence maps | |
162 'XO' : 'Z', #uniqueness of the peptide mapping | |
163 'XL' : 'i', #number of peptides to which the spectrum maps | |
164 'XP' : 'Z', #peptide sequence | |
165 'YP' : 'Z', #Protein accession ID from the original search result | |
166 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) | |
167 'XI' : 'f', #Peptide intensity | |
168 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). | |
169 'XR' : 'Z', #reference peptide sequence | |
170 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). | |
171 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). | |
172 'XS' : 'f', #PSM score | |
173 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). | |
174 'XC' : 'i', #peptide charge | |
175 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; | |
176 'XM' : 'Z', #Modifications | |
177 'XN' : 'i', #Number of missed cleavages in the peptide (XP) | |
178 'XT' : 'i', #Enzyme specificity | |
179 'XE' : 'i', #Enzyme used in the experiment | |
180 'XG' : 'A', #Peptide type | |
181 'XU' : 'Z', #URI | |
182 | |
183 | |
184 Datatype Field name Description Origin | |
185 RNAME string chrom map.chrom Reference sequence chromosome | |
186 POS uint chromStart map Start position of the first DNA base | |
187 uint chromEnd map End position of the last DNA base | |
188 QNAME string name spectrum.title Unique name | |
189 uint score Score | |
190 char[1] strand + or - for strand | |
191 uint thickStart Coding region start | |
192 uint thickEnd Coding region end | |
193 uint reserved Always 0 | |
194 int blockCount Number of blocks | |
195 int[blockCount] blockSizes Block sizes | |
196 int[blockCount] chromStarts Block starts | |
197 YP string proteinAccession Protein accession number | |
198 XP string peptideSequence Peptide sequence | |
199 XO string uniqueness Peptide uniqueness | |
200 string genomeReferenceVersion Genome reference version number | |
201 XS double psmScore PSM score | |
202 XQ double fdr Estimated global false discovery rate | |
203 XM string modifications Post-translational modifications | |
204 XC int charge Charge value | |
205 XB double expMassToCharge Experimental mass to charge value | |
206 XB double calcMassToCharge Calculated mass to charge value | |
207 int psmRank Peptide-Spectrum Match rank. | |
208 string datasetID Dataset Identifier | |
209 string uri Uniform Resource Identifier | |
210 | |
211 XG | |
212 N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
213 V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
214 W Indel peptide. An insertion or deletion is present as compared to the reference. | |
215 J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
216 A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
217 M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
218 C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
219 E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
220 B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
221 O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
222 T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
223 R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
224 I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
225 G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
226 D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
227 U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
228 X Unknown. | |
229 | |
230 | |
231 | |
232 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - cds_offset - cds_start END as "pos" | |
233 FROM feature_cds_map | |
234 WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end | |
235 | |
236 sqlite> select * from feature_cds_map WHERE name = 'pre_STRG.28813.4_j_5350_5470'; | |
237 pre_STRG.28813.4_j_5350_5470|chr7|5074750|5074857|+|0|107 | |
238 pre_STRG.28813.4_j_5350_5470|chr7|5075140|5075153|+|107|120 | |
239 | |
240 | |
241 | |
242 SELECT | |
243 pe.isDecoy, | |
244 pe.dBSequence_ref, | |
245 pe.start, | |
246 pe.end, | |
247 sr.spectrumTitle, | |
248 si.rank, | |
249 si.chargeState, | |
250 si.calculatedMassToCharge, | |
251 si.experimentalMassToCharge | |
252 FROM spectrum_identification_results sr | |
253 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
254 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
255 WHERE si.id = 'SII_7389_1' | |
256 ORDER BY si.rank; | |
257 | |
258 SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge | |
259 FROM spectrum_identification_results sr | |
260 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
261 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
262 WHERE si.id = 'SII_7389_1' | |
263 ORDER BY si.rank; | |
264 | |
265 | |
266 | |
267 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); | |
268 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); | |
269 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); | |
270 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); | |
271 | |
272 {'write_probed': 0.08575654029846191, 'PSM_QUERY': 4.704349040985107, 'get_cds': 0.21015286445617676, 'SPECTRUM_PEPTIDES_QUERY': 32.92655086517334, 'PEPTIDE_ACC_QUERY': 425.11919951438904, 'get_mapping': 1.5911591053009033, 'GENOMIC_POS_QUERY': 10.909647226333618} | |
273 """ | |
274 | |
275 | |
276 | |
277 def regex_match(expr, item): | |
278 return re.match(expr, item) is not None | |
279 | |
280 | |
281 def regex_search(expr, item): | |
282 return re.search(expr, item) is not None | |
283 | |
284 | |
285 def regex_sub(expr, replace, item): | |
286 return re.sub(expr, replace, item) | |
287 | |
288 | |
289 def get_connection(sqlitedb_path, addfunctions=True): | |
290 conn = sqlite.connect(sqlitedb_path) | |
291 if addfunctions: | |
292 conn.create_function("re_match", 2, regex_match) | |
293 conn.create_function("re_search", 2, regex_search) | |
294 conn.create_function("re_sub", 3, regex_sub) | |
295 return conn | |
296 | |
297 PSM_QUERY = """\ | |
298 SELECT | |
299 pe.dBSequence_ref, | |
300 pe.start, | |
301 pe.end, | |
302 pe.pre, | |
303 pe.post, | |
304 pep.sequence, | |
305 sr.id, | |
306 sr.spectrumTitle, | |
307 si.rank, | |
308 si.chargeState, | |
309 si.calculatedMassToCharge, | |
310 si.experimentalMassToCharge, | |
311 si.peptide_ref | |
312 FROM spectrum_identification_results sr | |
313 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
314 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
315 JOIN peptides pep ON pe.peptide_ref = pep.id | |
316 WHERE pe.isDecoy = 'false' | |
317 ORDER BY sr.spectrumTitle,si.rank | |
318 """ | |
319 | |
320 PEP_MODS_QUERY = """\ | |
321 SELECT location, residue, name, modType, '' as "unimod" | |
322 FROM peptide_modifications | |
323 WHERE peptide_ref = :peptide_ref | |
324 ORDER BY location, modType, name | |
325 """ | |
326 | |
327 #number of peptides to which the spectrum maps | |
328 ## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence | |
329 SPECTRUM_PEPTIDES_QUERY = """\ | |
330 SELECT count(distinct pep.sequence) | |
331 FROM spectrum_identification_results sr | |
332 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
333 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
334 JOIN peptides pep ON pe.peptide_ref = pep.id | |
335 WHERE pe.isDecoy = 'false' | |
336 AND sr.id = :sr_id | |
337 GROUP BY sr.id | |
338 """ | |
339 #number of genomic locations to which the peptide sequence maps | |
340 #uniqueness of the peptide mapping | |
341 ## peptides => peptide_evidence -> db_sequence -> location | |
342 ## proteins_by_peptide | |
343 PEPTIDE_ACC_QUERY = """\ | |
344 SELECT | |
345 pe.dBSequence_ref, | |
346 pe.start, | |
347 pe.end | |
348 FROM peptide_evidence pe | |
349 JOIN peptides pep ON pe.peptide_ref = pep.id | |
350 WHERE pe.isDecoy = 'false' | |
351 AND pep.sequence = :sequence | |
352 """ | |
353 | |
354 MAP_QUERY = """\ | |
355 SELECT distinct * | |
356 FROM feature_cds_map | |
357 WHERE name = :acc | |
358 AND :p_start < cds_end | |
359 AND :p_end >= cds_start | |
360 ORDER BY name,cds_start,cds_end | |
361 """ | |
362 | |
363 GENOMIC_POS_QUERY = """\ | |
364 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos" | |
365 FROM feature_cds_map | |
366 WHERE name = :acc | |
367 AND :cds_offset >= cds_start | |
368 AND :cds_offset < cds_end | |
369 """ | |
370 | |
371 FEATURE_CONTAIN_QUERY = """\ | |
372 SELECT id,seqid,start,end,featuretype,strand,frame | |
373 FROM features | |
374 WHERE seqid = :seqid AND start <= :start AND end >= :end | |
375 AND strand = :strand AND featuretype = :ftype | |
376 """ | |
377 | |
378 FEATURE_OVERLAP_QUERY = """\ | |
379 SELECT id,seqid,start,end,featuretype,strand,frame | |
380 FROM features | |
381 WHERE seqid = :seqid | |
382 AND :end >= start AND :start <= end | |
383 AND strand = :strand AND featuretype = :ftype | |
384 """ | |
385 | |
386 FEATURE_ANY_QUERY = """\ | |
387 SELECT id,seqid,start,end,featuretype,strand,CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame" | |
388 FROM features | |
389 WHERE seqid = :seqid | |
390 AND :end >= start AND :start <= end | |
391 AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript') | |
392 ORDER BY strand == :strand DESC, featuretype, | |
393 start <= :start AND end >= :end DESC, | |
394 in_frame DESC, end - start, start DESC, end | |
395 """ | |
396 | |
397 def __main__(): | |
398 parser = argparse.ArgumentParser( | |
399 description='Generate proBED and proBAM from mz.sqlite') | |
400 parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML") | |
401 parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table") | |
402 parser.add_argument( | |
403 '-R', '--genomeReference', default='Unknown', | |
404 help='Genome reference sequence in 2bit format') | |
405 parser.add_argument( | |
406 '-t', '--twobit', default=None, | |
407 help='Genome reference sequence in 2bit format') | |
408 parser.add_argument( | |
409 '-r', '--reads_bam', default=None, | |
410 help='reads alignment bam path') | |
411 parser.add_argument( | |
412 '-g', '--gffutils_file', default=None, | |
413 help='gffutils GTF sqlite DB') | |
414 parser.add_argument( | |
415 '-B', '--probed', default=None, | |
416 help='proBed path') | |
417 parser.add_argument( | |
418 '-s', '--prosam', default=None, | |
419 help='proSAM path') | |
420 parser.add_argument( | |
421 '-b', '--probam', default=None, | |
422 help='proBAM path') | |
423 parser.add_argument( | |
424 '-l', '--limit', type=int, default=None, | |
425 help='limit numbers of PSMs for testing') | |
426 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') | |
427 parser.add_argument('-d', '--debug', action='store_true', help='Debug') | |
428 args = parser.parse_args() | |
429 | |
430 def get_sequence(chrom, start, end): | |
431 if twobit: | |
432 if chrom in twobit and 0 <= start < end < len(twobit[chrom]): | |
433 return twobit[chrom][start:end] | |
434 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom | |
435 if contig in twobit and 0 <= start < end < len(twobit[contig]): | |
436 return twobit[contig][start:end] | |
437 return '' | |
438 return None | |
439 | |
440 twobit = TwoBitFile(args.twobit) if args.twobit else None | |
441 samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None | |
442 seqlens = twobit.sequence_sizes() | |
443 | |
444 probed = open(args.probed,'w') if args.probed else sys.stdout | |
445 | |
446 gff_cursor = get_connection(args.gffutils_file).cursor() if args.gffutils_file else None | |
447 map_cursor = get_connection(args.genomic_mapping_sqlite).cursor() | |
448 mz_cursor = get_connection(args.mzsqlite_file).cursor() | |
449 | |
450 unmapped_accs = set() | |
451 timings = dict() | |
452 def add_time(name,elapsed): | |
453 if name in timings: | |
454 timings[name] += elapsed | |
455 else: | |
456 timings[name] = elapsed | |
457 | |
458 XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*'] | |
459 FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript'] | |
460 def get_peptide_type(exons): | |
461 ## XG classify peptide | |
462 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
463 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
464 ## W Indel peptide. An insertion or deletion is present as compared to the reference. | |
465 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
466 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
467 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
468 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
469 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
470 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
471 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
472 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
473 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
474 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
475 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
476 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
477 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
478 ## X Unknown. | |
479 | |
480 peptide_type = '*' | |
481 if gff_cursor: | |
482 ts = time() | |
483 etypes = ['*'] * len(exons) | |
484 efeatures = [None] * len(exons) | |
485 if args.debug: | |
486 print('exons:%d\t%s'% (len(exons),etypes),file=sys.stderr) | |
487 for i,exon in enumerate(exons): | |
488 (acc,gc,gs,ge,st,cs,ce) = exon | |
489 fr = cs % 3 | |
490 if args.debug: | |
491 print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr) | |
492 ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs, "end" : ge, 'strand' : st, 'frame' : fr, 'ftype' : 'CDS'} | |
493 features = [f for f in gff_cursor.execute(FEATURE_ANY_QUERY,ft_params)] | |
494 efeatures[i] = features | |
495 for i,exon in enumerate(exons): | |
496 (acc,gc,gs,ge,st,cs,ce) = exon | |
497 for f in efeatures[i]: | |
498 (id,seqid,start,end,featuretype,strand,frame,in_frame) = f | |
499 if args.debug: | |
500 print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id,seqid,start,end,featuretype,strand,frame,in_frame),file=sys.stderr) | |
501 if strand == st: | |
502 if start <= gs and ge <= end: | |
503 if in_frame: | |
504 etypes[i] = 'N' | |
505 break | |
506 elif XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): | |
507 etypes[i] = 'O' | |
508 break | |
509 else: | |
510 if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): | |
511 etypes[i] = 'O' | |
512 peptide_type = etypes[i] | |
513 te = time() | |
514 add_time('pep_type',te - ts) | |
515 return peptide_type | |
516 def classify_exon(exon,exons,features): | |
517 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
518 # 1 exon, contained, in_frame | |
519 # 2+ exons, contained, in_frame, on_exon_boundary | |
520 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
521 # 1 exon, contained, in_frame, AA_mismatch | |
522 # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch | |
523 ## W Indel peptide. An insertion or deletion is present as compared to the reference. | |
524 # 1 exon, contained, in_frame, AA_mismatch | |
525 # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch | |
526 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
527 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons | |
528 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
529 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons | |
530 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
531 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
532 # 1 exon overlaps but not contained | |
533 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
534 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
535 # exon overlaps a three_prime_utr | |
536 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
537 # exon contained but not in_frame | |
538 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
539 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
540 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
541 # exon contained in transcript, not not overlapping any exon | |
542 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
543 # exonis from different seqs, strand, or transcripts | |
544 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
545 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
546 ## X Unknown. | |
547 return '*' | |
548 | |
549 def get_variant_cds(exons,ref_prot,peptide,pep_cds): | |
550 if ref_prot != peptide and samfile: | |
551 try: | |
552 if args.debug: | |
553 print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr) | |
554 ts = time() | |
555 for exon in exons: | |
556 (acc,chrom,start,end,strand,c_start,c_end) = exon | |
557 a_start = c_start / 3 * 3 | |
558 a_end = c_end / 3 * 3 | |
559 if ref_prot[a_start:a_end] != peptide[a_start:a_end]: | |
560 pileup = get_exon_pileup(chrom,start,end) | |
561 for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]): | |
562 if ao == 0 or i == 0: | |
563 if ref_prot[ai] != peptide[ai]: | |
564 codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao) | |
565 if args.debug: | |
566 print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr) | |
567 if codon: | |
568 pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:] | |
569 te = time() | |
570 add_time('var_cds',te - ts) | |
571 except Exception as e: | |
572 print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr) | |
573 return pep_cds | |
574 | |
575 def get_mapping(acc,pep_start,pep_end): | |
576 ts = time() | |
577 p_start = (pep_start - 1) * 3 | |
578 p_end = pep_end * 3 | |
579 map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end} | |
580 if args.debug: | |
581 print('%s' % map_params, file=sys.stderr) | |
582 locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)] | |
583 exons = [] | |
584 ## ========= pep | |
585 ## --- continue | |
586 ## --- trim | |
587 ## --- copy | |
588 ## --- trim | |
589 ## --- break | |
590 c_end = 0 | |
591 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs): | |
592 if args.debug: | |
593 print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) | |
594 c_start = c_end | |
595 if cds_end < p_start: | |
596 continue | |
597 if cds_start >= p_end: | |
598 break | |
599 if strand == '+': | |
600 if cds_start < p_start: | |
601 start += p_start - cds_start | |
602 if cds_end > p_end: | |
603 end -= cds_end - p_end | |
604 else: | |
605 if cds_start < p_start: | |
606 end -= p_start - cds_start | |
607 if cds_end > p_end: | |
608 start += cds_end - p_end | |
609 c_end = c_start + abs(end - start) | |
610 if args.debug: | |
611 print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) | |
612 exons.append([acc,chrom,start,end,strand,c_start,c_end]) | |
613 te = time() | |
614 add_time('get_mapping',te - ts) | |
615 return exons | |
616 | |
617 def get_cds(exons): | |
618 ts = time() | |
619 seqs = [] | |
620 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons): | |
621 seq = get_sequence(chrom, min(start,end), max(start,end)) | |
622 if strand == '-': | |
623 seq = reverse_complement(seq) | |
624 seqs.append(seq) | |
625 te = time() | |
626 add_time('get_cds',te - ts) | |
627 if args.debug: | |
628 print('CDS: %s' % str(seqs),file=sys.stderr) | |
629 return ''.join(seqs) if seqs else '' | |
630 | |
631 def genomic_mapping_count(peptide): | |
632 ts = time() | |
633 params = {"sequence" : peptide} | |
634 acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)] | |
635 te = time() | |
636 add_time('PEPTIDE_ACC_QUERY',te - ts) | |
637 if acc_locs: | |
638 if len(acc_locs) == 1: | |
639 return 1 | |
640 locations = set() | |
641 for i,acc_loc in enumerate(acc_locs): | |
642 (acc,pep_start,pep_end) = acc_loc | |
643 if acc in unmapped_accs: | |
644 continue | |
645 try: | |
646 add_time('GENOMIC_POS_QUERY_COUNT',1) | |
647 ts = time() | |
648 p_start = pep_start * 3 | |
649 p_end = pep_end * 3 | |
650 params = {"acc" : acc, "cds_offset" : p_start} | |
651 (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() | |
652 params = {"acc" : acc, "cds_offset" : p_end} | |
653 (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() | |
654 locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos)) | |
655 te = time() | |
656 add_time('GENOMIC_POS_QUERY',te - ts) | |
657 except: | |
658 unmapped_accs.add(acc) | |
659 print('Unmapped: %s' % acc, file=sys.stderr) | |
660 return len(locations) | |
661 return -1 | |
662 | |
663 def spectrum_peptide_count(spectrum_id): | |
664 ts = time() | |
665 params = {"sr_id" : spectrum_id} | |
666 pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0] | |
667 te = time() | |
668 add_time('SPECTRUM_PEPTIDES_QUERY',te - ts) | |
669 return pep_count | |
670 | |
671 def get_exon_pileup(chrom,chromStart,chromEnd): | |
672 cols = [] | |
673 for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd): | |
674 if chromStart <= pileupcolumn.reference_pos <= chromEnd: | |
675 bases = dict() | |
676 col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases} | |
677 for pileupread in pileupcolumn.pileups: | |
678 if not pileupread.is_del and not pileupread.is_refskip: | |
679 col['depth'] += 1 | |
680 base = pileupread.alignment.query_sequence[pileupread.query_position] | |
681 if base not in bases: | |
682 bases[base] = 1 | |
683 else: | |
684 bases[base] += 1 | |
685 cols.append(col) | |
686 return cols | |
687 | |
688 codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L", | |
689 "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", | |
690 "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*", | |
691 "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W", | |
692 "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L", | |
693 "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
694 "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
695 "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
696 "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M", | |
697 "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
698 "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
699 "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
700 "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V", | |
701 "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
702 "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
703 "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
704 | |
705 aa_codon_map = dict() | |
706 for c,a in codon_map.items(): | |
707 aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c] | |
708 | |
709 aa_na_map = dict() # m[aa]{bo : {b1 : [b3] | |
710 for c,a in codon_map.items(): | |
711 if a not in aa_na_map: | |
712 aa_na_map[a] = dict() | |
713 d = aa_na_map[a] | |
714 for i in range(3): | |
715 b = c[i] | |
716 if i < 2: | |
717 if b not in d: | |
718 d[b] = dict() if i < 1 else set() | |
719 d = d[b] | |
720 else: | |
721 d.add(b) | |
722 | |
723 def get_pep_codon(pileup, idx, aa, ao): | |
724 try: | |
725 ts = time() | |
726 bases = [] | |
727 for i in range(3): | |
728 if i < ao: | |
729 bases.append(list(set([c[i] for c in aa_codon_map[aa]]))) | |
730 else: | |
731 bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))]) | |
732 print('%s' % bases) | |
733 for b0 in bases[0]: | |
734 if b0 not in aa_na_map[aa]: | |
735 continue | |
736 for b1 in bases[1]: | |
737 if b1 not in aa_na_map[aa][b0]: | |
738 continue | |
739 for b2 in bases[2]: | |
740 if b2 in aa_na_map[aa][b0][b1]: | |
741 return '%s%s%s' % (b0,b1,b2) | |
742 te = time() | |
743 add_time('pep_codon',te - ts) | |
744 except Exception as e: | |
745 print("get_pep_codon: %s %s %s %s" | |
746 % (aa, ao, idx, pileup), file=sys.stderr) | |
747 raise e | |
748 return None | |
749 | |
750 def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts, | |
751 spectrum,protacc,peptide,uniqueness,genomeReference,score=1000, | |
752 psmScore='.', fdr='.', mods='.', charge='.', | |
753 expMassToCharge='.', calcMassToCharge='.', | |
754 psmRank='.', datasetID='.', uri='.'): | |
755 probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \ | |
756 (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount, | |
757 ','.join([str(v) for v in blockSizes]), | |
758 ','.join([str(v) for v in blockStarts]), | |
759 protacc,peptide,uniqueness, genomeReference, | |
760 psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri)) | |
761 | |
762 def get_genomic_location(exons): | |
763 chrom = exons[0][1] | |
764 strand = exons[0][4] | |
765 pos = [exon[2] for exon in exons] + [exon[3] for exon in exons] | |
766 chromStart = min(pos) | |
767 chromEnd = max(pos) | |
768 blockCount = len(exons) | |
769 blockSizes = [abs(exon[3] - exon[2]) for exon in exons] | |
770 blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons] | |
771 return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) | |
772 | |
773 def get_psm_modifications(peptide_ref): | |
774 mods = [] | |
775 ts = time() | |
776 params = {"peptide_ref" : peptide_ref} | |
777 pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)] | |
778 if pepmods: | |
779 for (location, residue, name, modType, unimod) in pepmods: | |
780 mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue))) | |
781 te = time() | |
782 add_time('PEP_MODS_QUERY',te - ts) | |
783 return ';'.join(mods) | |
784 | |
785 | |
786 """ | |
787 QNAME | |
788 FLAG | |
789 RNAME | |
790 POS | |
791 CIGAR | |
792 SEQ | |
793 'NH' : 'i', #number of genomic locations to which the peptide sequence maps | |
794 'XO' : 'Z', #uniqueness of the peptide mapping | |
795 'XL' : 'i', #number of peptides to which the spectrum maps | |
796 'XP' : 'Z', #peptide sequence | |
797 'YP' : 'Z', #Protein accession ID from the original search result | |
798 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) | |
799 'XI' : 'f', #Peptide intensity | |
800 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). | |
801 'XR' : 'Z', #reference peptide sequence | |
802 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). | |
803 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). | |
804 'XS' : 'f', #PSM score | |
805 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). | |
806 'XC' : 'i', #peptide charge | |
807 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; | |
808 'XM' : 'Z', #Modifications | |
809 'XN' : 'i', #Number of missed cleavages in the peptide (XP) | |
810 'XT' : 'i', #Enzyme specificity | |
811 'XE' : 'i', #Enzyme used in the experiment | |
812 'XG' : 'A', #Peptide type | |
813 'XU' : 'Z', #URI | |
814 """ | |
815 psm_cursor = get_connection(args.mzsqlite_file).cursor() | |
816 ts = time() | |
817 psms = psm_cursor.execute(PSM_QUERY) | |
818 te = time() | |
819 add_time('PSM_QUERY',te - ts) | |
820 proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[]) | |
821 proBED = ProBED(species=None,assembly=args.genomeReference,comments=[]) | |
822 for i, psm in enumerate(psms): | |
823 probam_dict = PROBAM_DEFAULTS.copy() | |
824 (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm | |
825 scan_name = spectrum_title if spectrum_title else spectrum_id | |
826 if args.debug: | |
827 print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr) | |
828 exons = get_mapping(acc,pep_start,pep_end) | |
829 if args.debug: | |
830 print('%s' % exons, file=sys.stderr) | |
831 if not exons: | |
832 continue | |
833 mods = get_psm_modifications(pepref) | |
834 (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons) | |
835 ref_cds = get_cds(exons) | |
836 if args.debug: | |
837 print('%s' % ref_cds, file=sys.stderr) | |
838 ref_prot = translate(ref_cds) | |
839 if args.debug: | |
840 print('%s' % ref_prot, file=sys.stderr) | |
841 print('%s' % peptide, file=sys.stderr) | |
842 spectrum_peptides = spectrum_peptide_count(spectrum_id) | |
843 peptide_locations = genomic_mapping_count(peptide) | |
844 if args.debug: | |
845 print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr) | |
846 uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]' | |
847 ts = time() | |
848 proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd, | |
849 '%s_%s' % (acc,scan_name), | |
850 1000,strand, | |
851 blockCount,blockSizes,blockStarts, | |
852 acc,peptide,uniqueness,args.genomeReference, | |
853 charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass, | |
854 mods=mods if mods else '.', psmRank=rank) | |
855 proBED.add_entry(proBEDEntry) | |
856 te = time() | |
857 add_time('add_probed',te - ts) | |
858 if len(ref_prot) != len(peptide): | |
859 continue | |
860 ts = time() | |
861 probam_dict['NH'] = peptide_locations | |
862 probam_dict['XO'] = uniqueness | |
863 probam_dict['XL'] = peptide_locations | |
864 probam_dict['XP'] = peptide | |
865 probam_dict['YP'] = acc | |
866 probam_dict['XC'] = charge | |
867 probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass) | |
868 probam_dict['XR'] = ref_prot # ? dbSequence | |
869 probam_dict['YA'] = aa_post | |
870 probam_dict['YB'] = aa_pre | |
871 probam_dict['XM'] = mods if mods else '*' | |
872 flag = 16 if strand == '-' else 0 | |
873 if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="": | |
874 flag += 256 | |
875 probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons]) | |
876 ## check for variation from ref_cds | |
877 pep_cds = get_variant_cds(exons,ref_prot,peptide,ref_cds) | |
878 peptide_type = '*' | |
879 ## XG classify peptide | |
880 probam_dict['XG'] = get_peptide_type(exons) | |
881 ## probam_dict['MD'] = peptide | |
882 | |
883 ## FIX SAM sequence is forward strand | |
884 seq = pep_cds if strand == '+' else reverse_complement(pep_cds) | |
885 ## cigar based on plus strand | |
886 cigar = '' | |
887 if strand == '+': | |
888 blkStarts = blockStarts | |
889 blkSizes = blockSizes | |
890 else: | |
891 blkStarts = [x for x in reversed(blockStarts)] | |
892 blkSizes = [x for x in reversed(blockSizes)] | |
893 for j in range(blockCount): | |
894 if j > 0: | |
895 intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1]) | |
896 if intron > 0: | |
897 cigar += '%dN' % intron | |
898 cigar += '%dM' % blkSizes[j] | |
899 ## Mods TODO | |
900 proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1, | |
901 cigar=cigar,seq=seq,optional=probam_dict) | |
902 proBAM.add_entry(proBAMEntry) | |
903 te = time() | |
904 add_time('add_probam',te - ts) | |
905 | |
906 if args.debug: | |
907 print('%s' % probam_dict, file=sys.stderr) | |
908 | |
909 if args.limit and i >= args.limit: | |
910 break | |
911 if args.probed: | |
912 ts = time() | |
913 with open(args.probed,'w') as fh: | |
914 proBED.write(fh) | |
915 te = time() | |
916 add_time('write_probed',te - ts) | |
917 if args.prosam or args.probam: | |
918 samfile = args.prosam if args.prosam else 'temp.sam' | |
919 ts = time() | |
920 with open(samfile,'w') as fh: | |
921 proBAM.write(fh) | |
922 te = time() | |
923 add_time('write_prosam',te - ts) | |
924 if args.probam: | |
925 ts = time() | |
926 bamfile = args.prosam.replace('.sam','.bam') | |
927 pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False) | |
928 te = time() | |
929 add_time('write_probam',te - ts) | |
930 pysam.index(args.probam) | |
931 | |
932 print('\n%s\n' % str(timings), file=sys.stderr) | |
933 | |
934 if __name__ == "__main__": | |
935 __main__() |