comparison profmt.py @ 0:492f98d89e26 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
author jjohnson
date Tue, 10 Apr 2018 09:57:49 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:492f98d89e26
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2016, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 import sys,re
16 from operator import itemgetter, attrgetter
17 from twobitreader import TwoBitFile
18
19 """
20 1 QNAME string spectrum name *
21 2 FLAG int bitwise FLAG (see further) *
22 3 RNAME string reference sequence name *
23 4 POS int 1-based lefmost mapping position 0
24 5 MAPQ int unused in proBAM 255
25 6 CIGAR string extended cigar string (see further) *
26 7 RNEXT string unused in proBAM *
27 8 PNEXT int unused in proBAM 0
28 9 TLEN int unused in proBAM 0
29 10 SEQ string coding sequence *
30 11 QUAL string unused in proBAM *
31 """
32 """
33 bit description FLAG
34 0x00 peptide maps to the forward strand 0
35 0x10 peptide maps to the reverse strand 16
36 0x100 peptide is not the rank=1 peptide for the spectrum 256
37 0x400 decoy peptide 1024
38 0x4 unmapped peptide 4
39 """
40
41 """
42 tag type description
43 --- ---- -----------
44 NH int number of genomic locations to which the peptide sequence maps
45 XL int number of peptides to which the spectrum maps
46 XP string peptide sequence
47 XR string reference peptide sequence
48 XS float PSM score
49 XQ float PSM q-value PSM FDR (i.e. q-value or 1-PEP).
50 XC int peptide charge
51 XA int Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
52 XM string Modification(s): semicolon seperated list of position,modName
53 XN int number of missed cleavages
54 XT int tryptic state: 0:non-tryptic 1:semi-tryptic 2:tryptic
55 XG string peptide type: N:normal peptide V:variant peptide J:novel junction peptide D:decoy peptide U:unmappped
56 XB string Semicolon-separated list of mass in the following format: massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
57 XE int
58 XF string Reading frame of the peptide (0, 1, 2) (See section 4.4.6).
59 XG char Peptide type (see Table 6 and Figure 1)
60 XI float Peptide intensity
61 XO string This field indicates the uniqueness of the peptide mapping
62 XU string
63
64
65 NH i NH:i:1
66 XO Z XO:Z:unique
67 XL i XL:i:1
68 XP Z XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR
69 YP Z YP:Z:ENSP00000362463_rs4746:E111A
70 XF Z XF:Z:1,1
71 XI f XI:f:*
72 XB f XB:f:0.70082940064
73 XR Z XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR
74 YB Z YB:Z:RK
75 YA Z YA:Z:GF
76 XS f XS:f:73.1426
77 XQ f XQ:f:0
78 XC i XC:i:3
79 XA i XA:i:0
80 XM Z XM:Z:*
81 XN i XN:i:0
82 XT i XT:i:2
83 XE i XE:i:1
84 XG Z XG:Z:V
85 XU Z klc_070108x_PH_P7_COLO_205_D13.pepXML
86
87 NH:i:1
88 XO:Z:unique
89 XL:i:1
90 XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR
91 YP:Z:ENSP00000362463_rs4746:E111A
92 XF:Z:1,1
93 XI:f:*
94 XB:f:0.70082940064
95 XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR
96 YB:Z:RK
97 YA:Z:GF
98 XS:f:73.1426
99 XQ:f:0
100 XC:i:3
101 XA:i:0
102 XM:Z:*
103 XN:i:0
104 XT:i:2
105 XE:i:1
106 XG:Z:V
107 XU:Z:klc_070108x_PH_P7_COLO_205_D13.pepXML
108
109
110 NH:i:*
111 XO:Z:unique
112 XL:i:1
113 XP:Z:ATLELTHNWGTEDDATQSYHNGNSDPR
114 YP:Z:ENSP00000362463_rs4746:E111A
115 XF:Z:1,1
116 XI:f:*
117 XB:f:0.70082940064
118 XR:Z:ATLELTHNWGTEDDETQSYHNGNSDPR
119 YB:Z:RK
120 YA:Z:GF
121 XS:f:73.1426
122 XQ:f:0
123 XC:i:3
124 XA:i:0
125 XM:Z:*
126 XN:i:0
127 XT:i:2
128 XE:i:1
129 XG:Z:V
130 XU:Z:klc_070108x_PH_P7_COLO_205_D13.pepXML
131 """
132
133
134 PROBAM_TAGS = ['NH', 'XO', 'XL', 'XP', 'YP', 'XF', 'XI', 'XB', 'XR', 'YB', 'YA', 'XS', 'XQ', 'XC', 'XA', 'XM', 'XN', 'XT', 'XE', 'XG', 'XU']
135
136
137 PROBAM_TYTPES = {
138 'NH' : 'i', #number of genomic locations to which the peptide sequence maps
139 'XO' : 'Z', #uniqueness of the peptide mapping
140 'XL' : 'i', #number of peptides to which the spectrum maps
141 'XP' : 'Z', #peptide sequence
142 'YP' : 'Z', #Protein accession ID from the original search result
143 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
144 'XI' : 'f', #Peptide intensity
145 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
146 'XR' : 'Z', #reference peptide sequence
147 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
148 'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
149 'XS' : 'f', #PSM score
150 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
151 'XC' : 'i', #peptide charge
152 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
153 'XM' : 'Z', #Modifications
154 'XN' : 'i', #Number of missed cleavages in the peptide (XP)
155 'XT' : 'i', #Enzyme specificity
156 'XE' : 'i', #Enzyme used in the experiment
157 'XG' : 'A', #Peptide type
158 'XU' : 'Z', #URI
159 }
160
161
162 PROBAM_DEFAULTS = {
163 'NH' : -1, #number of genomic locations to which the peptide sequence maps
164 'XO' : '*', #uniqueness of the peptide mapping
165 'XL' : -1, #number of peptides to which the spectrum maps
166 'XP' : '*', #peptide sequence
167 'YP' : '*', #Protein accession ID from the original search result
168 'XF' : '*', #Reading frame of the peptide (0, 1, 2)
169 'XI' : -1, #Peptide intensity
170 'XB' : '*', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
171 'XR' : '*', #reference peptide sequence
172 'YB' : '*', #Preceding amino acids (2 AA, B stands for before).
173 'YA' : '*', #Following amino acids (2 AA, A stands for after).
174 'XS' : -1, #PSM score
175 'XQ' : -1, #PSM FDR (i.e. q-value or 1-PEP).
176 'XC' : -1, #peptide charge
177 'XA' : -1, #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown;
178 'XM' : '*', #Modifications
179 'XN' : -1, #Number of missed cleavages in the peptide (XP)
180 'XT' : -1, #Enzyme specificity
181 'XE' : -1, #Enzyme used in the experiment
182 'XG' : '*', #Peptide type
183 'XU' : '*', #URI
184 }
185
186 def cmp_alphanumeric(s1,s2):
187 if s1 == s2:
188 return 0
189 a1 = re.findall("\d+|[a-zA-Z]+",s1)
190 a2 = re.findall("\d+|[a-zA-Z]+",s2)
191 for i in range(min(len(a1),len(a2))):
192 if a1[i] == a2[i]:
193 continue
194 if a1[i].isdigit() and a2[i].isdigit():
195 return int(a1[i]) - int(a2[i])
196 return 1 if a1[i] > a2[i] else -1
197 return len(a1) - len(a2)
198
199
200 def sort_chrom_names(names):
201 rnames = sorted(names,cmp=cmp_alphanumeric)
202 if 'chrM' in rnames:
203 rnames.remove('chrM')
204 rnames.insert(0,'chrM')
205 if 'MT' in rnames:
206 rnames.remove('MT')
207 rnames.append('MT')
208 return rnames
209
210 def as_int_list(obj):
211 if obj is None:
212 return None
213 if isinstance(obj, list):
214 return [int(x) for x in obj]
215 elif isinstance(obj, str):
216 return [int(x) for x in obj.split(',')]
217 else: # python2 unicode?
218 return [int(x) for x in str(obj).split(',')]
219
220
221 class ProBEDEntry (object):
222 def __init__(self, chrom, chromStart, chromEnd, name, score, strand,
223 blockCount, blockSizes, blockStarts,
224 protacc, peptide, uniqueness, genomeReference,
225 psmScore='.', fdr='.', mods='.', charge='.',
226 expMassToCharge='.', calcMassToCharge='.',
227 psmRank='.', datasetID='.', uri='.'):
228 self.chrom = chrom
229 self.chromStart = int(chromStart)
230 self.chromEnd = int(chromEnd)
231 self.name = name
232 self.score = int(score) if score is not None else 0
233 self.strand = '-' if str(strand).startswith('-') else '+'
234 self.thickStart = self.chromStart
235 self.thickEnd = self.chromEnd
236 self.itemRgb = '0'
237 self.blockCount = int(blockCount)
238 self.blockSizes = as_int_list(blockSizes)
239 self.blockStarts = as_int_list(blockStarts)
240 self.protacc = protacc
241 self.peptide = peptide
242 self.uniqueness = uniqueness
243 self.genomeReference = genomeReference
244 self.psmScore = psmScore
245 self.fdr = fdr
246 self.mods = mods
247 self.charge = charge
248 self.expMassToCharge = expMassToCharge
249 self.calcMassToCharge = calcMassToCharge
250 self.psmRank = psmRank
251 self.datasetID = datasetID
252 self.uri = uri
253
254 def __str__(self):
255 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
256 (self.chrom, self.chromStart, self.chromEnd,
257 self.name, self.score, self.strand,
258 self.thickStart, self.thickEnd, self.itemRgb,
259 self.blockCount, self.blockSizes, self.blockStarts,
260 self.protacc, self.peptide, self.uniqueness,
261 self.genomeReference,
262 self.psmScore, self.fdr, self.mods,
263 self.charge, self.expMassToCharge, self.calcMassToCharge,
264 self.psmRank, self.datasetID, self.uri)
265
266
267 class ProBED ( object ):
268 def __init__(self,species=None,assembly=None,comments=[]):
269 self.species = species
270 self.assembly = assembly
271 self.comments = comments
272 self.entries = dict()
273
274 def add_entry(self,entry):
275 if not entry.chrom in self.entries:
276 self.entries[entry.chrom] = []
277 self.entries[entry.chrom].append(entry)
278
279 def write(self,fh):
280 rnames = sort_chrom_names(self.entries.keys())
281 for sn in rnames:
282 if sn not in self.entries:
283 continue
284 ##for pbe in sorted(self.entries[sn], key=lambda probam_entry: probam_entry.pos):
285 for pbe in sorted(self.entries[sn], key=attrgetter('chromStart','chromEnd')):
286 fh.write('%s\n' % str(pbe))
287
288
289 class ProBAMEntry (object):
290 def __init__(self, qname='', flag=0, rname='', pos=0, mapq=255, cigar='', rnext='*', pnext='0', tlen='0', seq='*', qual='*', optional=PROBAM_DEFAULTS):
291 self.qname = qname
292 self.flag = flag
293 self.rname = rname
294 self.pos = pos
295 self.mapq = mapq
296 self.cigar = cigar
297 self.rnext = rnext
298 self.pnext = pnext
299 self.tlen = tlen
300 self.seq = seq
301 self.qual = qual
302 self.optional = optional
303 def __str__(self):
304 opt_cols = '\t%s' % '\t'.join(['%s:%s:%s' % (t,PROBAM_TYTPES[t],self.optional[t]) for t in PROBAM_TAGS]) if self.optional else ''
305 return '%s\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s%s' % (
306 self.qname,self.flag,self.rname,self.pos,self.mapq,self.cigar,
307 str(self.rnext) if self.rnext else '',
308 str(self.pnext) if self.pnext else '',
309 str(self.tlen) if self.tlen else '',
310 self.seq,
311 self.qual, opt_cols)
312 def add_optional(self,tag,value):
313 self.optional[tag] = value
314
315
316 class ProBAM ( object ):
317 def __init__(self,species=None,assembly=None,seqlens={},comments=[]):
318 self.species = species
319 self.assembly = assembly
320 self.seqlens = seqlens
321 self.comments = comments
322 self.entries = dict()
323 self.opt_columns = set()
324 self.rg = []
325
326 def add_entry(self,pb_entry):
327 if not pb_entry.rname in self.entries:
328 self.entries[pb_entry.rname] = []
329 self.entries[pb_entry.rname].append(pb_entry)
330 if pb_entry.optional:
331 self.opt_columns | set(pb_entry.optional.keys())
332
333 def add_entry_from_bed(self,bed_entry,optional=dict()):
334 if bed_entry.pep:
335 optional['XP:Z'] = bed_entry.pep
336 qname=bed_entry.name
337 flag = 0 if bed_entry.strand == '+' else 16
338 rname = bed_entry.chrom
339 pos = bed_entry.chromStart + 1
340 cigar = bed_entry.get_cigar()
341 seq = bed_entry.get_spliced_seq(strand='+') if bed_entry.seq else '*'
342 pb_entry = ProBAMEntry(qname=qname, flag=flag, rname=rname, pos=pos,cigar=cigar,seq=seq,optional=optional)
343 self.add_entry(pb_entry)
344 ## print >> sys.stderr,('add_entry_from_bed:%s\n %s\n %s' % (self.entries.keys(),bed_entry,pb_entry))
345
346 def write(self,fh):
347 fh.write('@HD VN:1.0 SO:coordinate\n')
348 rnames = sort_chrom_names(self.seqlens.keys())
349 for sn in rnames:
350 fh.write('@SQ\tSN:%s\tLN:%d\n' % (sn,self.seqlens[sn]))
351 for rg in self.rg:
352 fh.write('@RG\tID:%s\n' % (rg))
353 fh.write('@PG\tID:SampleSpecificGenerator\n')
354 for comment in self.comments:
355 fh.write('@CO\t%s\n' % comment)
356 for sn in rnames:
357 if sn not in self.entries:
358 continue
359 ##for pbe in sorted(self.entries[sn], key=lambda probam_entry: probam_entry.pos):
360 for pbe in sorted(self.entries[sn], key=attrgetter('pos')):
361 fh.write('%s\n' % str(pbe))
362