Mercurial > repos > galaxyp > unipept
diff unipept.py @ 0:b33376bf2290 draft
planemo upload for repository http://unipept.ugent.be/apidocs commit 704a0414b2547298b2596219998002491505d927-dirty
author | galaxyp |
---|---|
date | Wed, 24 Oct 2018 14:45:18 -0400 |
parents | |
children | b65ee881ca64 |
line wrap: on
line diff
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2015, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------

Query the Unipept API (http://unipept.ugent.be/apidocs) with peptides read
from tabular, fasta, mzIdentML, or pepXML input (or command-line args) and
write taxonomic/functional results as json, tsv, and/or csv.

Unipept applications and the outputs they support:
    pept2taxa   json
    pept2lca    json
    pept2prot
    pept2ec     ecjson ec
    pept2go     go
    pept2funct  go ec
    peptinfo    json ecjson ec go
"""

import json
import optparse
import re
import sys
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


def warn_err(msg, exit_code=1):
    """Write msg to stderr and, when exit_code is truthy, exit with it."""
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)


# GO namespaces used to group "domains"-style responses.
go_types = ['biological process', 'molecular function', 'cellular component']

# Human-readable names for top-level (x) and second-level (x.y) EC classes,
# used to label nodes in the EC hierarchy json.
ec_name_dict = {
    '1': 'Oxidoreductase',
    '1.1': 'act on the CH-OH group of donors',
    '1.2': 'act on the aldehyde or oxo group of donors',
    '1.3': 'act on the CH-CH group of donors',
    '1.4': 'act on the CH-NH2 group of donors',
    '1.5': 'act on CH-NH group of donors',
    '1.6': 'act on NADH or NADPH',
    '1.7': 'act on other nitrogenous compounds as donors',
    '1.8': 'act on a sulfur group of donors',
    '1.9': 'act on a heme group of donors',
    '1.10': 'act on diphenols and related substances as donors',
    '1.11': 'act on peroxide as an acceptor -- peroxidases',
    '1.12': 'act on hydrogen as a donor',
    '1.13': 'act on single donors with incorporation of molecular oxygen',
    '1.14': 'act on paired donors with incorporation of molecular oxygen',
    '1.15': 'act on superoxide radicals as acceptors',
    '1.16': 'oxidize metal ions',
    '1.17': 'act on CH or CH2 groups',
    '1.18': 'act on iron-sulfur proteins as donors',
    '1.19': 'act on reduced flavodoxin as donor',
    '1.20': 'act on phosphorus or arsenic as donors',
    '1.21': 'act on X-H and Y-H to form an X-Y bond',
    '1.97': 'other oxidoreductases',
    '2': 'Transferase',
    '2.1': 'transfer one-carbon groups, Methylase',
    '2.2': 'transfer aldehyde or ketone groups',
    '2.3': 'acyltransferases',
    '2.4': 'glycosyltransferases',
    '2.5': 'transfer alkyl or aryl groups, other than methyl groups',
    '2.6': 'transfer nitrogenous groups',
    '2.7': 'transfer phosphorus-containing groups',
    '2.8': 'transfer sulfur-containing groups',
    '2.9': 'transfer selenium-containing groups',
    '3': 'Hydrolase',
    '3.1': 'act on ester bonds',
    '3.2': 'act on sugars - glycosylases',
    '3.3': 'act on ether bonds',
    '3.4': 'act on peptide bonds - Peptidase',
    '3.5': 'act on carbon-nitrogen bonds, other than peptide bonds',
    '3.6': 'act on acid anhydrides',
    '3.7': 'act on carbon-carbon bonds',
    '3.8': 'act on halide bonds',
    '3.9': 'act on phosphorus-nitrogen bonds',
    '3.10': 'act on sulfur-nitrogen bonds',
    '3.11': 'act on carbon-phosphorus bonds',
    '3.12': 'act on sulfur-sulfur bonds',
    '3.13': 'act on carbon-sulfur bonds',
    '4': 'Lyase',
    '4.1': 'carbon-carbon lyases',
    '4.2': 'carbon-oxygen lyases',
    '4.3': 'carbon-nitrogen lyases',
    '4.4': 'carbon-sulfur lyases',
    '4.5': 'carbon-halide lyases',
    '4.6': 'phosphorus-oxygen lyases',
    '5': 'Isomerase',
    '5.1': 'racemases and epimerases',
    '5.2': 'cis-trans-isomerases',
    '5.3': 'intramolecular oxidoreductases',
    '5.4': 'intramolecular transferases -- mutases',
    '5.5': 'intramolecular lyases',
    '5.99': 'other isomerases',
    '6': 'Ligase',
    '6.1': 'form carbon-oxygen bonds',
    '6.2': 'form carbon-sulfur bonds',
    '6.3': 'form carbon-nitrogen bonds',
    '6.4': 'form carbon-carbon bonds',
    '6.5': 'form phosphoric ester bonds',
    '6.6': 'form nitrogen-metal bonds',
}

# Column orderings for the various Unipept applications / output formats.
pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']
pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma']
pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids']
pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']


def get_fasta_entries(fp):
    """Yield (header_line, sequence) tuples from an iterable of fasta lines."""
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name:
                yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name:
        yield (name, ''.join(seq))


def __main__():
    version = '2.0'
    # A valid peptide is one or more uppercase amino-acid letters (incl. X/Z/B-style ambiguity codes used by Unipept).
    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'

    def read_tabular(filepath, col):
        """Return peptides from column col (0-based) of a TAB-separated file."""
        peptides = []
        with open(filepath) as fp:
            for i, line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                peptide = fields[col]
                if not re.match(pep_pat, peptide):
                    # invalid_ec is None unless --strict, so by default this only warns
                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def read_fasta(filepath):
        """Return peptide sequences from a fasta file."""
        peptides = []
        with open(filepath) as fp:
            for id, peptide in get_fasta_entries(fp):
                if not re.match(pep_pat, peptide):
                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def read_mzid(fp):
        """Return peptide sequences from an mzIdentML file (PeptideSequence elements)."""
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('PeptideSequence', elem.tag):
                    peptides.append(elem.text)
        return peptides

    def read_pepxml(fp):
        """Return peptide sequences from a pepXML file (search_hit/@peptide)."""
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('search_hit', elem.tag):
                    peptides.append(elem.get('peptide'))
        return peptides

    def best_match(peptide, matches):
        """Return a copy of the most taxon-specific match for peptide, or None."""
        if not matches:
            return None
        elif len(matches) == 1:
            return matches[0].copy()
        else:
            # find the most specific match (peptide is always the first column order field)
            for col in reversed(pept2lca_extra_column_order[1:]):
                col_id = col + "_id" if options.extra else col
                for match in matches:
                    if 'taxon_rank' in match and match['taxon_rank'] == col:
                        return match.copy()
                    if col_id in match and match[col_id]:
                        return match.copy()
        return None

    def get_taxon_json(resp):
        """Build the Unipept treeview-style taxonomy tree (rooted at taxon 1) from resp."""
        found_keys = set()
        for i, pdict in enumerate(resp):
            found_keys |= set(pdict.keys())
        taxa_cols = []
        # Ranks present in the response, most specific first.
        for col in pept2lca_extra_column_order[-1:0:-1]:
            if col + '_id' in found_keys:
                taxa_cols.append(col)
        id_to_node = dict()

        def get_node(id, name, rank, child, seq):
            """Get-or-create node id, bump counts, attach child/sequence once."""
            if id not in id_to_node:
                data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []}
                node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node

        root = get_node(1, 'root', 'no rank', None, None)
        for i, pdict in enumerate(resp):
            sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
            seq = sequence
            child = None
            # Walk from most specific rank upward, linking each level to the one below.
            for col in taxa_cols:
                col_id = col + '_id'
                if col_id in pdict and pdict.get(col_id):
                    col_name = col if col in found_keys else col + '_name'
                    child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq)
                    seq = None  # record the sequence only on the deepest node
            if child:
                get_node(1, 'root', 'no rank', child, None)
        return root

    def get_ec_json(resp):
        """Build an EC-number hierarchy tree (rooted at '-.-.-.-') from resp."""
        ecMap = dict()
        for pdict in resp:
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_number = ec['ec_number']
                    if ec_number not in ecMap:
                        ecMap[ec_number] = []
                    ecMap[ec_number].append(pdict)

        def get_ids(ec):
            """Return ec and all of its ancestor prefixes, e.g. 1.2.3.4 -> [1.2.3.4, 1.2.3, 1.2, 1]."""
            ids = []
            i = len(ec)
            while i >= 0:
                ids.append(ec[:i])
                i = ec.rfind('.', 0, i - 1)
            return ids

        id_to_node = dict()

        def get_node(id, name, child, seq):
            """Get-or-create node id, bump counts, attach child/sequence once."""
            if id not in id_to_node:
                data = {'count': 0, 'self_count': 0, 'sequences': []}
                node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node

        root = get_node(0, '-.-.-.-', None, None)
        # Pre-create the six top-level EC classes so they always appear.
        for i in range(1, 7):
            child = get_node(str(i), '%s\n%s' % (str(i), ec_name_dict[str(i)]), None, None)
            get_node(0, '-.-.-.-', child, None)
        for i, pdict in enumerate(resp):
            sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
            seq = sequence
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    child = None
                    protein_count = ec['protein_count']
                    ec_number = ec['ec_number']
                    for ec_id in get_ids(ec_number):
                        child = get_node(ec_id, ec_id, child, seq)
                        seq = None
                    if child:
                        get_node(0, '-.-.-.-', child, None)
        return root

    def get_taxon_dict(resp, column_order, extra=False, names=False):
        """Map each peptide in resp to its taxonomy values; return (taxa, column_names)."""
        found_keys = set()
        results = []
        for i, pdict in enumerate(resp):
            results.append(pdict)
            found_keys |= set(pdict.keys())
        column_names = []
        column_keys = []
        for col in column_order:
            if col in found_keys:
                column_names.append(col)
                column_keys.append(col)
            elif names:
                col_id = col + '_id'
                col_name = col + '_name'
                if extra:
                    if col_id in found_keys:
                        column_names.append(col_id)
                        column_keys.append(col_id)
                    if names:
                        if col_name in found_keys:
                            column_names.append(col)
                            column_keys.append(col_name)
                else:
                    if col + '_name' in found_keys:
                        column_names.append(col)
                        column_keys.append(col + '_name')
            elif col + '_id' in found_keys:
                column_names.append(col)
                column_keys.append(col + '_id')
        taxa = dict()  # peptide : [taxonomy]
        for i, pdict in enumerate(results):
            peptide = pdict['peptide'] if 'peptide' in pdict else None
            if peptide and peptide not in taxa:
                vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
                taxa[peptide] = vals
        return (taxa, column_names)

    def get_ec_dict(resp, extra=False):
        """Map each peptide to comma-joined EC numbers/counts (and names with extra)."""
        ec_cols = ['ec_numbers', 'ec_protein_counts']
        if extra:
            ec_cols.append('ec_names')
        ec_dict = dict()
        for i, pdict in enumerate(resp):
            peptide = pdict['peptide']
            ec_numbers = []
            protein_counts = []
            ec_names = []
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_numbers.append(ec['ec_number'])
                    protein_counts.append(str(ec['protein_count']))
                    if extra:
                        ec_names.append(ec['name'] if 'name' in ec else '')
            vals = [','.join(ec_numbers), ','.join(protein_counts)]
            if extra:
                vals.append(','.join(ec_names))
            ec_dict[peptide] = vals
        return (ec_dict, ec_cols)

    def get_go_dict(resp, extra=False):
        """Map each peptide to comma-joined GO terms/counts (and names with extra)."""
        go_cols = ['go_terms', 'go_protein_counts']
        if extra:
            go_cols.append('go_names')
        go_dict = dict()
        for i, pdict in enumerate(resp):
            peptide = pdict['peptide']
            go_terms = []
            protein_counts = []
            go_names = []
            if 'go' in pdict:
                for go in pdict['go']:
                    if 'go_term' in go:
                        go_terms.append(go['go_term'])
                        protein_counts.append(str(go['protein_count']))
                        if extra:
                            go_names.append(go['name'] if 'name' in go else '')
                    else:
                        # --domains responses group GO entries by namespace
                        for go_type in go_types:
                            if go_type in go:
                                for _go in go[go_type]:
                                    go_terms.append(_go['go_term'])
                                    protein_counts.append(str(_go['protein_count']))
                                    if extra:
                                        go_names.append(_go['name'] if 'name' in _go else '')
            vals = [','.join(go_terms), ','.join(protein_counts)]
            if extra:
                vals.append(','.join(go_names))
            go_dict[peptide] = vals
        return (go_dict, go_cols)

    def write_ec_table(outfile, resp, column_order):
        """Write one tsv row per (peptide, EC entry) to outfile."""
        with open(outfile, 'w') as fh:
            for i, pdict in enumerate(resp):
                if 'ec' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for ec in pdict['ec']:
                        vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
                        fh.write('%s\n' % '\t'.join(tvals + vals))

    def write_go_table(outfile, resp, column_order):
        """Write one tsv row per (peptide, GO entry) to outfile."""
        with open(outfile, 'w') as fh:
            for i, pdict in enumerate(resp):
                if 'go' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for go in pdict['go']:
                        if 'go_term' in go:
                            vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
                            fh.write('%s\n' % '\t'.join(tvals + vals))
                        else:
                            # --domains responses group GO entries by namespace; append the namespace column
                            for go_type in go_types:
                                if go_type in go:
                                    for _go in go[go_type]:
                                        vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
                                        vals.append(go_type)
                                        fh.write('%s\n' % '\t'.join(tvals + vals))

    # Parse Command Line
    parser = optparse.OptionParser()
    # unipept API choice
    parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
                      help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo')
    # input files
    parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column')
    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences')
    parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences')
    parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences')
    parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences')
    # Unipept Flags
    parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records')
    parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor')
    parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor')
    parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component')
    parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request')
    # output fields
    parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs')
    # Warn vs Error Flag
    parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide')
    # output files
    parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
    parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
    parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
    parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
    parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
    parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
    parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
    parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches')
    parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
    # debug
    parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging')
    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
    (options, args) = parser.parse_args()
    if options.version:
        print("%s" % version)
        sys.exit(0)
    # With --strict an invalid peptide aborts (exit 2); otherwise warn_err only warns.
    invalid_ec = 2 if options.strict else None
    peptides = []
    # Get peptide sequences
    if options.mzid:
        peptides += read_mzid(options.mzid)
    if options.pepxml:
        peptides += read_pepxml(options.pepxml)
    if options.tabular:
        peptides += read_tabular(options.tabular, options.column)
    if options.fasta:
        peptides += read_fasta(options.fasta)
    if args and len(args) > 0:
        for i, peptide in enumerate(args):
            if not re.match(pep_pat, peptide):
                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
            peptides.append(peptide)
    if len(peptides) < 1:
        warn_err("No peptides input!\n", exit_code=1)
    column_order = pept2lca_column_order
    if options.unipept == 'pept2prot':
        column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
    else:
        if options.extra or options.names:
            column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
        else:
            column_order = pept2lca_column_order
    # Map peptides to tryptic fragments (split after K/R except before P).
    pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
    partToPeps = {}
    for peptide, parts in pepToParts.items():
        if options.debug:
            print("peptide: %s\ttryptic: %s\n" % (peptide, parts))
        for part in parts:
            if len(part) > 50:
                warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
            # Unipept only indexes tryptic peptides of length 5..50.
            if 5 <= len(part) <= 50:
                partToPeps.setdefault(part, []).append(peptide)
    trypticPeptides = list(partToPeps.keys())
    # Query unipept in batches of at most max_request tryptic peptides.
    unipept_resp = []
    idx = list(range(0, len(trypticPeptides), options.max_request))
    idx.append(len(trypticPeptides))
    for i in range(len(idx) - 1):
        post_data = []
        if options.equate_il:
            post_data.append(("equate_il", "true"))
        if options.names or options.json:
            post_data.append(("extra", "true"))
            post_data.append(("names", "true"))
        elif options.extra or options.json:
            post_data.append(("extra", "true"))
        if options.domains:
            post_data.append(("domains", "true"))
        post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i + 1]]]
        if options.debug:
            print("post_data: %s\n" % (str(post_data)))
        params = urllib.parse.urlencode(post_data)
        headers = {'Accept': 'application/json'}
        url = '%s/%s' % (options.url.rstrip('/'), options.unipept)
        if options.get:
            url = '%s.json?%s' % (url, params)
            req = urllib.request.Request(url)
        else:
            url = '%s.json' % (url)
            req = urllib.request.Request(url, headers=headers, data=params.encode('utf-8'))
        if options.debug:
            print("url: %s\n" % (str(url)))
        try:
            resp = urllib.request.urlopen(req)
            if options.debug:
                print("%s %s\n" % (url, str(resp.getcode())))
            if resp.getcode() == 200:
                # Read the response we already have (the original issued a second request here).
                unipept_resp += json.loads(resp.read())
        except Exception as e:
            warn_err('HTTP Error %s\n' % (str(e)), exit_code=None)
    unmatched_peptides = []
    peptideMatches = []
    if options.debug:
        print("unipept response: %s\n" % str(unipept_resp))
    if options.unipept in ['pept2prot', 'pept2taxa']:
        dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'  # should only keep one of these per input peptide
        # multiple entries per trypticPeptide for pept2prot or pept2taxa
        mapping = {}
        for match in unipept_resp:
            mapping.setdefault(match['peptide'], []).append(match)
        for peptide in peptides:
            # Get the intersection of matches to the tryptic parts
            keyToMatch = None
            for part in pepToParts[peptide]:
                if part in mapping:
                    temp = {match[dupkey]: match for match in mapping[part]}
                    if keyToMatch:
                        dkeys = set(keyToMatch.keys()) - set(temp.keys())
                        for k in dkeys:
                            del keyToMatch[k]
                    else:
                        keyToMatch = temp
            if not keyToMatch:
                unmatched_peptides.append(peptide)
            else:
                for key, match in keyToMatch.items():
                    match['tryptic_peptide'] = match['peptide']
                    match['peptide'] = peptide
                    peptideMatches.append(match)
    elif options.unipept in ['pept2lca']:
        # should be one response per trypticPeptide for pept2lca
        respMap = {v['peptide']: v for v in unipept_resp}
        # map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap:
                    matches.append(respMap[part])
            match = best_match(peptide, matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide': longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    else:
        respMap = {v['peptide']: v for v in unipept_resp}
        # map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap and 'total_protein_count' in respMap[part]:
                    matches.append(respMap[part])
            match = best_match(peptide, matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide': longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    resp = peptideMatches
    if options.debug:
        print("\nmapped response: %s\n" % str(resp))
    # Output results
    if not (options.unmatched or options.json or options.tsv or options.csv):
        print(str(resp))
    if options.unmatched:
        with open(options.unmatched, 'w') as outputFile:
            for peptide in peptides:
                if peptide in unmatched_peptides:
                    outputFile.write("%s\n" % peptide)
    if options.json:
        if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
            root = get_taxon_json(resp)
            with open(options.json, 'w') as outputFile:
                outputFile.write(json.dumps(root))
        elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
            with open(options.json, 'w') as outputFile:
                # json.dumps, not str(): repr uses single quotes and is not valid JSON
                outputFile.write(json.dumps(resp))
    if options.ec_json:
        if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
            root = get_ec_json(resp)
            with open(options.ec_json, 'w') as outputFile:
                outputFile.write(json.dumps(root))
    if options.tsv or options.csv:
        rows = []
        column_names = None
        if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
            taxa = None
            ec_dict = None
            go_dict = None
            if options.unipept in ['peptinfo']:
                (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
                (ec_dict, ec_cols) = get_ec_dict(resp, extra=options.extra)
            if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
                (go_dict, go_cols) = get_go_dict(resp, extra=options.extra)
            for i, pdict in enumerate(resp):
                peptide = pdict['peptide']
                total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
                column_names = ['peptide', 'total_protein_count']
                vals = [peptide, total_protein_count]
                if ec_dict:
                    vals += ec_dict[peptide]
                    column_names += ec_cols
                if go_dict:
                    vals += go_dict[peptide]
                    column_names += go_cols
                if taxa:
                    vals += taxa[peptide][1:]
                    column_names += taxon_cols[1:]
                rows.append(vals)
        elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
            (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            column_names = taxon_cols
            # One row per peptide (the original appended taxa.values() twice, duplicating every row).
            rows = list(taxa.values())
        if options.tsv:
            with open(options.tsv, 'w') as outputFile:
                if column_names:
                    outputFile.write("#%s\n" % '\t'.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n" % '\t'.join(vals))
        if options.csv:
            with open(options.csv, 'w') as outputFile:
                if column_names:
                    outputFile.write("%s\n" % ','.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))
    if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
        column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
        write_ec_table(options.ec_tsv, resp, column_order)
    if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
        column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
        write_go_table(options.go_tsv, resp, column_order)


if __name__ == "__main__":
    __main__()