changeset 0:b33376bf2290 draft

planemo upload for repository http://unipept.ugent.be/apidocs commit 704a0414b2547298b2596219998002491505d927-dirty
author galaxyp
date Wed, 24 Oct 2018 14:45:18 -0400
parents
children b65ee881ca64
files test-data/input.fasta test-data/input.tsv test-data/input_bad.fasta test-data/peptide.fa test-data/tryptic.fa test-data/tryptic.tsv unipept.py unipept.xml
diffstat 8 files changed, 1237 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fasta	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,10 @@
+>1
+AIPQLEVARPADAYETAEAYR
+>2
+AAEGGLSR
+>3
+APVLSDSSCK
+>4
+DQIAHEGK
+>5
+ATLTSGAAR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.tsv	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,5 @@
+1	AIPQLEVARPADAYETAEAYR	AIPQLEVARPADAYETAEAYR
+2	AAEGGLSR	AAEGQLSR
+3	APVLSDSSCK	APVLJDSSCK
+4	DQIAHEGK	DQUAHEGK
+5	ATLTSGAAR	ATLTSGAAR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_bad.fasta	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,10 @@
+>1
+AIPQLEVARPADAYETAEAYR
+>2
+AAEGQLSR
+>3
+APVLJDSSCK
+>4
+DQUAHEGK
+>5
+ATLTSGAAR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/peptide.fa	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,9 @@
+>tr|G3RWV1|G3RWV1_GORGO
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV
+>sp|Q9BYE9|CDHR2_HUMAN
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAVGSASVQVLVRVSALVDYERQTAMAV
+>tr|H2QS28|H2QS28_PANTR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG
+SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSGLVDYERQTAMAV
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.fa	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,19 @@
+>trypticQTAMAV
+QTAMAV
+>trypticAAGSASVQVLVR
+AAGSASVQVLVR
+>trypticAVGSASVQVLVR
+AVGSASVQVLVR
+>trypticIPIDDLTMVVYDPDK
+IPIDDLTMVVYDPDK
+>trypticVSGLVDYER
+VSGLVDYER
+>trypticVSALVDYER
+VSALVDYER
+>trypticGSNGTFLLSLGGPDAEAFSVSPER
+GSNGTFLLSLGGPDAEAFSVSPER
+>trypticVMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tryptic.tsv	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,8 @@
+1	QTAMAV	QTAMAV
+2	AAGSASVQVLVR	AAGSASJQVLVR
+3	AVGSASVQVLVR	AVGSASVQVLVR
+4	IPIDDLTMVVYDPDK	IPIDDLTMVVYDPDK
+5	GSNGTFLLSLGGPDAEAFSVSPER	GSNGTFLLSLGGPDAEAFSVSPE
+6	VSGLVDYER	VSGLVDYER
+7	VSALVDYER	VSALVDYER
+8	VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR	VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/unipept.py	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,658 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2015, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+import json
+import logging
+import optparse
+from optparse import OptionParser
+import os
+import sys
+import re
+import urllib
+import urllib2
+
+"""
+pept2taxa	json
+pept2lca	json
+pept2prot	
+pept2ec		ecjson	ec
+pept2go			go
+pept2funct	go	ec
+peptinfo	json ecjson ec go
+
+"""
+
+try:
+    import xml.etree.cElementTree as ET
+except ImportError:
+    import xml.etree.ElementTree as ET
+
+def warn_err(msg, exit_code=1):
+    """
+    Write msg to stderr and optionally terminate the process.
+
+    msg is written verbatim; no newline is appended.
+    exit_code: when truthy the process exits with this status; a falsy value
+    (None or 0) makes the call a plain warning that returns normally.
+    """
+    sys.stderr.write(msg)
+    if not exit_code:
+        return
+    sys.exit(exit_code)
+
+# GO namespaces; used as dictionary keys when a response groups its GO terms
+# by namespace (see get_go_dict / write_go_table).
+go_types = ['biological process', 'molecular function', 'cellular component']
+# Display names for EC classes and subclasses, keyed by the EC number prefix
+# ('1', '1.1', ...).  get_ec_json uses the single-digit entries to label the
+# top-level class nodes of the EC tree.
+ec_name_dict = {
+'1' : 'Oxidoreductase',
+'1.1' : 'act on the CH-OH group of donors',
+'1.2' : 'act on the aldehyde or oxo group of donors',
+'1.3' : 'act on the CH-CH group of donors',
+'1.4' : 'act on the CH-NH2 group of donors',
+'1.5' : 'act on CH-NH group of donors',
+'1.6' : 'act on NADH or NADPH',
+'1.7' : 'act on other nitrogenous compounds as donors',
+'1.8' : 'act on a sulfur group of donors',
+'1.9' : 'act on a heme group of donors',
+'1.10' : 'act on diphenols and related substances as donors',
+'1.11' : 'act on peroxide as an acceptor -- peroxidases',
+'1.12' : 'act on hydrogen as a donor',
+'1.13' : 'act on single donors with incorporation of molecular oxygen',
+'1.14' : 'act on paired donors with incorporation of molecular oxygen',
+'1.15' : 'act on superoxide radicals as acceptors',
+'1.16' : 'oxidize metal ions',
+'1.17' : 'act on CH or CH2 groups',
+'1.18' : 'act on iron-sulfur proteins as donors',
+'1.19' : 'act on reduced flavodoxin as donor',
+'1.20' : 'act on phosphorus or arsenic as donors',
+'1.21' : 'act on X-H and Y-H to form an X-Y bond',
+'1.97' : 'other oxidoreductases',
+'2' : 'Transferase',
+'2.1' : 'transfer one-carbon groups, Methylase',
+'2.2' : 'transfer aldehyde or ketone groups',
+'2.3' : 'acyltransferases',
+'2.4' : 'glycosyltransferases',
+'2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
+'2.6' : 'transfer nitrogenous groups',
+'2.7' : 'transfer phosphorus-containing groups',
+'2.8' : 'transfer sulfur-containing groups',
+'2.9' : 'transfer selenium-containing groups',
+'3' : 'Hydrolase',
+'3.1' : 'act on ester bonds',
+'3.2' : 'act on sugars - glycosylases',
+'3.3' : 'act on ether bonds',
+'3.4' : 'act on peptide bonds - Peptidase',
+'3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
+'3.6' : 'act on acid anhydrides',
+'3.7' : 'act on carbon-carbon bonds',
+'3.8' : 'act on halide bonds',
+'3.9' : 'act on phosphorus-nitrogen bonds',
+'3.10' : 'act on sulfur-nitrogen bonds',
+'3.11' : 'act on carbon-phosphorus bonds',
+'3.12' : 'act on sulfur-sulfur bonds',
+'3.13' : 'act on carbon-sulfur bonds',
+'4' : 'Lyase',
+'4.1' : 'carbon-carbon lyases',
+'4.2' : 'carbon-oxygen lyases',
+'4.3' : 'carbon-nitrogen lyases',
+'4.4' : 'carbon-sulfur lyases',
+'4.5' : 'carbon-halide lyases',
+'4.6' : 'phosphorus-oxygen lyases',
+'5' : 'Isomerase',
+'5.1' : 'racemases and epimerases',
+'5.2' : 'cis-trans-isomerases',
+'5.3' : 'intramolecular oxidoreductases',
+'5.4' : 'intramolecular transferases -- mutases',
+'5.5' : 'intramolecular lyases',
+'5.99' : 'other isomerases',
+'6' : 'Ligase',
+'6.1' : 'form carbon-oxygen bonds',
+'6.2' : 'form carbon-sulfur bonds',
+'6.3' : 'form carbon-nitrogen bonds',
+'6.4' : 'form carbon-carbon bonds',
+'6.5' : 'form phosphoric ester bonds',
+'6.6' : 'form nitrogen-metal bonds',
+}
+# Column orders for the tabular outputs.  'peptide' is always the first field.
+# Default pept2lca columns: the most specific taxon only.
+pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
+# Lineage columns, ordered from the least specific rank (superkingdom) to the
+# most specific (forma); best_match and get_taxon_json walk this list in reverse.
+pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
+# Taxon columns followed by the lineage columns (without repeating 'peptide').
+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
+pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
+pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+# For the EC and GO tables each order is a pair: [peptide-level fields,
+# per-annotation fields]; write_ec_table / write_go_table read the first and
+# last element respectively.
+pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
+pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
+pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
+pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
+# NOTE(review): not referenced anywhere in the visible script.
+pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
+
+def __main__():
+  """
+  Command-line entry point.
+
+  Parses the options, collects peptide sequences from the mzid, pepxml,
+  tabular and fasta inputs and from positional arguments, splits them into
+  tryptic fragments, queries the selected Unipept API call in batches, maps
+  the responses back to the input peptides and writes the requested outputs.
+  """
+  version = '2.0'
+  # letters accepted in a peptide sequence; J, O and U are not in the set
+  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
+
+  def read_tabular(filepath,col):
+    """
+    Read peptide sequences from one column of a tab-separated file.
+
+    filepath: path of the tabular file
+    col: zero-based index of the column that holds the peptide
+    Blank lines and lines starting with '#' are skipped.  A value that does
+    not match pep_pat is reported through warn_err, which exits when
+    invalid_ec is set (--strict); otherwise the value is still returned.
+    A row that has no such column is reported the same way and skipped
+    instead of raising IndexError.
+    Returns the list of peptide strings in file order.
+    """
+    peptides = []
+    with open(filepath) as fp:
+      # 1-based line numbers, so messages match what an editor shows
+      for lineno,line in enumerate(fp, 1):
+        if line.strip() == '' or line.startswith('#'):
+          continue
+        # strip '\r' as well so a CRLF file does not taint the last column
+        fields = line.rstrip('\r\n').split('\t')
+        if not -len(fields) <= col < len(fields):
+          warn_err('line %d of tabular file %s has no column %d\n' % (lineno,filepath,col),exit_code=invalid_ec)
+          continue
+        peptide = fields[col]
+        if not re.match(pep_pat,peptide):
+          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,lineno,col,filepath),exit_code=invalid_ec)
+        peptides.append(peptide)
+    return peptides
+
+  def get_fasta_entries(fp):
+    """
+    Yield (header, sequence) tuples from an iterable of FASTA lines.
+
+    header is the full '>' line with trailing whitespace removed; sequence is
+    the concatenation of the stripped lines that follow it.  Lines before the
+    first header are discarded.
+    """
+    header = None
+    chunks = []
+    for raw in fp:
+      text = raw.rstrip()
+      if not text.startswith(">"):
+        chunks.append(text)
+        continue
+      # a new header closes the previous entry, if there was one
+      if header:
+        yield (header, ''.join(chunks))
+      header = text
+      chunks = []
+    if header:
+      yield (header, ''.join(chunks))
+
+  def read_fasta(filepath):
+    """
+    Read the sequences of a FASTA file as peptides.
+
+    Each sequence that does not match pep_pat is reported through warn_err
+    (which exits when invalid_ec is set) and is still included in the result.
+    Returns the list of sequences in file order.
+    """
+    sequences = []
+    with open(filepath) as handle:
+      for header, sequence in get_fasta_entries(handle):
+        if re.match(pep_pat,sequence) is None:
+          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (sequence,header,filepath),exit_code=invalid_ec)
+        sequences.append(sequence)
+    return sequences
+
+  def read_mzid(fp):
+    """
+    Collect the text of every element whose tag contains 'PeptideSequence'.
+
+    fp: file name or file object accepted by ET.iterparse
+    Returns the element texts in document order; no validation is applied.
+    """
+    return [elem.text
+            for event, elem in ET.iterparse(fp)
+            if event == 'end' and re.search('PeptideSequence',elem.tag)]
+
+  def read_pepxml(fp):
+    """
+    Collect the 'peptide' attribute of every element whose tag contains 'search_hit'.
+
+    fp: file name or file object accepted by ET.iterparse
+    Returns the attribute values in document order (None for a hit without
+    the attribute); no validation is applied.
+    """
+    return [elem.get('peptide')
+            for event, elem in ET.iterparse(fp)
+            if event == 'end' and re.search('search_hit',elem.tag)]
+
+  def best_match(peptide,matches):
+    """
+    Pick the most specific response among the matches of a peptide's tryptic parts.
+
+    peptide: the input peptide (not used for the selection)
+    matches: list of response dicts, one per matched tryptic part
+    Returns a copy of the chosen dict, or None when there is no match or no
+    match carries a usable rank.
+    """
+    if not matches:
+      return None
+    if len(matches) == 1:
+      return matches[0].copy()
+    # walk the ranks from most to least specific; 'peptide' (index 0) is skipped
+    for rank in pept2lca_extra_column_order[:0:-1]:
+      rank_key = rank+"_id" if options.extra else rank
+      for candidate in matches:
+        if candidate.get('taxon_rank') == rank or candidate.get(rank_key):
+          return candidate.copy()
+    return None
+
+  def get_taxon_json(resp):
+    """
+    Build a taxonomy tree (nested dicts) from the mapped responses.
+
+    resp: list of response dicts carrying '<rank>_id' lineage keys
+    Returns the root node; every node is a dict with 'id', 'name',
+    'children' (child nodes), 'kids' (child ids) and 'data' (counts, rank
+    and the sequences assigned to the node).
+    """
+    # union of all keys seen in the response
+    found_keys = set()
+    for i,pdict in enumerate(resp):
+      found_keys |= set(pdict.keys())
+    # ranks present in the response, ordered from most to least specific
+    taxa_cols = []
+    for col in pept2lca_extra_column_order[-1:0:-1]:
+      if col+'_id' in found_keys:
+        taxa_cols.append(col)
+    id_to_node = dict()
+    def get_node(id,name,rank,child,seq):
+      # Fetch or create the node for id.  Every call increments 'count';
+      # a call without a child also increments 'self_count'.
+      if id not in id_to_node:
+        data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
+        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+        id_to_node[id] = node
+      else:
+        node = id_to_node[id]
+      node['data']['count'] += 1
+      if seq is not None and seq not in node['data']['sequences']:
+         node['data']['sequences'].append(seq)
+      if child is None:
+        node['data']['self_count'] += 1
+      elif child['id'] not in node['kids']:
+        # link each child only once
+        node['kids'].append(child['id'])
+        node['children'].append(child)
+      return node
+    # created without a child, so the root starts with count 1 and self_count 1
+    root = get_node(1,'root','no rank',None,None)
+    for i,pdict in enumerate(resp):
+      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+      seq = sequence
+      child = None
+      # climb the lineage from the most specific rank upwards, chaining each
+      # node as the child of the next less specific one
+      for col in taxa_cols:
+        col_id = col+'_id'
+        if col_id in pdict and pdict.get(col_id):
+          col_name = col if col in found_keys else col+'_name'
+          child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
+          # only the most specific node records the sequence
+          seq = None
+      if child:
+        # attach the top of the lineage to the root
+        get_node(1,'root','no rank',child,None)
+    return root
+
+  def get_ec_json(resp):
+    """
+    Build an EC number tree (nested dicts) from the mapped responses.
+
+    resp: list of response dicts; those with an 'ec' list contribute nodes
+    Returns the root node (id 0, name '-.-.-.-'); every node is a dict with
+    'id', 'name', 'children', 'kids' (child ids) and 'data' (counts and the
+    sequences assigned to the node).
+    """
+    # NOTE(review): ecMap is filled here but never read afterwards in this function
+    ecMap = dict()
+    for pdict in resp:
+      if 'ec' in pdict:
+        for ec in pdict['ec']:
+          ec_number = ec['ec_number']
+          if ec_number not in ecMap:
+            ecMap[ec_number] = []
+          ecMap[ec_number].append(pdict)
+    def get_ids(ec):
+      # the EC number followed by its successively shorter dot-separated
+      # prefixes, e.g. '1.2.3.4' -> ['1.2.3.4', '1.2.3', '1.2', '1']
+      ids = []
+      i = len(ec)
+      while i >= 0:
+        ids.append(ec[:i])
+        i = ec.rfind('.',0,i - 1)
+      return ids
+    id_to_node = dict()
+    def get_node(id,name,child,seq):
+      # Fetch or create the node for id.  Every call increments 'count';
+      # a call without a child also increments 'self_count'.
+      if id not in id_to_node:
+        data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
+        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+        id_to_node[id] = node
+      else:
+        node = id_to_node[id]
+      node['data']['count'] += 1
+      if seq is not None and seq not in node['data']['sequences']:
+         node['data']['sequences'].append(seq)
+      if child is None:
+        node['data']['self_count'] += 1
+      elif child['id'] not in node['kids']:
+        # link each child only once
+        node['kids'].append(child['id'])
+        node['children'].append(child)
+      return node
+    root = get_node(0,'-.-.-.-',None,None)
+    # pre-create the class nodes '1'..'6', labelled with their ec_name_dict name
+    for i in range(1,7):
+      child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
+      get_node(0,'-.-.-.-',child,None)
+    for i,pdict in enumerate(resp):
+      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+      seq = sequence
+      if 'ec' in pdict:
+        for ec in pdict['ec']:
+          child = None
+          # NOTE(review): protein_count is read but not used below
+          protein_count = ec['protein_count']
+          ec_number = ec['ec_number']
+          # climb from the full EC number up to its class, chaining each node
+          # as the child of its prefix
+          for ec_id in get_ids(ec_number):
+            child = get_node(ec_id,ec_id,child,seq)
+            # seq is not reset per EC entry, so only the most specific node
+            # of the first EC entry of a peptide records the sequence
+            seq = None
+          if child:
+            get_node(0,'-.-.-.-',child,None)
+    return root
+
+  def get_taxon_dict(resp, column_order, extra=False, names=False):
+    """
+    Turn the responses into one row of taxonomy values per peptide.
+
+    resp: iterable of response dicts
+    column_order: candidate output columns, in output order
+    extra: with names, also emit the '<col>_id' column when present
+    names: emit the '<col>_name' value under the plain column name
+    A column is kept only when a matching key occurs somewhere in resp.
+    Without names, '<col>_name' is preferred over '<col>_id'.
+    Returns (taxa, column_names): taxa maps each peptide to its list of
+    string values (first response per peptide wins, falsy values become '').
+    """
+    records = list(resp)
+    seen = set()
+    for record in records:
+      seen.update(record.keys())
+    # (output column name, response key) pairs, in column_order order
+    columns = []
+    for col in column_order:
+      id_key = col+'_id'
+      name_key = col+'_name'
+      if col in seen:
+        columns.append((col,col))
+      elif names:
+        if extra and id_key in seen:
+          columns.append((id_key,id_key))
+        if name_key in seen:
+          columns.append((col,name_key))
+      elif name_key in seen:
+        columns.append((col,name_key))
+      elif id_key in seen:
+        columns.append((col,id_key))
+    column_names = [label for label,key in columns]
+    taxa = dict() ## peptide : [taxonomy]
+    for record in records:
+      peptide = record.get('peptide')
+      if not peptide or peptide in taxa:
+        continue
+      taxa[peptide] = [str(record[key]) if record.get(key) else '' for label,key in columns]
+    return (taxa,column_names)
+
+  def get_ec_dict(resp, extra=False):
+    """
+    Summarise the EC annotations of each response as comma-joined strings.
+
+    resp: iterable of response dicts, each with a 'peptide' key and an
+    optional 'ec' list of {'ec_number', 'protein_count'[, 'name']} dicts
+    extra: also return the EC names (missing names become '')
+    Returns (ec_dict, ec_cols): ec_dict maps peptide to
+    [ec_numbers, protein_counts(, ec_names)]; ec_cols are the column names.
+    """
+    ec_cols = ['ec_numbers', 'ec_protein_counts']
+    if extra:
+      ec_cols.append('ec_names')
+    ec_dict = dict()
+    for pdict in resp:
+      peptide = pdict['peptide']
+      # one (number, count, name) triple per EC entry
+      entries = [(ec['ec_number'], str(ec['protein_count']), ec.get('name','') if extra else '')
+                 for ec in pdict.get('ec', [])]
+      vals = [','.join([entry[0] for entry in entries]),
+              ','.join([entry[1] for entry in entries])]
+      if extra:
+        vals.append(','.join([entry[2] for entry in entries]))
+      ec_dict[peptide] = vals
+    return (ec_dict, ec_cols)
+
+  def get_go_dict(resp, extra=False):
+    """
+    Summarise the GO annotations of each response as comma-joined strings.
+
+    resp: iterable of response dicts, each with a 'peptide' key and an
+    optional 'go' list.  A 'go' entry is either a term dict with 'go_term'
+    and 'protein_count', or a dict keyed by the go_types namespaces whose
+    values are lists of such term dicts.
+    extra: also return the GO names (missing names become '')
+    Returns (go_dict, go_cols): go_dict maps peptide to
+    [go_terms, protein_counts(, go_names)]; go_cols are the column names.
+    """
+    def iter_terms(entries):
+      # flatten plain and namespace-grouped entries into one stream of term dicts
+      for entry in entries:
+        if 'go_term' in entry:
+          yield entry
+          continue
+        for namespace in go_types:
+          for term in entry.get(namespace, []):
+            yield term
+
+    go_cols = ['go_terms', 'go_protein_counts']
+    if extra:
+      go_cols.append('go_names')
+    go_dict = dict()
+    for pdict in resp:
+      peptide = pdict['peptide']
+      terms = []
+      counts = []
+      labels = []
+      for term in iter_terms(pdict.get('go', [])):
+        terms.append(term['go_term'])
+        counts.append(str(term['protein_count']))
+        if extra:
+          labels.append(term.get('name',''))
+      vals = [','.join(terms),','.join(counts)]
+      if extra:
+        vals.append(','.join(labels))
+      go_dict[peptide] = vals
+    return (go_dict, go_cols)
+
+  def write_ec_table(outfile, resp, column_order):
+    """
+    Write one tab-separated line per (peptide, EC entry) pair to outfile.
+
+    outfile: path of the file to (over)write
+    resp: iterable of response dicts; those without an 'ec' key are skipped
+    column_order: [peptide-level keys, ..., per-EC keys]; the first element
+    selects the leading columns, the last one the EC columns
+    Missing or falsy values are written as empty strings.
+    """
+    def cells(record, keys):
+      return [str(record[key]) if record.get(key) else '' for key in keys]
+
+    with open(outfile,'w') as fh:
+      for pdict in resp:
+        if 'ec' not in pdict:
+          continue
+        prefix = cells(pdict, column_order[0])
+        for ec in pdict['ec']:
+          fh.write('\t'.join(prefix + cells(ec, column_order[-1])) + '\n')
+
+  def write_go_table(outfile, resp, column_order):
+    """
+    Write one tab-separated line per (peptide, GO term) pair to outfile.
+
+    outfile: path of the file to (over)write
+    resp: iterable of response dicts; those without a 'go' key are skipped
+    column_order: [peptide-level keys, ..., per-term keys]
+    A 'go' entry is either a term dict (has 'go_term') or a dict keyed by
+    the go_types namespaces; in the latter case the namespace is appended
+    as an additional last column.  Missing or falsy values become ''.
+    """
+    def cells(record, keys):
+      return [str(record[key]) if record.get(key) else '' for key in keys]
+
+    with open(outfile,'w') as fh:
+      for pdict in resp:
+        if 'go' not in pdict:
+          continue
+        prefix = cells(pdict, column_order[0])
+        for go in pdict['go']:
+          if 'go_term' in go:
+            fh.write('%s\n' % '\t'.join(prefix + cells(go, column_order[-1])))
+            continue
+          for namespace in go_types:
+            for term in go.get(namespace, []):
+              fields = cells(term, column_order[-1]) + [namespace]
+              fh.write('%s\n' % '\t'.join(prefix + fields))
+
+  # Parse the command line.  Option names and defaults are unchanged; only
+  # the help texts were corrected (typos, and -j now says it is the EC json).
+  parser = optparse.OptionParser()
+  # unipept API choice
+  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
+      help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
+  # input files
+  parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
+  parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
+  parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
+  parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences' )
+  parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
+  # Unipept Flags
+  parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
+  parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
+  parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+  parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component' )
+  parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
+
+  # output fields
+  parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank,taxon_id,taxon_name in csv and tsv outputs' )
+  # Warn vs Error Flag
+  parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Exit with an error on an invalid peptide (default: warn and continue)' )
+  # output files
+  parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
+  parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted EC results')
+  parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
+  parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
+  parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
+  parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
+  # debug
+  parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
+  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
+  parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
+  (options, args) = parser.parse_args()
+  if options.version:
+    sys.stdout.write("%s\n" % version)
+    sys.exit(0)
+  # exit status used for invalid peptides: 2 with --strict, otherwise None (warn only)
+  invalid_ec = 2 if options.strict else None
+  peptides = []
+  ## Get peptide sequences
+  # The XML readers return element text / attribute values unchecked; drop
+  # None and empty entries, which would otherwise make the tryptic split fail.
+  if options.mzid:
+    peptides += [p for p in read_mzid(options.mzid) if p]
+  if options.pepxml:
+    peptides += [p for p in read_pepxml(options.pepxml) if p]
+  if options.tabular:
+    peptides += read_tabular(options.tabular,options.column)
+  if options.fasta:
+    peptides += read_fasta(options.fasta)
+  # positional arguments are taken as peptides too
+  for i,peptide in enumerate(args or []):
+    if not re.match(pep_pat,peptide):
+      warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
+    peptides.append(peptide)
+  if len(peptides) < 1:
+    warn_err("No peptides input!\n",exit_code=1)
+  # choose the columns of the tsv/csv output for the selected API call
+  if options.unipept == 'pept2prot':
+    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
+  elif options.extra or options.names:
+    column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
+  else:
+    column_order = pept2lca_column_order
+  ## map to tryptic peptides
+  # cleave after R or K unless the next residue is P
+  cleavage_site = re.compile(r'(?<=[RK])(?=[^P])')
+  pepToParts = {}
+  for p in peptides:
+    pepToParts[p] = cleavage_site.sub('\n', p).split('\n')
+  # reverse index: tryptic fragment -> input peptides that contain it
+  partToPeps = {}
+  for peptide in pepToParts:
+    parts = pepToParts[peptide]
+    if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
+    for part in parts:
+      part_len = len(part)
+      if part_len > 50:
+        # too long to be queried: warn only, never exit
+        warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,part_len,part),exit_code=None)
+      elif part_len >= 5:
+        partToPeps.setdefault(part,[]).append(peptide)
+  trypticPeptides = partToPeps.keys()
+  ## unipept
+  # Query the API in batches of at most options.max_request tryptic peptides
+  # and collect the decoded JSON lists in unipept_resp.  A failed batch is
+  # reported as a warning and skipped.
+  unipept_resp = []
+  # request flags are identical for every batch
+  request_flags = []
+  if options.equate_il:
+    request_flags.append(("equate_il","true"))
+  if options.names or options.json:
+    # the json tree is built from the lineage ids and names
+    request_flags.append(("extra","true"))
+    request_flags.append(("names","true"))
+  elif options.extra:
+    request_flags.append(("extra","true"))
+  if options.domains:
+    request_flags.append(("domains","true"))
+  headers = {'Accept': 'application/json'}
+  base_url = '%s/%s.json' % (options.url.rstrip('/'),options.unipept)
+  for start in range(0,len(trypticPeptides),options.max_request):
+    batch = trypticPeptides[start:start+options.max_request]
+    post_data = request_flags + [('input[]', x) for x in batch]
+    if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
+    # the same url-encoded form is used as the GET query string and as the POST body
+    encoded = urllib.urlencode(post_data)
+    if options.get:
+      url = '%s?%s' % (base_url,encoded)
+      req = urllib2.Request( url )
+    else:
+      url = base_url
+      req = urllib2.Request( url, headers = headers, data = encoded )
+    if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
+    try:
+      resp = urllib2.urlopen( req )
+      try:
+        if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
+        if resp.getcode() == 200:
+          # read the response that was already opened; opening the request
+          # again would send every batch to the server twice
+          unipept_resp += json.loads( resp.read() )
+      finally:
+        resp.close()
+    except Exception as e:
+      warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
+  # Map the per-tryptic-peptide responses back to the input peptides.
+  # peptideMatches receives dicts whose 'peptide' is the input peptide and
+  # whose 'tryptic_peptide' is the fragment the response belonged to.
+  unmatched_peptides = []
+  peptideMatches = []
+  if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
+  if options.unipept in ['pept2prot', 'pept2taxa']:
+    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
+    ## multiple entries per trypticPeptide for pep2prot or pep2taxa
+    mapping = {}
+    for match in unipept_resp:
+      mapping.setdefault(match['peptide'],[]).append(match)
+    for peptide in peptides:
+      # Get the intersection of matches to the tryptic parts
+      keyToMatch = None
+      for part in pepToParts[peptide]:
+        if part not in mapping:
+          continue
+        temp = {match[dupkey] : match  for match in mapping[part]}
+        if keyToMatch is None:
+          keyToMatch = temp
+        else:
+          # 'is None' above keeps an intersection that became empty from
+          # being restarted by the next part
+          for k in set(keyToMatch) - set(temp):
+            del keyToMatch[k]
+      if not keyToMatch:
+        unmatched_peptides.append(peptide)
+      else:
+        for hit in keyToMatch.values():
+          # copy: the response dict is shared by every input peptide that
+          # contains this tryptic part (and by repeated input peptides)
+          entry = dict(hit)
+          entry['tryptic_peptide'] = entry['peptide']
+          entry['peptide'] = peptide
+          peptideMatches.append(entry)
+  else:
+    ## one response per trypticPeptide; the functional calls additionally
+    ## require a 'total_protein_count' for a response to count as a match
+    needs_counts = options.unipept != 'pept2lca'
+    respMap = {v['peptide']:v for v in unipept_resp}
+    ## map resp back to peptides
+    for peptide in peptides:
+      matches = []
+      for part in pepToParts[peptide]:
+        if part not in respMap:
+          continue
+        if needs_counts and 'total_protein_count' not in respMap[part]:
+          continue
+        matches.append(respMap[part])
+      match = best_match(peptide,matches)
+      if not match:
+        # unmatched peptides still get a row, labelled with their longest
+        # tryptic part (the last one when several share the maximum length)
+        unmatched_peptides.append(peptide)
+        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+        match = {'peptide' : longest_tryptic_peptide}
+      match['tryptic_peptide'] = match['peptide']
+      match['peptide'] = peptide
+      peptideMatches.append(match)
+  resp = peptideMatches
+  if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
+  ## output results
+  # with no main output file requested, dump the mapped response to stdout
+  if not (options.unmatched or options.json or options.tsv or options.csv):
+    sys.stdout.write("%s\n" % str(resp))
+  if options.unmatched:
+    # set lookup; the peptides are still written in input order
+    unmatched_set = set(unmatched_peptides)
+    with open(options.unmatched,'w') as outputFile:
+      for peptide in peptides:
+        if peptide in unmatched_set:
+          outputFile.write("%s\n" % peptide)
+  if options.json:
+    if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
+      # taxonomic calls are written as a taxonomy tree
+      json_doc = get_taxon_json(resp)
+    else:
+      # pept2prot, pept2ec, pept2go, pept2funct: the mapped response itself
+      json_doc = resp
+    with open(options.json,'w') as outputFile:
+      # json.dumps, not str(): str() writes a Python repr that is not valid JSON
+      outputFile.write(json.dumps(json_doc))
+  if options.ec_json:
+    if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+      root = get_ec_json(resp)
+      with open(options.ec_json,'w') as outputFile:
+        outputFile.write(json.dumps(root))
+  # One-line-per-peptide outputs: build the rows once, then write them as
+  # tsv (header prefixed with '#') and/or csv (every value double-quoted).
+  if options.tsv or options.csv:
+    rows = []
+    column_names = None
+    if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
+      taxa = None
+      ec_dict = None
+      go_dict = None
+      if options.unipept in ['peptinfo']:
+        (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+      if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+        (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
+      if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
+        (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
+      # the header is the same for every row, so build it once
+      column_names = ['peptide', 'total_protein_count']
+      if ec_dict:
+        column_names += ec_cols
+      if go_dict:
+        column_names += go_cols
+      if taxa:
+        # taxon_cols[0] is 'peptide', already the first column
+        column_names += taxon_cols[1:]
+      for pdict in resp:
+        peptide = pdict['peptide']
+        total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
+        vals = [peptide,total_protein_count]
+        if ec_dict:
+          vals += ec_dict[peptide]
+        if go_dict:
+          vals += go_dict[peptide]
+        if taxa:
+          vals += taxa[peptide][1:]
+        rows.append(vals)
+    elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
+      (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+      column_names = taxon_cols
+      # one row per peptide; appending the values a second time would
+      # write every row twice
+      rows = list(taxa.values())
+    if options.tsv:
+      with open(options.tsv,'w') as outputFile:
+        if column_names:
+          outputFile.write("#%s\n"% '\t'.join(column_names))
+        for vals in rows:
+          outputFile.write("%s\n"% '\t'.join(vals))
+    if options.csv:
+      with open(options.csv,'w') as outputFile:
+        if column_names:
+          outputFile.write("%s\n"% ','.join(column_names))
+        for vals in rows:
+          # double embedded quotes so a value containing '"' stays one field
+          outputFile.write("%s\n"% ','.join(['"%s"' % (v.replace('"','""') if v else '') for v in vals]))
+  # Per-annotation tables: one line per (peptide, EC number) or (peptide, GO
+  # term), written only for the API calls listed in each condition.
+  wants_ec_table = options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']
+  if wants_ec_table:
+    if options.extra:
+      column_order = pept2ec_extra_column_order
+    else:
+      column_order = pept2ec_column_order
+    write_ec_table(options.ec_tsv, resp, column_order)
+  wants_go_table = options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']
+  if wants_go_table:
+    if options.extra:
+      column_order = pept2go_extra_column_order
+    else:
+      column_order = pept2go_column_order
+    write_go_table(options.go_tsv, resp, column_order)
+
+# run only when executed as a script, not when imported
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/unipept.xml	Wed Oct 24 14:45:18 2018 -0400
@@ -0,0 +1,518 @@
+<tool id="unipept" name="Unipept" version="4.0.0">
+    <description>retrieve taxonomy for peptides</description>
+    <macros>
+        <xml name="equate_il">
+            <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine">
+                <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>
+            </param>
+        </xml>
+        <xml name="extra">
+            <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="false" label="retrieve extra information">
+                <yield/>
+            </param>
+        </xml>
+        <xml name="extra_true">
+            <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="true" label="retrieve extra information">
+                <yield/>
+            </param>
+        </xml>
+        <xml name="names">
+            <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >
+                <help>return the names in complete taxonomic lineage</help>
+            </param>
+            <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" >
+                <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>
+            </param>
+        </xml>
+        <xml name="domains">
+            <param name="domains" type="boolean" truevalue="-D" falsevalue="" checked="false" label="group responses by GO namespace (biological process, molecular function, cellular component)">
+                <yield/>
+            </param>
+        </xml>
+        <xml name="selected_outputs">
+            <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
+                <option value="tsv" selected="true">Tabular with one line per peptide</option>
+                <option value="csv">Comma Separated Values (.csv) with one line per peptide</option>
+                <option value="json">JSON Taxonomy Tree (for pept2lca, pept2taxa, and peptinfo)</option>
+                <yield/>
+                <option value="unmatched">Unmatched peptides</option>
+            </param>
+        </xml>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+      python '$__tool_directory__/unipept.py' 
+      ## --url 'http://morty.ugent.be/api/v1' -g -M 1  
+      --api=$unipept.api
+      $unipept.equate_il $unipept.extra 
+      #if $unipept.api in ['pept2lca', 'pept2taxa', 'peptinfo']:
+        $unipept.names $unipept.allfields
+      #end if
+      #if $unipept.api in ['pept2go', 'pept2funct', 'peptinfo']:
+        $unipept.domains
+      #end if
+      $strict
+      #if str($peptide_src.fmt) == 'proteomic':
+        #if $peptide_src.input.datatype.file_ext == 'fasta':
+          --fasta="$peptide_src.input"
+        #elif $peptide_src.input.datatype.file_ext == 'mzid':
+          --mzid="$peptide_src.input"
+        #elif $peptide_src.input.datatype.file_ext == 'pepxml':
+          --pepxml="$peptide_src.input"
+        #end if
+      #elif str($peptide_src.fmt) == 'tabular':
+        --tabular="$peptide_src.input_tsv"
+        #set $col = int(str($peptide_src.column)) - 1
+        --column=$col
+      #elif str($peptide_src.fmt) == 'fasta':
+        --fasta="$peptide_src.input_fasta"
+      #elif str($peptide_src.fmt) == 'mzid':
+        --mzid="$peptide_src.input_mzid"
+      #elif str($peptide_src.fmt) == 'pepxml':
+        --pepxml="$peptide_src.input_pepxml"
+      #end if
+      #if 'json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2lca', 'pept2taxa', 'peptinfo']:
+        --json $output_json
+      #end if
+      #if 'ec_json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']:
+        --ec_json $output_ec_json
+      #end if
+      #if 'tsv' in str($selected_outputs).split(','):
+        --tsv $output_tsv
+      #end if
+      #if 'csv' in str($selected_outputs).split(','):
+        --csv $output_csv
+      #end if
+      #if 'ec_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']:
+        --ec_tsv $output_ec_tsv
+      #end if
+      #if 'go_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2go', 'pept2funct', 'peptinfo']:
+        --go_tsv $output_go_tsv
+      #end if
+      #if 'unmatched' in str($selected_outputs).split(','):
+        --unmatched $output_unmatched
+      #end if
+    ]]></command>
+    <inputs>
+      <conditional name="unipept">
+          <param name="api" type="select" label="Unipept application" >
+              <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option>
+              <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>
+              <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>
+              <option value="pept2ec">pept2ec: Tryptic peptides and associated EC terms</option>
+              <option value="pept2go">pept2go: Tryptic peptides and associated GO terms</option>
+              <option value="pept2funct">pept2funct: Tryptic peptides and associated EC and GO terms</option>
+              <option value="peptinfo">peptinfo: Tryptic peptides and associated EC and GO terms and lowest common ancestor taxonomy</option>
+          </param>
+          <when value="pept2lca">
+              <expand macro="equate_il" />
+              <expand macro="extra">
+                  <help>Return the complete lineage of the taxonomic lowest common ancestor, and include ID fields.</help>
+              </expand>
+              <expand macro="names" />
+          </when>
+          <when value="pept2taxa">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the complete lineage of each organism, and include ID fields.</help>
+              </expand>
+              <expand macro="names" />
+          </when>
+          <when value="pept2prot">
+              <expand macro="equate_il" />
+              <expand macro="extra">
+                  <help>Return additional information fields: taxon_name, ec_references, go_references, refseq_ids, refseq_protein_ids, insdc_ids, insdc_protein_ids
+                        WARNING: Huge performance penalty!  Only use for a small number of peptides when the extra information is required.
+                  </help>
+              </expand>
+          </when>
+          <when value="pept2ec">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the EC-number.
+                  </help>
+              </expand>
+          </when>
+          <when value="pept2go">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+          </when>
+          <when value="pept2funct">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the  EC-number and GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+          </when>
+          <when value="peptinfo">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name of the  EC-number and GO-term.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+              <expand macro="names" />
+          </when>
+      </conditional>
+      <conditional name="peptide_src">
+        <param name="fmt" type="select" label="Peptides input format" >
+          <option value="proteomic">proteomics formats:  mzid, pepxml, fasta</option>
+          <option value="tabular">tabular</option>
+          <option value="fasta">fasta</option>
+          <option value="mzid">mzid</option>
+          <option value="pepxml">pepxml</option>
+        </param>
+        <when value="proteomic">
+          <param name="input" type="data" format="mzid,pepxml,fasta" label="Peptide Input" />
+        </when>
+        <when value="tabular">
+          <param name="input_tsv" type="data" format="tabular" label="Tabular Input Containing Peptide column" />
+          <param name="column" label="Select column with peptides" type="data_column" numerical="false" data_ref="input_tsv" />
+        </when>
+        <when value="fasta">
+          <param name="input_fasta" type="data" format="fasta" label="Peptide Fasta Input" />
+        </when>
+        <when value="mzid">
+          <param name="input_mzid" type="data" format="mzid" label="mzIdentML Input" />
+        </when>
+        <when value="pepxml">
+          <param name="input_pepxml" type="data" format="pepxml" label="pepXML Input" />
+        </when>
+      </conditional>
+      <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
+       <option value="tsv" selected="true">Tabular with one line per peptide</option>
+       <option value="csv">Comma Separated Values (.csv) with one line per peptide</option>
+        <option value="json">JSON Taxonomy Tree (for pept2lca, pept2taxa, and peptinfo)</option>
+        <option value="go_tsv">Peptide GO terms in normalized tabular (for pept2go, pept2funct, and peptinfo)</option>
+        <option value="ec_tsv">Peptide EC terms in normalized tabular (for pept2ec, pept2funct, and peptinfo)</option>
+        <option value="ec_json">JSON EC Coverage Tree (for pept2ec, pept2funct, and peptinfo)</option>
+        <option value="unmatched">Unmatched peptides</option>
+      </param>
+      <param name="strict" type="boolean" truevalue="--strict" falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/>
+    </inputs>
+    <outputs>
+      <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} Taxonomy json"> 
+        <filter>'json' in selected_outputs and unipept['api'] in ('pept2lca', 'pept2taxa', 'peptinfo')</filter>
+        <change_format>
+            <when input="api" value="pept2prot" format="json" />
+        </change_format>
+      </data> 
+      <data name="output_ec_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} EC json">
+        <filter>'ec_json' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter>
+      </data>
+      <data name="output_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} tsv"> 
+        <filter>'tsv' in selected_outputs</filter>
+        <actions>
+            <action name="comment_lines" type="metadata" default="1" />
+            <!--
+            <conditional name="unipept.api">
+                <when value="pept2funct">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names,go_terms,go_protein_counts,go_names" />
+                </when>
+                <when value="pept2go">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,go_terms,go_protein_counts,go_names" />
+                </when>
+                <when value="pept2ec">
+                    <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names" />
+                </when>
+            </conditional>
+            -->
+        </actions>
+        <!-- pept2funct tsv columns: peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names,go_terms,go_protein_counts,go_names -->
+      </data> 
+      <data name="output_csv" format="csv" label="${tool.name} ${unipept.api} on ${on_string} csv"> 
+        <filter>'csv' in selected_outputs</filter>
+      </data> 
+      <data name="output_ec_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} EC tsv"> 
+        <filter>'ec_tsv' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Peptide,Total Protein Count,EC Number,Protein Count,EC Name" />
+        </actions>
+      </data> 
+      <data name="output_go_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} GO tsv"> 
+        <filter>'go_tsv' in selected_outputs and unipept['api'] in ('pept2go', 'pept2funct', 'peptinfo')</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Peptide,Total Protein Count,GO Term,Protein Count,GO Name" />
+        </actions>
+      </data> 
+      <data name="output_unmatched" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} unmatched"> 
+        <filter>'unmatched' in selected_outputs</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Unmatched Peptide" />
+        </actions>
+      </data> 
+    </outputs>
+    <tests>
+      <test>
+        <param name="api" value="pept2lca"/>
+        <param name="fmt" value="tabular"/>
+        <param name="input_tsv" value="tryptic.tsv"/>
+        <param name="column" value="2"/>
+        <param name="extra" value="True"/>
+        <param name="names" value="True"/>
+        <param name="selected_outputs" value="tsv,unmatched"/>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="Homininae" />
+            </assert_contents>
+        </output>
+        <output name="output_unmatched">
+            <assert_contents>
+              <has_text text="QTAMAV" />
+            </assert_contents>
+        </output>
+      </test>
+      <test>
+        <param name="api" value="pept2lca"/>
+        <param name="fmt" value="fasta"/>
+        <param name="input_fasta" value="peptide.fa"/>
+        <param name="equate_il" value="True"/>
+        <param name="extra" value="True"/>
+        <param name="names" value="True"/>
+        <param name="selected_outputs" value="json,tsv"/>
+        <output name="output_json">
+            <assert_contents>
+              <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />
+            </assert_contents>
+        </output>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="9606" />
+              <has_text text="9598" />
+            </assert_contents>
+        </output>
+      </test>
+      <test>
+        <param name="api" value="pept2taxa"/>
+        <param name="fmt" value="fasta"/>
+        <param name="input_fasta" value="peptide.fa"/>
+        <param name="equate_il" value="True"/>
+        <param name="extra" value="False"/>
+        <param name="names" value="False"/>
+        <param name="selected_outputs" value="tsv"/>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="sapiens" />
+              <has_text text="troglodytes" />
+              <has_text text="Gorilla" />
+              <has_text text="Macaca" />
+            </assert_contents>
+        </output>
+      </test>
+      <test> <!-- uses peptinfo: the ec_tsv/go_tsv outputs are filtered out for pept2lca -->
+        <param name="api" value="peptinfo"/>
+        <param name="fmt" value="tabular"/>
+        <param name="input_tsv" value="tryptic.tsv"/>
+        <param name="column" value="2"/>
+        <param name="extra" value="True"/>
+        <param name="names" value="True"/>
+        <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,unmatched"/>
+        <output name="output_tsv">
+            <assert_contents>
+              <has_text text="3.2.1.17" />
+            </assert_contents>
+        </output>
+        <output name="output_ec_tsv">
+            <assert_contents>
+              <has_text text="3.2.1.17" />
+            </assert_contents>
+        </output>
+        <output name="output_go_tsv">
+            <assert_contents>
+              <has_text text="GO:0006412" />
+            </assert_contents>
+        </output>
+      </test>
+    </tests>
+    <help><![CDATA[
+    **Unipept** 
+
+    Retrieve UniProt and taxonomic information for tryptic peptides.
+    
+    Unipept API documentation - http://unipept.ugent.be/apidocs 
+
+    **Input**
+
+    Input peptides can be retrieved from tabular, fasta, mzid, or pepxml datasets.  
+ 
+    Processing details::
+
+        The input peptides are split into tryptic peptide fragments in order to match the Unipept records.
+        Only fragments that are complete tryptic peptides between 5 and 50 amino acids in length will be matched by Unipept.
+        The match to the most specific tryptic fragment is reported.
+
+
+    **Unipept APIs**
+
+    **pept2prot**  - http://unipept.ugent.be/apidocs/pept2prot
+
+    Returns the list of UniProt entries containing a given tryptic peptide. This is the same information as provided on the Protein matches tab when performing a search with the Tryptic Peptide Analysis in the web interface. 
+
+    By default, each object contains the following information fields extracted from the UniProt record::
+
+        peptide: the peptide that matched this record
+        uniprot_id: the UniProt accession number of the matching record
+        taxon_id: the NCBI taxon id of the organism associated with the matching record
+
+    When the extra parameter is set to true, objects contain the following additional fields extracted from the UniProt record::
+
+        taxon_name: the name of the organism associated with the matching UniProt record
+        ec_references: a space separated list of associated EC numbers
+        go_references: a space separated list of associated GO terms
+        refseq_ids: a space separated list of associated RefSeq accession numbers
+        refseq_protein_ids: a space separated list of associated RefSeq protein accession numbers
+        insdc_ids: a space separated list of associated insdc accession numbers
+        insdc_protein_ids: a space separated list of associated insdc protein accession numbers
+
+
+    **pept2taxa**  - http://unipept.ugent.be/apidocs/pept2taxa
+
+    Returns the set of organisms associated with the UniProt entries containing a given tryptic peptide. This is the same information as provided on the Lineage table tab when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        taxon_id: the NCBI taxon id of the organism associated with the matching record
+        taxon_name: the name of the organism associated with the matching record
+        taxon_rank: the taxonomic rank of the organism associated with the matching record
+
+    When the extra parameter is set to true, objects contain additional information about the lineages of the organism extracted from the NCBI taxonomy. The taxon id of each rank in the lineage is specified using the following information fields::
+
+        superkingdom_id
+        kingdom_id
+        subkingdom_id
+        superphylum_id
+        phylum_id
+        subphylum_id
+        superclass_id
+        class_id
+        subclass_id
+        infraclass_id
+        superorder_id
+        order_id
+        suborder_id
+        infraorder_id
+        parvorder_id
+        superfamily_id
+        family_id
+        subfamily_id
+        tribe_id
+        subtribe_id
+        genus_id
+        subgenus_id
+        species_group_id
+        species_subgroup_id
+        species_id
+        subspecies_id
+        varietas_id
+        forma_id
+
+
+    **pept2lca**  - http://unipept.ugent.be/apidocs/pept2lca
+
+    Returns the taxonomic lowest common ancestor for a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        taxon_id: the NCBI taxon id of the organism associated with the matching record
+        taxon_name: the name of the organism associated with the matching record
+        taxon_rank: the taxonomic rank of the organism associated with the matching record
+
+    When the extra parameter is set to true, objects contain additional information about the lineage of the taxonomic lowest common ancestor extracted from the NCBI taxonomy. The taxon id of each rank in the lineage is specified using the following information fields::
+
+        superkingdom_id
+        kingdom_id
+        subkingdom_id
+        superphylum_id
+        phylum_id
+        subphylum_id
+        superclass_id
+        class_id
+        subclass_id
+        infraclass_id
+        superorder_id
+        order_id
+        suborder_id
+        infraorder_id
+        parvorder_id
+        superfamily_id
+        family_id
+        subfamily_id
+        tribe_id
+        subtribe_id
+        genus_id
+        subgenus_id
+        species_group_id
+        species_subgroup_id
+        species_id
+        subspecies_id
+        varietas_id
+        forma_id
+
+    **pept2ec**  - http://unipept.ugent.be/apidocs/pept2ec
+
+    Returns the functional EC-numbers associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total amount of proteins matched with the given peptide
+        ec_number: EC-number associated with the current tryptic peptide.
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current EC-number.
+        name: Optional, name of the EC-number. Included when the extra parameter is set to true.
+
+
+    **pept2go**  - http://unipept.ugent.be/apidocs/pept2go
+
+    Returns the functional GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total amount of proteins matched with the given peptide
+        go_term: The GO-term associated with the current tryptic peptide.
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current GO-term.
+        name: Optional, name of the GO-term. Included when the extra parameter is set to true.
+
+
+    **pept2funct**  - http://unipept.ugent.be/apidocs/pept2funct
+
+    Returns the functional EC-numbers and GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total amount of proteins matched with the given peptide
+        ec_number: EC-number associated with the current tryptic peptide.
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current EC-number.
+        name: Optional, name of the EC-number. Included when the extra parameter is set to true.
+        go_term: The GO-term associated with the current tryptic peptide.
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current GO-term.
+        name: Optional, name of the GO-term. Included when the extra parameter is set to true.
+
+
+    **Attributions**
+
+    The Unipept metaproteomics analysis pipeline
+    Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 and Peter Dawyndt1
+    Article first published online: 11 FEB 2015
+    DOI: 10.1002/pmic.201400361
+    http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04
+
+    ]]></help>
+  <citations>
+    <citation type="doi">doi:10.1002/pmic.201400361</citation>
+  </citations>
+
+</tool>