changeset 2:dca8a1fe0bf3 draft

"planemo upload for repository http://unipept.ugent.be/apidocs commit dd464f03c32f657fc555081117da18ba4c091af6"
author galaxyp
date Wed, 29 Apr 2020 23:52:22 +0000
parents b65ee881ca64
children bbfabdc0701c
files unipept.py unipept.xml
diffstat 2 files changed, 729 insertions(+), 616 deletions(-)
--- a/unipept.py	Tue Jan 22 20:58:28 2019 -0500
+++ b/unipept.py	Wed Apr 29 23:52:22 2020 +0000
@@ -1,663 +1,732 @@
 #!/usr/bin/env python
 """
 #
-#------------------------------------------------------------------------------
-#                         University of Minnesota
-#         Copyright 2015, Regents of the University of Minnesota
-#------------------------------------------------------------------------------
 # Author:
 #
 #  James E Johnson
 #
 #------------------------------------------------------------------------------
 """
-
 import json
-import logging
 import optparse
-from optparse import OptionParser
-import os
 import sys
 import re
-import urllib
-import urllib2
+import urllib.error
+import urllib.parse
+import urllib.request
 
-"""
-pept2taxa	json
-pept2lca	json
-pept2prot	
-pept2ec		ecjson	ec
-pept2go			go
-pept2funct	go	ec
-peptinfo	json ecjson ec go
-
-"""
 
 try:
     import xml.etree.cElementTree as ET
 except ImportError:
     import xml.etree.ElementTree as ET
 
-def warn_err(msg,exit_code=1):
+
+def warn_err(msg, exit_code=1):
     sys.stderr.write(msg)
     if exit_code:
-      sys.exit(exit_code)
+        sys.exit(exit_code)
+
 
 go_types = ['biological process', 'molecular function', 'cellular component']
+ipr_types = ['Domain', 'Family', 'Homologous_superfamily', 'Repeat', 'Conserved_site', 'Active_site', 'Binding_site', 'PTM']
 ec_name_dict = {
-'1' : 'Oxidoreductase',
-'1.1' : 'act on the CH-OH group of donors',
-'1.2' : 'act on the aldehyde or oxo group of donors',
-'1.3' : 'act on the CH-CH group of donors',
-'1.4' : 'act on the CH-NH2 group of donors',
-'1.5' : 'act on CH-NH group of donors',
-'1.6' : 'act on NADH or NADPH',
-'1.7' : 'act on other nitrogenous compounds as donors',
-'1.8' : 'act on a sulfur group of donors',
-'1.9' : 'act on a heme group of donors',
-'1.10' : 'act on diphenols and related substances as donors',
-'1.11' : 'act on peroxide as an acceptor -- peroxidases',
-'1.12' : 'act on hydrogen as a donor',
-'1.13' : 'act on single donors with incorporation of molecular oxygen',
-'1.14' : 'act on paired donors with incorporation of molecular oxygen',
-'1.15' : 'act on superoxide radicals as acceptors',
-'1.16' : 'oxidize metal ions',
-'1.17' : 'act on CH or CH2 groups',
-'1.18' : 'act on iron-sulfur proteins as donors',
-'1.19' : 'act on reduced flavodoxin as donor',
-'1.20' : 'act on phosphorus or arsenic as donors',
-'1.21' : 'act on X-H and Y-H to form an X-Y bond',
-'1.97' : 'other oxidoreductases',
-'2' : 'Transferase',
-'2.1' : 'transfer one-carbon groups, Methylase',
-'2.2' : 'transfer aldehyde or ketone groups',
-'2.3' : 'acyltransferases',
-'2.4' : 'glycosyltransferases',
-'2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
-'2.6' : 'transfer nitrogenous groups',
-'2.7' : 'transfer phosphorus-containing groups',
-'2.8' : 'transfer sulfur-containing groups',
-'2.9' : 'transfer selenium-containing groups',
-'3' : 'Hydrolase',
-'3.1' : 'act on ester bonds',
-'3.2' : 'act on sugars - glycosylases',
-'3.3' : 'act on ether bonds',
-'3.4' : 'act on peptide bonds - Peptidase',
-'3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
-'3.6' : 'act on acid anhydrides',
-'3.7' : 'act on carbon-carbon bonds',
-'3.8' : 'act on halide bonds',
-'3.9' : 'act on phosphorus-nitrogen bonds',
-'3.10' : 'act on sulfur-nitrogen bonds',
-'3.11' : 'act on carbon-phosphorus bonds',
-'3.12' : 'act on sulfur-sulfur bonds',
-'3.13' : 'act on carbon-sulfur bonds',
-'4' : 'Lyase',
-'4.1' : 'carbon-carbon lyases',
-'4.2' : 'carbon-oxygen lyases',
-'4.3' : 'carbon-nitrogen lyases',
-'4.4' : 'carbon-sulfur lyases',
-'4.5' : 'carbon-halide lyases',
-'4.6' : 'phosphorus-oxygen lyases',
-'5' : 'Isomerase',
-'5.1' : 'racemases and epimerases',
-'5.2' : 'cis-trans-isomerases',
-'5.3' : 'intramolecular oxidoreductases',
-'5.4' : 'intramolecular transferases -- mutases',
-'5.5' : 'intramolecular lyases',
-'5.99' : 'other isomerases',
-'6' : 'Ligase',
-'6.1' : 'form carbon-oxygen bonds',
-'6.2' : 'form carbon-sulfur bonds',
-'6.3' : 'form carbon-nitrogen bonds',
-'6.4' : 'form carbon-carbon bonds',
-'6.5' : 'form phosphoric ester bonds',
-'6.6' : 'form nitrogen-metal bonds',
+    '1': 'Oxidoreductase',
+    '1.1': 'act on the CH-OH group of donors',
+    '1.2': 'act on the aldehyde or oxo group of donors',
+    '1.3': 'act on the CH-CH group of donors',
+    '1.4': 'act on the CH-NH2 group of donors',
+    '1.5': 'act on CH-NH group of donors',
+    '1.6': 'act on NADH or NADPH',
+    '1.7': 'act on other nitrogenous compounds as donors',
+    '1.8': 'act on a sulfur group of donors',
+    '1.9': 'act on a heme group of donors',
+    '1.10': 'act on diphenols and related substances as donors',
+    '1.11': 'act on peroxide as an acceptor -- peroxidases',
+    '1.12': 'act on hydrogen as a donor',
+    '1.13': 'act on single donors with incorporation of molecular oxygen',
+    '1.14': 'act on paired donors with incorporation of molecular oxygen',
+    '1.15': 'act on superoxide radicals as acceptors',
+    '1.16': 'oxidize metal ions',
+    '1.17': 'act on CH or CH2 groups',
+    '1.18': 'act on iron-sulfur proteins as donors',
+    '1.19': 'act on reduced flavodoxin as donor',
+    '1.20': 'act on phosphorus or arsenic as donors',
+    '1.21': 'act on X-H and Y-H to form an X-Y bond',
+    '1.97': 'other oxidoreductases',
+    '2': 'Transferase',
+    '2.1': 'transfer one-carbon groups, Methylase',
+    '2.2': 'transfer aldehyde or ketone groups',
+    '2.3': 'acyltransferases',
+    '2.4': 'glycosyltransferases',
+    '2.5': 'transfer alkyl or aryl groups, other than methyl groups',
+    '2.6': 'transfer nitrogenous groups',
+    '2.7': 'transfer phosphorus-containing groups',
+    '2.8': 'transfer sulfur-containing groups',
+    '2.9': 'transfer selenium-containing groups',
+    '3': 'Hydrolase',
+    '3.1': 'act on ester bonds',
+    '3.2': 'act on sugars - glycosylases',
+    '3.3': 'act on ether bonds',
+    '3.4': 'act on peptide bonds - Peptidase',
+    '3.5': 'act on carbon-nitrogen bonds, other than peptide bonds',
+    '3.6': 'act on acid anhydrides',
+    '3.7': 'act on carbon-carbon bonds',
+    '3.8': 'act on halide bonds',
+    '3.9': 'act on phosphorus-nitrogen bonds',
+    '3.10': 'act on sulfur-nitrogen bonds',
+    '3.11': 'act on carbon-phosphorus bonds',
+    '3.12': 'act on sulfur-sulfur bonds',
+    '3.13': 'act on carbon-sulfur bonds',
+    '4': 'Lyase',
+    '4.1': 'carbon-carbon lyases',
+    '4.2': 'carbon-oxygen lyases',
+    '4.3': 'carbon-nitrogen lyases',
+    '4.4': 'carbon-sulfur lyases',
+    '4.5': 'carbon-halide lyases',
+    '4.6': 'phosphorus-oxygen lyases',
+    '5': 'Isomerase',
+    '5.1': 'racemases and epimerases',
+    '5.2': 'cis-trans-isomerases',
+    '5.3': 'intramolecular oxidoreductases',
+    '5.4': 'intramolecular transferases -- mutases',
+    '5.5': 'intramolecular lyases',
+    '5.99': 'other isomerases',
+    '6': 'Ligase',
+    '6.1': 'form carbon-oxygen bonds',
+    '6.2': 'form carbon-sulfur bonds',
+    '6.3': 'form carbon-nitrogen bonds',
+    '6.4': 'form carbon-carbon bonds',
+    '6.5': 'form phosphoric ester bonds',
+    '6.6': 'form nitrogen-metal bonds',
 }
-pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
-pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
+pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']
+pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma']
 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
-pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
-pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
+pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids']
 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
-pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
+pept2interpro_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count']]
+pept2interpro_extra_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count', 'type', 'name']]
+pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go', 'ipr']
+
 
 def __main__():
-  version = '2.0'
-  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
+    version = '4.3'
+    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
 
-  def read_tabular(filepath,col):
-    peptides = []
-    with open(filepath) as fp:
-      for i,line in enumerate(fp):
-        if line.strip() == '' or line.startswith('#'):
-          continue
-        fields = line.rstrip('\n').split('\t')
-        peptide = fields[col]
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
-        peptides.append(peptide)
-    return peptides
+    def read_tabular(filepath, col):
+        peptides = []
+        with open(filepath) as fp:
+            for i, line in enumerate(fp):
+                if line.strip() == '' or line.startswith('#'):
+                    continue
+                fields = line.rstrip('\n').split('\t')
+                peptide = fields[col]
+                if not re.match(pep_pat, peptide):
+                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec)
+                peptides.append(peptide)
+        return peptides
 
-  def get_fasta_entries(fp):
-    name, seq = None, []
-    for line in fp:
-      line = line.rstrip()
-      if line.startswith(">"):
-        if name: yield (name, ''.join(seq))
-        name, seq = line, []
-      else:
-        seq.append(line)
-    if name: yield (name, ''.join(seq))
+    def get_fasta_entries(fp):
+        name, seq = None, []
+        for line in fp:
+            line = line.rstrip()
+            if line.startswith(">"):
+                if name:
+                    yield (name, ''.join(seq))
+                name, seq = line, []
+            else:
+                seq.append(line)
+        if name:
+            yield (name, ''.join(seq))
 
-  def read_fasta(filepath):
-    peptides = []
-    with open(filepath) as fp:
-      for id, peptide in get_fasta_entries(fp):
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
-        peptides.append(peptide)
-    return peptides
+    def read_fasta(filepath):
+        peptides = []
+        with open(filepath) as fp:
+            for id, peptide in get_fasta_entries(fp):
+                if not re.match(pep_pat, peptide):
+                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec)
+                peptides.append(peptide)
+        return peptides
 
-  def read_mzid(fp):
-    peptides = []
-    for event, elem in ET.iterparse(fp):
-      if event == 'end':
-        if re.search('PeptideSequence',elem.tag):
-          peptides.append(elem.text)
-    return peptides
+    def read_mzid(fp):
+        peptides = []
+        for event, elem in ET.iterparse(fp):
+            if event == 'end':
+                if re.search('PeptideSequence', elem.tag):
+                    peptides.append(elem.text)
+        return peptides
 
-  def read_pepxml(fp):
-    peptides = []
-    for event, elem in ET.iterparse(fp):
-      if event == 'end':
-        if re.search('search_hit',elem.tag):
-          peptides.append(elem.get('peptide'))
-    return peptides
+    def read_pepxml(fp):
+        peptides = []
+        for event, elem in ET.iterparse(fp):
+            if event == 'end':
+                if re.search('search_hit', elem.tag):
+                    peptides.append(elem.get('peptide'))
+        return peptides
 
-  def best_match(peptide,matches):
-    if not matches:
-      return None
-    elif len(matches) == 1:
-      return matches[0].copy()
-    elif 'taxon_rank' in matches[0]:
-      # find the most specific match (peptide is always the first column order field)
-      for col in reversed(pept2lca_extra_column_order[1:]):
-        col_id = col+"_id" if options.extra else col
-        for match in matches:
-          if 'taxon_rank' in match and match['taxon_rank'] == col:
-            return match.copy()
-          if col_id in match and match[col_id]:
-            return match.copy()
-    else:
-      return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy()
-    return None
+    def best_match(peptide, matches):
+        if not matches:
+            return None
+        elif len(matches) == 1:
+            return matches[0].copy()
+        elif 'taxon_rank' in matches[0]:
+            # find the most specific match (peptide is always the first column order field)
+            for col in reversed(pept2lca_extra_column_order[1:]):
+                col_id = col + "_id" if options.extra else col
+                for match in matches:
+                    if 'taxon_rank' in match and match['taxon_rank'] == col:
+                        return match.copy()
+                    if col_id in match and match[col_id]:
+                        return match.copy()
+        else:
+            return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy()
+        return None
+
+    def get_taxon_json(resp):
+        found_keys = set()
+        for i, pdict in enumerate(resp):
+            found_keys |= set(pdict.keys())
+        taxa_cols = []
+        for col in pept2lca_extra_column_order[-1:0:-1]:
+            if col + '_id' in found_keys:
+                taxa_cols.append(col)
+        id_to_node = dict()
 
-  def get_taxon_json(resp):
-    found_keys = set()
-    for i,pdict in enumerate(resp):
-      found_keys |= set(pdict.keys())
-    taxa_cols = []
-    for col in pept2lca_extra_column_order[-1:0:-1]:
-      if col+'_id' in found_keys:
-        taxa_cols.append(col)
-    id_to_node = dict()
-    def get_node(id,name,rank,child,seq):
-      if id not in id_to_node:
-        data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
-        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
-        id_to_node[id] = node
-      else:
-        node = id_to_node[id]
-      node['data']['count'] += 1
-      if seq is not None and seq not in node['data']['sequences']:
-         node['data']['sequences'].append(seq)
-      if child is None:
-        node['data']['self_count'] += 1
-      elif child['id'] not in node['kids']:
-        node['kids'].append(child['id'])
-        node['children'].append(child)
-      return node
-    root = get_node(1,'root','no rank',None,None)
-    for i,pdict in enumerate(resp):
-      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
-      seq = sequence
-      child = None
-      for col in taxa_cols:
-        col_id = col+'_id'
-        if col_id in pdict and pdict.get(col_id):
-          col_name = col if col in found_keys else col+'_name'
-          child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
-          seq = None
-      if child:
-        get_node(1,'root','no rank',child,None)
-    return root
+        def get_node(id, name, rank, child, seq):
+            if id not in id_to_node:
+                data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []}
+                node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
+                id_to_node[id] = node
+            else:
+                node = id_to_node[id]
+            node['data']['count'] += 1
+            if seq is not None and seq not in node['data']['sequences']:
+                node['data']['sequences'].append(seq)
+            if child is None:
+                node['data']['self_count'] += 1
+            elif child['id'] not in node['kids']:
+                node['kids'].append(child['id'])
+                node['children'].append(child)
+            return node
+        root = get_node(1, 'root', 'no rank', None, None)
+        for i, pdict in enumerate(resp):
+            sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
+            seq = sequence
+            child = None
+            for col in taxa_cols:
+                col_id = col + '_id'
+                if col_id in pdict and pdict.get(col_id):
+                    col_name = col if col in found_keys else col + '_name'
+                    child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq)
+                    seq = None
+            if child is not None:
+                get_node(1, 'root', 'no rank', child, None)
+        return root
+
+    def get_ec_json(resp):
+        ecMap = dict()
+        for pdict in resp:
+            if 'ec' in pdict:
+                for ec in pdict['ec']:
+                    ec_number = ec['ec_number']
+                    if ec_number not in ecMap:
+                        ecMap[ec_number] = []
+                    ecMap[ec_number].append(pdict)
+
+        def get_ids(ec):
+            ids = []
+            i = len(ec)
+            while i >= 0:
+                ids.append(ec[:i])
+                i = ec.rfind('.', 0, i - 1)
+            return ids
+        id_to_node = dict()
 
-  def get_ec_json(resp):
-    ecMap = dict()
-    for pdict in resp:
-      if 'ec' in pdict:
-        for ec in pdict['ec']:
-          ec_number = ec['ec_number']
-          if ec_number not in ecMap:
-            ecMap[ec_number] = []
-          ecMap[ec_number].append(pdict)
-    def get_ids(ec):
-      ids = []
-      i = len(ec)
-      while i >= 0:
-        ids.append(ec[:i])
-        i = ec.rfind('.',0,i - 1)
-      return ids
-    id_to_node = dict()
-    def get_node(id,name,child,seq):
-      if id not in id_to_node:
-        data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
-        node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
-        id_to_node[id] = node
-      else:
-        node = id_to_node[id]
-      node['data']['count'] += 1
-      if seq is not None and seq not in node['data']['sequences']:
-         node['data']['sequences'].append(seq)
-      if child is None:
-        node['data']['self_count'] += 1
-      elif child['id'] not in node['kids']:
-        node['kids'].append(child['id'])
-        node['children'].append(child)
-      return node
-    root = get_node(0,'-.-.-.-',None,None)
-    for i in range(1,7):
-      child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
-      get_node(0,'-.-.-.-',child,None)
-    for i,pdict in enumerate(resp):
-      sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
-      seq = sequence
-      if 'ec' in pdict:
-        for ec in pdict['ec']:
-          child = None
-          protein_count = ec['protein_count']
-          ec_number = ec['ec_number']
-          for ec_id in get_ids(ec_number):
-            ec_name = str(ec_id)
-            ## if len(ec_id) == 3:
-            ##   ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])],  ec_name_dict[str(ec_id)])
-            child = get_node(ec_id,ec_name,child,seq)
-            seq = None
-          if child:
-            get_node(0,'-.-.-.-',child,None)
-    return root
+        def get_node(id, name, child, seq):
+            if id not in id_to_node:
+                data = {'count': 0, 'self_count': 0, 'sequences': []}
+                node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
+                id_to_node[id] = node
+            else:
+                node = id_to_node[id]
+            node['data']['count'] += 1
+            if seq is not None and seq not in node['data']['sequences']:
+                node['data']['sequences'].append(seq)
+            if child is None:
+                node['data']['self_count'] += 1
+            elif child['id'] not in node['kids']:
+                node['kids'].append(child['id'])
+                node['children'].append(child)
+            return node
+        root = get_node(0, '-.-.-.-', None, None)
+        for i in range(1, 7):
+            child = get_node(str(i), '%s\n%s' % (str(i), ec_name_dict[str(i)]), None, None)
+            get_node(0, '-.-.-.-', child, None)
+        for i, pdict in enumerate(resp):
+            sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
+            seq = sequence
+            if 'ec' in pdict:
+                for ec in pdict['ec']:
+                    child = None
+                    ec_number = ec['ec_number']
+                    for ec_id in get_ids(ec_number):
+                        ec_name = str(ec_id)
+                        child = get_node(ec_id, ec_name, child, seq)
+                        seq = None
+                    if child:
+                        get_node(0, '-.-.-.-', child, None)
+        return root
 
-  def get_taxon_dict(resp, column_order, extra=False, names=False):
-    found_keys = set()
-    results = []
-    for i,pdict in enumerate(resp):
-      results.append(pdict)
-      found_keys |= set(pdict.keys())
-      # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
-    column_names = []
-    column_keys = []
-    for col in column_order:
-      if col in found_keys:
-        column_names.append(col)
-        column_keys.append(col)
-      elif names:
-        col_id = col+'_id'
-        col_name = col+'_name'
+    def get_taxon_dict(resp, column_order, extra=False, names=False):
+        found_keys = set()
+        results = []
+        for i, pdict in enumerate(resp):
+            results.append(pdict)
+            found_keys |= set(pdict.keys())
+            # print >> sys.stderr, "%s\n%s" % (pdict.keys(), found_keys)
+        column_names = []
+        column_keys = []
+        for col in column_order:
+            if col in found_keys:
+                column_names.append(col)
+                column_keys.append(col)
+            elif names:
+                col_id = col + '_id'
+                col_name = col + '_name'
+                if extra:
+                    if col_id in found_keys:
+                        column_names.append(col_id)
+                        column_keys.append(col_id)
+                if names:
+                    if col_name in found_keys:
+                        column_names.append(col)
+                        column_keys.append(col_name)
+            else:
+                if col + '_name' in found_keys:
+                    column_names.append(col)
+                    column_keys.append(col + '_name')
+                elif col + '_id' in found_keys:
+                    column_names.append(col)
+                    column_keys.append(col + '_id')
+        # print >> sys.stderr, "%s\n%s" % (column_names, column_keys)
+        taxa = dict()  # peptide: [taxonomy]
+        for i, pdict in enumerate(results):
+            peptide = pdict['peptide'] if 'peptide' in pdict else None
+            if peptide and peptide not in taxa:
+                vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
+                taxa[peptide] = vals
+        return (taxa, column_names)
+
+    def get_ec_dict(resp, extra=False):
+        ec_cols = ['ec_numbers', 'ec_protein_counts']
         if extra:
-          if col_id in found_keys:
-            column_names.append(col_id)
-            column_keys.append(col_id)
-        if names:
-          if col_name in found_keys:
-            column_names.append(col)
-            column_keys.append(col_name)
-      else:
-        if col+'_name' in found_keys:
-          column_names.append(col)
-          column_keys.append(col+'_name')
-        elif col+'_id' in found_keys:
-          column_names.append(col)
-          column_keys.append(col+'_id')
-    # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
-    taxa = dict() ## peptide : [taxonomy]
-    for i,pdict in enumerate(results):
-      peptide = pdict['peptide'] if 'peptide' in pdict else None
-      if peptide and peptide not in taxa:
-          vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
-          taxa[peptide] = vals
-    return (taxa,column_names)
+            ec_cols.append('ec_names')
+        ec_dict = dict()
+        for i, pdict in enumerate(resp):
+            peptide = pdict['peptide']
+            ec_numbers = []
+            protein_counts = []
+            ec_names = []
+            if 'ec' in pdict:
+                for ec in pdict['ec']:
+                    ec_numbers.append(ec['ec_number'])
+                    protein_counts.append(str(ec['protein_count']))
+                    if extra:
+                        ec_names.append(ec['name'] if 'name' in ec else '')
+            vals = [','.join(ec_numbers), ','.join(protein_counts)]
+            if extra:
+                vals.append(','.join(ec_names))
+            ec_dict[peptide] = vals
+        return (ec_dict, ec_cols)
 
-  def get_ec_dict(resp, extra=False):
-    ec_cols = ['ec_numbers', 'ec_protein_counts']
-    if extra:
-      ec_cols.append('ec_names')
-    ec_dict = dict()
-    for i,pdict in enumerate(resp):
-      peptide = pdict['peptide']
-      ec_numbers = []
-      protein_counts = []
-      ec_names = []
-      if 'ec' in pdict:
-        for ec in pdict['ec']:
-          ec_numbers.append(ec['ec_number'])
-          protein_counts.append(str(ec['protein_count']))
-          if extra:
-            ec_names.append(ec['name'] if 'name' in ec else '')
-      vals = [','.join(ec_numbers),','.join(protein_counts)]
-      if extra:
-        vals.append(','.join(ec_names))
-      ec_dict[peptide] = vals
-    return (ec_dict, ec_cols)
+    def get_go_dict(resp, extra=False):
+        go_cols = ['go_terms', 'go_protein_counts']
+        if extra:
+            go_cols.append('go_names')
+        go_dict = dict()
+        for i, pdict in enumerate(resp):
+            peptide = pdict['peptide']
+            go_terms = []
+            protein_counts = []
+            go_names = []
+            if 'go' in pdict:
+                for go in pdict['go']:
+                    if 'go_term' in go:
+                        go_terms.append(go['go_term'])
+                        protein_counts.append(str(go['protein_count']))
+                        if extra:
+                            go_names.append(go['name'] if 'name' in go else '')
+                    else:
+                        for go_type in go_types:
+                            if go_type in go:
+                                for _go in go[go_type]:
+                                    go_terms.append(_go['go_term'])
+                                    protein_counts.append(str(_go['protein_count']))
+                                    if extra:
+                                        go_names.append(_go['name'] if 'name' in _go else '')
+            vals = [','.join(go_terms), ','.join(protein_counts)]
+            if extra:
+                vals.append(','.join(go_names))
+            go_dict[peptide] = vals
+        return (go_dict, go_cols)
 
-  def get_go_dict(resp, extra=False):
-    go_cols = ['go_terms', 'go_protein_counts']
-    if extra:
-      go_cols.append('go_names')
-    go_dict = dict()
-    for i,pdict in enumerate(resp):
-      peptide = pdict['peptide']
-      go_terms = []
-      protein_counts = []
-      go_names = []
-      if 'go' in pdict:
-        for go in pdict['go']:
-          if 'go_term' in go:
-            go_terms.append(go['go_term'])
-            protein_counts.append(str(go['protein_count']))
+    def get_ipr_dict(resp, extra=False):
+        ipr_cols = ['ipr_codes', 'ipr_protein_counts']
+        if extra:
+            ipr_cols.append('ipr_types')
+            ipr_cols.append('ipr_names')
+        ipr_dict = dict()
+        for i, pdict in enumerate(resp):
+            peptide = pdict['peptide']
+            ipr_codes = []
+            protein_counts = []
+            ipr_names = []
+            ipr_types_found = []
+            if 'ipr' in pdict:
+                for ipr in pdict['ipr']:
+                    if 'code' in ipr:
+                        ipr_codes.append(ipr['code'])
+                        protein_counts.append(str(ipr['protein_count']))
+                        if extra:
+                            ipr_types_found.append(ipr['type'] if 'type' in ipr else '')
+                            ipr_names.append(ipr['name'] if 'name' in ipr else '')
+                    else:
+                        for ipr_type in ipr_types:
+                            if ipr_type in ipr:
+                                for _ipr in ipr[ipr_type]:
+                                    ipr_codes.append(_ipr['code'])
+                                    protein_counts.append(str(_ipr['protein_count']))
+                                    if extra:
+                                        ipr_types_found.append(ipr_type)
+                                        ipr_names.append(_ipr['name'] if 'name' in _ipr else '')
+            vals = [','.join(ipr_codes), ','.join(protein_counts)]
             if extra:
-              go_names.append(go['name'] if 'name' in go else '')
-          else:
-            for go_type in go_types:
-              if go_type in go:
-                for _go in go[go_type]:
-                  go_terms.append(_go['go_term'])
-                  protein_counts.append(str(_go['protein_count']))
-                  if extra:
-                    go_names.append(_go['name'] if 'name' in _go else '')
-      vals = [','.join(go_terms),','.join(protein_counts)]
-      if extra:
-        vals.append(','.join(go_names))
-      go_dict[peptide] = vals
-    return (go_dict, go_cols)
+                vals.append(','.join(ipr_types_found))
+                vals.append(','.join(ipr_names))
+            ipr_dict[peptide] = vals
+        return (ipr_dict, ipr_cols)
 
-  def write_ec_table(outfile, resp, column_order):
-    with open(outfile,'w') as fh:
-      for i,pdict in enumerate(resp):
-        if 'ec' in pdict:
-          tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
-          for ec in pdict['ec']:
-            vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
-            fh.write('%s\n' % '\t'.join(tvals + vals)) 
+    def write_ec_table(outfile, resp, column_order):
+        with open(outfile, 'w') as fh:
+            for i, pdict in enumerate(resp):
+                if 'ec' in pdict:
+                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
+                    for ec in pdict['ec']:
+                        vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
+                        fh.write('%s\n' % '\t'.join(tvals + vals))
 
-  def write_go_table(outfile, resp, column_order):
-    with open(outfile,'w') as fh:
-      for i,pdict in enumerate(resp):
-        if 'go' in pdict:
-          tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
-          for go in pdict['go']:
-            if 'go_term' in go:
-              vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
-              fh.write('%s\n' % '\t'.join(tvals + vals)) 
-            else:
-              for go_type in go_types:
-                if go_type in go:
-                  for _go in go[go_type]:
-                    vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
-                    vals.append(go_type)
-                    fh.write('%s\n' % '\t'.join(tvals + vals)) 
+    def write_go_table(outfile, resp, column_order):
+        with open(outfile, 'w') as fh:
+            for i, pdict in enumerate(resp):
+                if 'go' in pdict:
+                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
+                    for go in pdict['go']:
+                        if 'go_term' in go:
+                            vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
+                            fh.write('%s\n' % '\t'.join(tvals + vals))
+                        else:
+                            for go_type in go_types:
+                                if go_type in go:
+                                    for _go in go[go_type]:
+                                        vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
+                                        vals.append(go_type)
+                                        fh.write('%s\n' % '\t'.join(tvals + vals))
+
+    def write_ipr_table(outfile, resp, column_order):
+        with open(outfile, 'w') as fh:
+            for i, pdict in enumerate(resp):
+                if 'ipr' in pdict:
+                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
+                    for ipr in pdict['ipr']:
+                        if 'code' in ipr:
+                            vals = [str(ipr[x]) if x in ipr and ipr[x] else '' for x in column_order[-1]]
+                            fh.write('%s\n' % '\t'.join(tvals + vals))
+                        else:
+                            for ipr_type in ipr_types:
+                                if ipr_type in ipr:
+                                    for _ipr in ipr[ipr_type]:
+                                        vals = [str(_ipr[x]) if x in _ipr and _ipr[x] else '' for x in column_order[-1]]
+                                        vals.append(ipr_type)
+                                        fh.write('%s\n' % '\t'.join(tvals + vals))
 
-  #Parse Command Line
-  parser = optparse.OptionParser()
-  # unipept API choice
-  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], 
-      help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
-  # input files
-  parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
-  parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
-  parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
-  parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
-  parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
-  # Unipept Flags
-  parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
-  parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
-  parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
-  parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' )
-  parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
-  
-  # output fields
-  parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
-  # Warn vs Error Flag
-  parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
-  # output files
-  parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
-  parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
-  parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
-  parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
-  parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
-  parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
-  parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
-  parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
-  parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
-  # debug
-  parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
-  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
-  parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
-  (options, args) = parser.parse_args()
-  if options.version:
-    print >> sys.stdout,"%s" % version
-    sys.exit(0)
-  invalid_ec = 2 if options.strict else None
-  peptides = []
-  ## Get peptide sequences
-  if options.mzid:
-    peptides += read_mzid(options.mzid)
-  if options.pepxml:
-    peptides += read_pepxml(options.pepxml)
-  if options.tabular:
-    peptides += read_tabular(options.tabular,options.column) 
-  if options.fasta:
-    peptides += read_fasta(options.fasta) 
-  if args and len(args) > 0:
-    for i,peptide in enumerate(args):
-      if not re.match(pep_pat,peptide):
-        warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
-      peptides.append(peptide) 
-  if len(peptides) < 1:
-    warn_err("No peptides input!",exit_code=1)
-  column_order = pept2lca_column_order
-  if options.unipept == 'pept2prot':
-    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
-  else:
-    if options.extra or options.names:
-      column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
-    else:
-      column_order = pept2lca_column_order
-  ## map to tryptic peptides
-  pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
-  partToPeps = {}
-  for peptide, parts in pepToParts.iteritems():
-    if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
-    for part in parts:
-      if len(part) > 50:
-        warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
-      if 5 <= len(part) <= 50:
-        partToPeps.setdefault(part,[]).append(peptide)
-  trypticPeptides = partToPeps.keys()
-  ## unipept
-  unipept_resp = []
-  idx = range(0,len(trypticPeptides),options.max_request)
-  idx.append(len(trypticPeptides))
-  for i in range(len(idx)-1):
-    post_data = []
-    if options.equate_il:
-      post_data.append(("equate_il","true"))
-    if options.names or options.json:
-      post_data.append(("extra","true"))
-      post_data.append(("names","true"))
-    elif options.extra or options.json:
-      post_data.append(("extra","true"))
-    if options.domains:
-      post_data.append(("domains","true"))
-    post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
-    if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
-    headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
-    ## headers = {'Accept': 'application/json'}
-    url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
-    if options.get:
-      params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data])
-      url = '%s.json?%s' % (url,params)
-      req = urllib2.Request( url )
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    # unipept API choice
+    parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo'],
+                      help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2interpro, pept2funct, or peptinfo')
+    # input files
+    parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column')
+    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences')
+    parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences')
+    parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences')
+    parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences')
+    # Unipept Flags
+    parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records')
+    parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor')
+    parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor')
+    parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component')
+    parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request')
+
+    # output fields
+    parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank, taxon_id, taxon_name in csv and tsv outputs')
+    # Warn vs Error Flag
+    parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Exit with an error on invalid peptide input (otherwise only warn)')
+    # output files
+    parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
+    parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for EC json formatted results')
+    parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
+    parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
+    parser.add_option('-I', '--ipr_tsv', dest='ipr_tsv', default=None, help='Output file path for InterPro TAB-separated-values (.tsv) formatted results')
+    parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
+    parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
+    parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
+    parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptides with no matches')
+    parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
+    # debug
+    parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
+    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on debugging')
+    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
+    (options, args) = parser.parse_args()
+    if options.version:
+        print('%s' % version)
+        sys.exit(0)
+    invalid_ec = 2 if options.strict else None
+    peptides = []
+    # Get peptide sequences
+    if options.mzid:
+        peptides += read_mzid(options.mzid)
+    if options.pepxml:
+        peptides += read_pepxml(options.pepxml)
+    if options.tabular:
+        peptides += read_tabular(options.tabular, options.column)
+    if options.fasta:
+        peptides += read_fasta(options.fasta)
+    if args and len(args) > 0:
+        for i, peptide in enumerate(args):
+            if not re.match(pep_pat, peptide):
+                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
+            peptides.append(peptide)
+    if len(peptides) < 1:
+        warn_err("No peptides input!", exit_code=1)
+    column_order = pept2lca_column_order
+    if options.unipept == 'pept2prot':
+        column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
     else:
-      url = '%s.json' % (url)
-      req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-    if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
-    try:
-      resp = urllib2.urlopen( req ) 
-      if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
-      if resp.getcode() == 200:
-        unipept_resp += json.loads( urllib2.urlopen( req ).read() )
-    except Exception, e:
-      warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
-  unmatched_peptides = []
-  peptideMatches = []
-  if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
-  if options.unipept in ['pept2prot', 'pept2taxa']:
-    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
-    ## multiple entries per trypticPeptide for pep2prot or pep2taxa
-    mapping = {}
-    for match in unipept_resp:
-      mapping.setdefault(match['peptide'],[]).append(match)
-    for peptide in peptides:
-      # Get the intersection of matches to the tryptic parts
-      keyToMatch = None
-      for part in pepToParts[peptide]:
-        if part in mapping:
-          temp = {match[dupkey] : match  for match in mapping[part]}
-          if keyToMatch:
-            dkeys = set(keyToMatch.keys()) - set(temp.keys())
-            for k in dkeys:
-              del keyToMatch[k]
-          else:
-            keyToMatch = temp
-          ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
-      if not keyToMatch:
-        unmatched_peptides.append(peptide)
-      else:
-        for key,match in keyToMatch.iteritems():
-          match['tryptic_peptide'] = match['peptide']
-          match['peptide'] = peptide
-          peptideMatches.append(match)
-  elif options.unipept in ['pept2lca', 'peptinfo']:
-    ## should be one response per trypticPeptide for pep2lca
-    respMap = {v['peptide']:v for v in unipept_resp}
-    ## map resp back to peptides
-    for peptide in peptides:
-      matches = list()
-      for part in pepToParts[peptide]:
-        if part in respMap:
-          matches.append(respMap[part])
-      match = best_match(peptide,matches)
-      if not match:
-        unmatched_peptides.append(peptide)
-        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
-        match = {'peptide' : longest_tryptic_peptide}
-      match['tryptic_peptide'] = match['peptide']
-      match['peptide'] = peptide
-      peptideMatches.append(match)
-  else:
-    respMap = {v['peptide']:v for v in unipept_resp}
-    ## map resp back to peptides
-    for peptide in peptides:
-      matches = list()
-      for part in pepToParts[peptide]:
-        if part in respMap and 'total_protein_count' in respMap[part]:
-          matches.append(respMap[part])
-      match = best_match(peptide,matches)
-      if not match:
-        unmatched_peptides.append(peptide)
-        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
-        match = {'peptide' : longest_tryptic_peptide}
-      match['tryptic_peptide'] = match['peptide']
-      match['peptide'] = peptide
-      peptideMatches.append(match)
-  resp = peptideMatches
-  if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
-  ## output results
-  if not (options.unmatched or options.json or options.tsv or options.csv):
-    print >> sys.stdout, str(resp)
-  if options.unmatched:
-    with open(options.unmatched,'w') as outputFile:
-      for peptide in peptides:
-        if peptide in unmatched_peptides:
-          outputFile.write("%s\n" % peptide)
-  if options.json:
-    if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
-      root = get_taxon_json(resp)
-      with open(options.json,'w') as outputFile:
-        outputFile.write(json.dumps(root))  
-    elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
-      with open(options.json,'w') as outputFile:
-        outputFile.write(str(resp))
-  if options.ec_json:
-    if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
-      root = get_ec_json(resp)
-      with open(options.ec_json,'w') as outputFile:
-        outputFile.write(json.dumps(root))
-  if options.tsv or options.csv:
-    rows = []
-    column_names = None
-    if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
-      taxa = None
-      ec_dict = None
-      go_dict = None
-      if options.unipept in ['peptinfo']:
-        (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
-      if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
-        (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
-      if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
-        (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
-      for i,pdict in enumerate(resp):
-        peptide = pdict['peptide'] 
-        total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
-        column_names = ['peptide', 'total_protein_count']
-        vals = [peptide,total_protein_count] 
-        if ec_dict:
-          vals += ec_dict[peptide]
-          column_names += ec_cols
-        if go_dict:
-          vals += go_dict[peptide]
-          column_names += go_cols
-        if taxa:
-          vals += taxa[peptide][1:]
-          column_names += taxon_cols[1:]
-        rows.append(vals)
-    elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
-      (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
-      column_names = taxon_cols
-      rows = taxa.values()
-      for peptide,vals in taxa.iteritems():
-        rows.append(vals)
-    if options.tsv:
-      with open(options.tsv,'w') as outputFile:
-        if column_names:
-          outputFile.write("#%s\n"% '\t'.join(column_names))
-        for vals in rows:
-          outputFile.write("%s\n"% '\t'.join(vals))
-    if options.csv:
-      with open(options.csv,'w') as outputFile:
-        if column_names:
-          outputFile.write("%s\n"% ','.join(column_names))
-        for vals in rows:
-          outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
-  if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
-    column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
-    write_ec_table(options.ec_tsv, resp, column_order)
-  if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
-    column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
-    write_go_table(options.go_tsv, resp, column_order)
+        if options.extra or options.names:
+            column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
+        else:
+            column_order = pept2lca_column_order
+    # map to tryptic peptides
+    pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
+    partToPeps = {}
+    for peptide, parts in pepToParts.items():
+        if options.debug:
+            print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
+        for part in parts:
+            if len(part) > 50:
+                warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
+            if 5 <= len(part) <= 50:
+                partToPeps.setdefault(part, []).append(peptide)
+    trypticPeptides = list(partToPeps.keys())
+    # unipept
+    unipept_resp = []
+    idx = list(range(0, len(trypticPeptides), options.max_request))
+    idx.append(len(trypticPeptides))
+    for i in range(len(idx) - 1):
+        post_data = []
+        if options.equate_il:
+            post_data.append(('equate_il', 'true'))
+        if options.names or options.json:
+            post_data.append(('extra', 'true'))
+            post_data.append(('names', 'true'))
+        elif options.extra or options.json:
+            post_data.append(('extra', 'true'))
+        if options.domains:
+            post_data.append(('domains', 'true'))
+        post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i + 1]]]
+        if options.debug:
+            print('post_data: %s\n' % (str(post_data)), file=sys.stderr)
+        headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
+        url = '%s/%s' % (options.url.rstrip('/'), options.unipept)
+        if options.get:
+            params = '&'.join(["%s=%s" % (i[0], i[1]) for i in post_data])
+            url = '%s.json?%s' % (url, params)
+            req = urllib.request.Request(url)
+        else:
+            url = '%s.json' % (url)
+            data = urllib.parse.urlencode(post_data).encode()
+            if options.debug:
+                print('data:\n%s\n' % (data), file=sys.stderr)
+            req = urllib.request.Request(url, headers=headers, data=data, method='POST')
+        if options.debug:
+            print("url: %s\n" % (str(url)), file=sys.stderr)
+        try:
+            resp = urllib.request.urlopen(req)
+            rdata = resp.read()
+            if options.debug:
+                print("%s %s\n" % (url, str(resp.getcode())), file=sys.stderr)
+            if resp.getcode() == 200:
+                if options.debug:
+                    print("rdata: \n%s\n\n" % (rdata), file=sys.stderr)
+                unipept_resp += json.loads(rdata)
+                # unipept_resp += json.loads(urllib.request.urlopen(req).read())
+        except Exception as e:
+            warn_err('HTTP Error %s\n' % (str(e)), exit_code=None)
+    unmatched_peptides = []
+    peptideMatches = []
+    if options.debug:
+        print("unipept response: %s\n" % str(unipept_resp), file=sys.stderr)
+    if options.unipept in ['pept2prot', 'pept2taxa']:
+        dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'  # should only keep one of these per input peptide
+        # multiple entries per trypticPeptide for pep2prot or pep2taxa
+        mapping = {}
+        for match in unipept_resp:
+            mapping.setdefault(match['peptide'], []).append(match)
+        for peptide in peptides:
+            # Get the intersection of matches to the tryptic parts
+            keyToMatch = None
+            for part in pepToParts[peptide]:
+                if part in mapping:
+                    temp = {match[dupkey]: match for match in mapping[part]}
+                    if keyToMatch:
+                        dkeys = set(keyToMatch.keys()) - set(temp.keys())
+                        for k in dkeys:
+                            del keyToMatch[k]
+                    else:
+                        keyToMatch = temp
+                    # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
+            if not keyToMatch:
+                unmatched_peptides.append(peptide)
+            else:
+                for key, match in keyToMatch.items():
+                    match['tryptic_peptide'] = match['peptide']
+                    match['peptide'] = peptide
+                    peptideMatches.append(match)
+    elif options.unipept in ['pept2lca', 'peptinfo']:
+        # should be one response per trypticPeptide for pep2lca
+        respMap = {v['peptide']: v for v in unipept_resp}
+        # map resp back to peptides
+        for peptide in peptides:
+            matches = list()
+            for part in pepToParts[peptide]:
+                if part in respMap:
+                    matches.append(respMap[part])
+            match = best_match(peptide, matches)
+            if not match:
+                unmatched_peptides.append(peptide)
+                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+                match = {'peptide': longest_tryptic_peptide}
+            match['tryptic_peptide'] = match['peptide']
+            match['peptide'] = peptide
+            peptideMatches.append(match)
+    else:
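+        # functional endpoints (pept2ec, pept2go, pept2interpro, pept2funct): one response per tryptic peptide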
+        respMap = {v['peptide']: v for v in unipept_resp}
+        # map resp back to peptides
+        for peptide in peptides:
+            matches = list()
+            for part in pepToParts[peptide]:
+                if part in respMap and 'total_protein_count' in respMap[part]:
+                    matches.append(respMap[part])
+            match = best_match(peptide, matches)
+            if not match:
+                unmatched_peptides.append(peptide)
+                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+                match = {'peptide': longest_tryptic_peptide}
+            match['tryptic_peptide'] = match['peptide']
+            match['peptide'] = peptide
+            peptideMatches.append(match)
+    resp = peptideMatches
+    if options.debug:
+        print("\nmapped response: %s\n" % str(resp), file=sys.stderr)
+    # output results
+    if not (options.unmatched or options.json or options.tsv or options.csv):
+        print(str(resp))
+    if options.unmatched:
+        with open(options.unmatched, 'w') as outputFile:
+            for peptide in peptides:
+                if peptide in unmatched_peptides:
+                    outputFile.write("%s\n" % peptide)
+    if options.json:
+        if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
+            root = get_taxon_json(resp)
+            with open(options.json, 'w') as outputFile:
+                outputFile.write(json.dumps(root))
+        elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct']:
+            with open(options.json, 'w') as outputFile:
+                outputFile.write(str(resp))
+    if options.ec_json:
+        if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+            root = get_ec_json(resp)
+            with open(options.ec_json, 'w') as outputFile:
+                outputFile.write(json.dumps(root))
+    if options.tsv or options.csv:
+        rows = []
+        column_names = None
+        if options.unipept in ['pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo']:
+            taxa = None
+            ec_dict = None
+            go_dict = None
+            ipr_dict = None
+            if options.unipept in ['peptinfo']:
+                (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+            if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+                (ec_dict, ec_cols) = get_ec_dict(resp, extra=options.extra)
+            if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
+                (go_dict, go_cols) = get_go_dict(resp, extra=options.extra)
+            if options.unipept in ['pept2interpro', 'pept2funct', 'peptinfo']:
+                (ipr_dict, ipr_cols) = get_ipr_dict(resp, extra=options.extra)
+            for i, pdict in enumerate(resp):
+                peptide = pdict['peptide']
+                total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
+                column_names = ['peptide', 'total_protein_count']
+                vals = [peptide, total_protein_count]
+                if ec_dict:
+                    vals += ec_dict[peptide]
+                    column_names += ec_cols
+                if go_dict:
+                    vals += go_dict[peptide]
+                    column_names += go_cols
+                if ipr_dict:
+                    vals += ipr_dict[peptide]
+                    column_names += ipr_cols
+                if taxa:
+                    vals += taxa[peptide][1:]
+                    column_names += taxon_cols[1:]
+                rows.append(vals)
+        elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
+            (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
+            column_names = taxon_cols
+            rows = list(taxa.values())
+        if options.tsv:
+            with open(options.tsv, 'w') as outputFile:
+                if column_names:
+                    outputFile.write("#%s\n" % '\t'.join(column_names))
+                for vals in rows:
+                    outputFile.write("%s\n" % '\t'.join(vals))
+        if options.csv:
+            with open(options.csv, 'w') as outputFile:
+                if column_names:
+                    outputFile.write("%s\n" % ','.join(column_names))
+                for vals in rows:
+                    outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))
+    if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
+        column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
+        write_ec_table(options.ec_tsv, resp, column_order)
+    if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
+        column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
+        write_go_table(options.go_tsv, resp, column_order)
+    if options.ipr_tsv and options.unipept in ['pept2interpro', 'pept2funct', 'peptinfo']:
+        column_order = pept2interpro_extra_column_order if options.extra else pept2interpro_column_order
+        write_ipr_table(options.ipr_tsv, resp, column_order)
 
-if __name__ == "__main__" : __main__()
+
+if __name__ == "__main__":
+    __main__()
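
The rewritten unipept.py above cleaves each input peptide into tryptic fragments, queries Unipept in batches, and maps the responses back to the original peptides. A minimal standalone sketch of the cleave-and-batch step follows; the helper names (cleave, batched) and the example peptides are illustrative and not part of unipept.py.

    import re

    def cleave(peptide):
        # split after K or R unless the next residue is P; keep fragments of
        # 5-50 residues, mirroring the pepToParts / partToPeps filtering above
        parts = re.sub(r'(?<=[RK])(?=[^P])', '\n', peptide).split('\n')
        return [p for p in parts if 5 <= len(p) <= 50]

    def batched(items, size):
        # yield chunks of at most `size` items, as done with options.max_request
        for i in range(0, len(items), size):
            yield items[i:i + size]

    peptides = ['MLTEEQKRSPFVMK', 'AGDFLLK']  # arbitrary example input
    tryptic = sorted({part for p in peptides for part in cleave(p)})
    for batch in batched(tryptic, 200):
        print(batch)  # each batch becomes one set of input[] values in a request
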
--- a/unipept.xml	Tue Jan 22 20:58:28 2019 -0500
+++ b/unipept.xml	Wed Apr 29 23:52:22 2020 +0000
@@ -1,4 +1,4 @@
-<tool id="unipept" name="Unipept" version="4.0.0">
+<tool id="unipept" name="Unipept" version="4.3.0">
     <description>retrieve taxonomy for peptides</description>
     <macros>
         <xml name="equate_il">
@@ -40,7 +40,7 @@
         </xml>
     </macros>
     <requirements>
-        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="3">python</requirement>
     </requirements>
     <stdio>
         <exit_code range="1:" />
@@ -94,6 +94,9 @@
       #if 'go_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2go', 'pept2funct', 'peptinfo']:
         --go_tsv $output_go_tsv
       #end if
+      #if 'ipr_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2interpro', 'pept2funct', 'peptinfo']:
+        --ipr_tsv $output_ipr_tsv
+      #end if
       #if 'unmatched' in str($selected_outputs).split(','):
         --unmatched $output_unmatched
       #end if
@@ -106,6 +109,7 @@
               <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>
               <option value="pept2ec">pept2ec: Tryptic peptides and associated EC terms</option>
               <option value="pept2go">pept2go: Tryptic peptides and associated GO terms</option>
+              <option value="pept2interpro">pept2interpro: Tryptic peptides and associated InterPro entries</option>
               <option value="pept2funct">pept2funct: Tryptic peptides and associated EC and GO terms</option>
               <option value="peptinfo">peptinfo: Tryptic peptides and associated EC and GO terms and lowest common ancestor taxonomy</option>
           </param>
@@ -146,6 +150,14 @@
               </expand>
               <expand macro="domains" />
           </when>
+          <when value="pept2interpro">
+              <expand macro="equate_il" />
+              <expand macro="extra_true">
+                  <help>Return the name and type of the InterPro entry.
+                  </help>
+              </expand>
+              <expand macro="domains" />
+          </when>
           <when value="pept2funct">
               <expand macro="equate_il" />
               <expand macro="extra_true">
@@ -194,6 +206,7 @@
        <option value="csv">Comma Separated Values (.csv) with one line per peptide</option>
         <option value="json">JSON Taxomony Tree (for pept2lca, pep2taxa, and peptinfo)</option>
         <option value="go_tsv">Peptide GO terms in normalized tabular (for pept2go, pept2funct, and peptinfo)</option>
+        <option value="ipr_tsv">Peptide InterPro entries in normalized tabular (for pept2interpro, pept2funct, and peptinfo)</option>
         <option value="ec_tsv">Peptide EC terms in normalized tabular (for pept2ec, pept2funct, and peptinfo)</option>
         <option value="ec_json">JSON EC Coverage Tree (for pept2ec, pep2funct, and peptinfo)</option>
         <option value="unmatched">Unmatched peptides</option>
@@ -244,6 +257,13 @@
             <action name="column_names" type="metadata" default="Peptide,Total Protein Count,GO Term,Protein Count,GO Name" />
         </actions>
       </data> 
+      <data name="output_ipr_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} InterPro tsv"> 
+        <filter>'ipr_tsv' in selected_outputs and unipept['api'] in ('pept2interpro', 'pept2funct', 'peptinfo')</filter>
+        <actions>
+            <action name="column_names" type="metadata" default="Peptide,Total Protein Count,InterPro Code,Protein Count,InterPro Type,InterPro Name" />
+        </actions>
+      </data> 
+
       <data name="output_unmatched" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} unmatched"> 
         <filter>'unmatched' in selected_outputs</filter>
         <actions>
@@ -279,6 +299,8 @@
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
         <param name="selected_outputs" value="json,tsv"/>
+<!--
+-->
         <output name="output_json">
             <assert_contents>
               <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />
@@ -303,7 +325,6 @@
             <assert_contents>
               <has_text text="sapiens" />
               <has_text text="paniscus" />
-              <has_text text="Gorilla" />
             </assert_contents>
         </output>
       </test>
@@ -314,11 +335,12 @@
         <param name="column" value="2"/>
         <param name="extra" value="True"/>
         <param name="names" value="True"/>
-        <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,unmatched"/>
+        <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,ipr_tsv,unmatched"/>
         <output name="output_tsv">
             <assert_contents>
               <has_text text="GO:0004802" />
               <has_text text="2.2.1.1" />
+              <has_text text="IPR005475" />
             </assert_contents>
         </output>
         <output name="output_ec_tsv">
@@ -331,6 +353,11 @@
               <has_text text="GO:0004802" />
             </assert_contents>
         </output>
+        <output name="output_ipr_tsv">
+            <assert_contents>
+              <has_text text="IPR005475" />
+            </assert_contents>
+        </output>
       </test>
     </tests>
     <help><![CDATA[
@@ -485,6 +512,19 @@
         name: Optional, name of the GO-term. Included when the extra parameter is set to true.
 
 
+    **pept2interpro**  - http://unipept.ugent.be/apidocs/pept2interpro
+
+    Returns the functional InterPro entries associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
+
+    By default, each object contains the following information fields extracted from the UniProt record and the InterPro database::
+
+        peptide: the peptide that matched this record
+        total_protein_count: Total amount of proteins matched with the given peptide
+        code: InterPro entry code associated with the current tryptic peptide
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current InterPro code.
+        type: Optional, type of the InterPro entry. Included when the extra parameter is set to true.
+        name: Optional, name of the InterPro entry. Included when the extra parameter is set to true.
+
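As an illustration, the GET request that unipept.py constructs for this method has roughly the following shape; this is a sketch only: the base URL is an assumption standing in for the script's options.url setting, and the peptide is arbitrary.

    import json
    import urllib.parse
    import urllib.request

    base = 'https://api.unipept.ugent.be/api/v1'  # assumed value of options.url
    query = urllib.parse.urlencode(
        [('equate_il', 'true'), ('extra', 'true'), ('input[]', 'AGDFLLK')])
    with urllib.request.urlopen('%s/pept2interpro.json?%s' % (base, query)) as resp:
        for entry in json.loads(resp.read()):
            # documented fields: peptide, total_protein_count, plus the InterPro
            # code, protein_count, type and name when extra=true
            print(entry.get('peptide'), entry.get('total_protein_count'))
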
     **pept2funct**  - http://unipept.ugent.be/apidocs/pept2funct
 
     Returns the functional EC-numbers and GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.
@@ -499,6 +539,10 @@
         go_term: The GO-term associated with the current tryptic peptide.
         protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current GO-term.
         name: Optional, name of the GO-term. Included when the extra parameter is set to true.
+        code: InterPro entry code associated with the current tryptic peptide
+        protein_count: amount of proteins matched with the given tryptic peptide that are labeled with the current InterPro code.
+        type: Optional, type of the InterPro entry. Included when the extra parameter is set to true.
+        name: Optional, name of the InterPro entry. Included when the extra parameter is set to true.
 
 
     **Attributions**