Mercurial > repos > galaxyp > iedb_netmhcpan
comparison nextgen_iedb_api.py @ 0:4fb283cf23a5 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/iedb_netmhcpan commit 0ac7534c8d9f5bfea21b998286f822784e62da08
| author | galaxyp |
|---|---|
| date | Wed, 09 Jul 2025 12:56:02 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4fb283cf23a5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 This file was adapted from the iedb_api.py file of the iedb_api tool. | |
| 5 It uses the newer "Next-Generation" IEDB API, and is constrained to | |
| 6 the mhci and mhcii tool groups. | |
| 7 """ | |
| 8 | |
| 9 import argparse | |
| 10 import json | |
| 11 import os.path | |
| 12 import re | |
| 13 import sys | |
| 14 import time | |
| 15 import urllib.request | |
| 16 from urllib.error import HTTPError | |
| 17 | |
# Predictor methods available in each supported IEDB tool group
mhci_methods = ['netmhcpan_el', 'netmhcpan_ba']
mhcii_methods = ['netmhciipan_el', 'netmhciipan_ba']
tool_group_methods = dict(mhci=mhci_methods, mhcii=mhcii_methods)
all_methods = {*mhci_methods, *mhcii_methods}

# Polling backoff settings
max_backoff_count = 25  # cap on the retry number used to compute the backoff
init_poll_sleep = 10  # base sleep between polls, in seconds
poll_retries = 50  # maximum number of polling requests
requests_before_backoff = 5  # the poll sleep doubles every this many requests
| 30 | |
| 31 | |
def parse_alleles(allelefile):
    """Return the list of alleles read from ``allelefile``.

    The file is read one allele per line. Surrounding whitespace is stripped
    and blank lines are skipped, so they cannot turn into empty entries when
    the caller joins the alleles into a comma-separated string.

    Args:
        allelefile: Path of the text file to read.

    Returns:
        A list of allele strings, in file order.
    """
    with open(allelefile, 'r') as fh:
        stripped = (line.strip() for line in fh)
        return [allele for allele in stripped if allele]
| 40 | |
| 41 | |
def parse_sequence_column(sequence_file_lines, col):
    """Collect peptide sequences from one column of tab-separated lines.

    For every line, the field at index ``col`` is stripped, any '_' and '*'
    characters are removed, and the result is kept only if it consists
    entirely of the accepted amino-acid letters. Rejected values are reported
    through ``warn_err`` and skipped. The first line that has no field at
    ``col`` is reported as an invalid column and ends the scan.

    Returns the list of accepted sequences, in input order.
    """
    peptide_re = re.compile('^[ABCDEFGHIKLMNPQRSTVWY]+$')
    peptides = []
    for line_no, row in enumerate(sequence_file_lines):
        cells = row.split('\t')

        # A row too short for the requested column aborts the whole parse.
        if col >= len(cells):
            warn_err('Invalid value for -c/--column')
            break

        candidate = re.sub('[_*]', '', cells[col].strip())
        if peptide_re.match(candidate):
            peptides.append(candidate)
        else:
            warn_err(f'Line {line_no}, Not a peptide: {candidate}')
    return peptides
| 61 | |
| 62 | |
def iedb_request(req, timeout, retries, error_retry_sleep, response_fn=None, req_data=None):
    """Send a request to IEDB, retrying on HTTP errors and unfinished results.

    Args:
        req: URL string or ``urllib.request.Request`` handed to ``urlopen``.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt.
        retries: Maximum number of attempts.
        error_retry_sleep: Seconds to sleep after an ``HTTPError`` before the
            next attempt.
        response_fn: Optional callback ``response_fn(response, retry)`` that
            reads a 200 response. A falsy return value means "not ready yet"
            and triggers another attempt; any pacing between such attempts is
            up to the callback.
        req_data: Optional request body passed to ``urlopen``.

    Returns:
        The decoded JSON body when no callback is given, otherwise the first
        truthy value returned by the callback. ``None`` if a non-HTTP error
        occurs or no attempt succeeds.
    """
    for retry in range(1, retries + 1):
        try:
            response = urllib.request.urlopen(req, req_data, timeout=timeout)
        except HTTPError as e:
            # Adjacent f-strings keep the message on one line; a backslash
            # continuation would embed the source indentation in the output.
            warn_err(f'{retry} of {retries} Error connecting to IEDB server. '
                     f'HTTP status code: {e.code}')
            # Sleeping after the final attempt would only delay the failure.
            if retry < retries:
                time.sleep(error_retry_sleep)
            continue
        except Exception as e:
            # Anything other than an HTTP status error is treated as fatal.
            warn_err(f'Error getting results from IEDB server: {e}')
            return None

        # The context manager closes the connection on every exit path.
        with response:
            code = response.getcode()
            if code != 200:
                warn_err(f'Error connecting to IEDB server. HTTP status code: {code}')
                continue

            # If no callback, return results
            if not response_fn:
                return json.loads(response.read().decode('utf-8'))

            # Retry if the callback deems it necessary, i.e. results are not ready.
            response_json = response_fn(response, retry)

        if response_json:
            return response_json

    warn_err(f'No successful response from IEDB in {retries} retries')
    return None
| 94 | |
| 95 | |
def pipeline_request(url, tool_group, sequence_text, alleles, length_range,
                     methods, peptide_shift, timeout=300, retries=3, error_retry_sleep=300):
    """Submit a one-stage job to the IEDB pipeline and poll until it is done.

    Args:
        url: Pipeline endpoint that receives the POSTed job description.
        tool_group: Value sent as the stage's ``tool_group``.
        sequence_text: Value sent as the stage's ``input_sequence_text``.
        alleles: Value sent as the ``alleles`` input parameter.
        length_range: Value sent as ``peptide_length_range``.
        methods: Iterable of method names; each becomes a ``binding`` predictor.
        peptide_shift: Value sent as ``peptide_shift`` (JSON null when None).
        timeout: Per-request timeout in seconds.
        retries: Maximum attempts for the submission request.
        error_retry_sleep: Seconds to wait after an HTTP error before retrying.

    Returns:
        The response JSON of the finished job, or ``None`` if submission
        failed, IEDB reported errors or unusable alleles, no results URI was
        returned, or the job did not finish within the polling budget.
    """

    # Set up input parameters for IEDB NetMHCPan or NetMHCIIPan job.
    # 'peptide_shift' is always present; json.dumps turns None into null.
    input_parameters = {
        'alleles': alleles,
        'peptide_length_range': length_range,
        'predictors': [{'type': 'binding', 'method': m} for m in methods],
        'peptide_shift': peptide_shift,
    }

    stage = {
        'stage_number': 1,
        'tool_group': tool_group,
        'input_sequence_text': sequence_text,
        'input_parameters': input_parameters,
    }

    params = {
        'pipeline_id': "",
        'run_stage_range': [1, 1],
        'stages': [stage],
    }

    req_data = json.dumps(params).encode('utf-8')
    req = urllib.request.Request(url, method='POST')
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    # Header values are strings; do not rely on implicit int conversion.
    req.add_header('Content-Length', str(len(req_data)))

    # Make an initial request to submit job
    response_json = iedb_request(req, timeout, retries, error_retry_sleep, req_data=req_data)
    if not response_json:
        warn_err('Initial request failed.')
        return None

    # Check response from job submission
    warnings = response_json.get('warnings')
    if warnings:
        invalid_alleles = False
        for warning in warnings:
            if 'cannot predict binding for allele' in warning:
                warn_err(f"Error: Bad allelle input. {warning}")
                invalid_alleles = True
        # An unusable allele is fatal; other warnings are only reported.
        if invalid_alleles:
            return None

        warn_err(f'Warnings from IEDB: {warnings}')

    errors = response_json.get('errors')
    if errors:
        warn_err(f'Errors from IEDB: {errors}')
        return None

    results_uri = response_json.get('results_uri')
    if not results_uri:
        warn_err('No results URI provided from IEDB.')
        return None

    # Callback function to rate-limit poll requests
    def poll_response_fn(response, retry):
        response_json = json.loads(response.read().decode('utf-8'))
        if response_json['status'] == 'done':
            return response_json

        if retry >= poll_retries:
            # Last poll: report and return at once; sleeping here would only
            # delay the failure by the full backoff interval.
            warn_err('Job not finished in maximum allowed time.')
            return None

        # Double sleep every requests_before_backoff requests, up to a cap
        backoff_count = min(retry, max_backoff_count)
        sleep_duration = init_poll_sleep * 2 ** (backoff_count // requests_before_backoff)
        time.sleep(sleep_duration)
        return None

    # Submit polling for results
    response_json = iedb_request(results_uri, timeout, poll_retries,
                                 error_retry_sleep, response_fn=poll_response_fn)
    if not response_json:
        warn_err('Retrieving results failed.')
    return response_json
| 182 | |
| 183 | |
def warn_err(msg, exit_code=None):
    """Write ``msg`` to stderr and optionally terminate the program.

    Args:
        msg: Text to report; a newline is appended.
        exit_code: When not ``None``, the process exits with this status
            after the message is flushed. An explicit ``0`` is honoured
            rather than being treated as "do not exit".

    Raises:
        SystemExit: If ``exit_code`` is given.
    """
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()
    if exit_code is not None:
        sys.exit(exit_code)
| 189 | |
| 190 | |
def add_reversed_sequences(file_lines, file_format):
    """Return the input sequences with a reversed copy after each one.

    Args:
        file_lines: Lines of a FASTA file, or a plain list of sequences.
        file_format: ``'fasta'`` to parse ``file_lines`` as FASTA records;
            any other value treats every line as one sequence.

    Returns:
        A list of lines. For FASTA input each record becomes four lines:
        header, sequence joined onto one line, the header with its leading
        '>' replaced by '>reversed_', and the reversed sequence. For other
        input each sequence is followed by its reverse.

    Raises:
        SystemExit: With status 1 if FASTA input does not start with a
            header line.
    """
    if file_format != 'fasta':
        # Simple list of peptides: reverse them sequentially.
        sequences_with_reversed = []
        for seq in file_lines:
            sequences_with_reversed.append(seq)
            sequences_with_reversed.append(seq[::-1])
        return sequences_with_reversed

    # Group lines into (header, sequence parts) records. Blank lines carry no
    # data and are skipped; indexing them with [0] would raise IndexError.
    records = []
    for line in file_lines:
        if not line.strip():
            continue
        if line[0] == '>':
            records.append((line, []))
        elif not records:
            # Sequence data before any header.
            print('Invalid FASTA. Exiting.', file=sys.stderr)
            sys.exit(1)
        else:
            records[-1][1].append(line)

    sequences_with_reversed = []
    for seq_header, parts in records:
        # Aggregate sequence into a single line
        seq = ''.join(parts)
        sequences_with_reversed.append(seq_header)
        sequences_with_reversed.append(seq)

        # Only the leading '>' marks the header; a '>' later in the
        # description must be left untouched.
        sequences_with_reversed.append('>reversed_' + seq_header[1:])
        sequences_with_reversed.append(seq[::-1])

    return sequences_with_reversed
| 244 | |
| 245 | |
def __main__():
    """Command-line entry point.

    Parses the arguments, reads the alleles and the peptide input (passed
    through as-is, or taken from one column of a tabular file), optionally
    adds a reversed copy after every sequence, submits the job to the IEDB
    pipeline and writes the returned peptide table as tab-separated text with
    an extra trailing 'reversed' column.

    Exits with status 1 when input cannot be read, no sequences are found,
    the job fails or the results cannot be parsed; argument errors exit
    through argparse.
    """
    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-T', '--tool-group',
                        dest='tool_group',
                        default='mhci',
                        choices=tool_group_methods.keys(),
                        help='IEDB API Tool Group')
    parser.add_argument('-m', '--method',
                        action="append",
                        required=True,
                        choices=all_methods,
                        help='prediction method')
    parser.add_argument('-A', '--allelefile',
                        required=True,
                        help='File of HLA alleles')
    # Required: the value is always parsed below, so omitting it used to
    # crash with an AttributeError instead of a usage message.
    parser.add_argument('-l', '--lengthrange',
                        required=True,
                        help='length range for which to make predictions for alleles')
    parser.add_argument('-P', '--peptide_shift',
                        type=int,
                        default=None,
                        help='Peptide Shift')
    parser.add_argument('-i', '--input',
                        required=True,
                        help='Input file for peptide sequences '
                             + '(fasta or tabular)')
    parser.add_argument('-c', '--column',
                        default=None,
                        help='Zero-indexed peptide column in a tabular input file')
    parser.add_argument('-o', '--output',
                        required=True,
                        help='Output file for query results')
    parser.add_argument('-t', '--timeout',
                        type=int,
                        default=600,
                        help='Seconds to wait for server response')
    parser.add_argument('-r', '--retries',
                        type=int,
                        default=5,
                        help='Number of times to retry failed server query')
    parser.add_argument('-S', '--sleep',
                        type=int,
                        default=300,
                        help='Seconds to wait between failed server query retries')
    parser.add_argument('-R', '--add-reversed',
                        dest='add_reversed',
                        action='store_true',
                        help='Add a reversed copy after each input sequence '
                             + 'and identify it in the output.')
    args = parser.parse_args()

    try:
        allele_string = ','.join(parse_alleles(args.allelefile))
    except (OSError, UnicodeError) as e:
        warn_err(f'Unable to open allele file: {e}', exit_code=1)

    try:
        length_range = [int(i) for i in args.lengthrange.split(',')]
    except ValueError:
        parser.error('-l/--lengthrange must be a comma-separated list of integers')

    pipeline_url = 'https://api-nextgen-tools.iedb.org/api/v1/pipeline'

    # If sequences submitted as a file, parse out sequences.
    try:
        with open(args.input) as inf:
            sequence_file_contents = inf.read()
    except (OSError, UnicodeError) as e:
        warn_err(f'Unable to open input file: {e}', exit_code=1)

    sequence_file_lines = sequence_file_contents.splitlines()

    # Pick out sequences if input file has multiple columns,
    # otherwise submit list of sequences as-is.
    if not args.column:
        # IEDB may take FASTA files directly, so input contents as-is
        if args.add_reversed:
            sequence_text = '\n'.join(add_reversed_sequences(sequence_file_lines, 'fasta'))
        else:
            sequence_text = sequence_file_contents
    else:
        try:
            column = int(args.column)
        except ValueError:
            parser.error('-c/--column must be an integer')
        sequences = parse_sequence_column(sequence_file_lines, column)
        if args.add_reversed:
            sequence_text = '\n'.join(add_reversed_sequences(sequences, 'tsv'))
        else:
            sequence_text = '\n'.join(sequences)

    if not sequence_text.strip():
        warn_err('Error parsing sequences', exit_code=1)

    # Submit job and return results
    results = pipeline_request(pipeline_url, args.tool_group, sequence_text,
                               allele_string, length_range, args.method,
                               peptide_shift=args.peptide_shift, timeout=args.timeout,
                               retries=args.retries, error_retry_sleep=args.sleep)
    if not results:
        warn_err('Job failed. Exiting.', exit_code=1)

    seq_num_index = None
    try:
        peptide_table = [t for t in results['data']['results'] if t['type'] == 'peptide_table'][0]
        peptide_table_data = peptide_table['table_data']
        peptide_table_columns = peptide_table['table_columns']

        # If we reversed peptides prior to IEDB input, find column index of
        # sequence number so we can identify which come from reversed input.
        if args.add_reversed:
            for i, column_info in enumerate(peptide_table_columns):
                if column_info['display_name'] == 'seq #':
                    seq_num_index = i
                    break
    except (KeyError, IndexError, TypeError) as e:
        warn_err(f'Error parsing IEDB results: {e}', exit_code=1)

    # Without the sequence-number column the reversed flag cannot be computed;
    # fail clearly instead of hitting an unbound variable while writing.
    if args.add_reversed and seq_num_index is None:
        warn_err("Error parsing IEDB results: no 'seq #' column found", exit_code=1)

    output_path = os.path.abspath(args.output)
    with open(output_path, 'w') as output_file:
        # Write column names
        display_names = '\t'.join([c['display_name'] for c in peptide_table_columns] + ['reversed'])
        print(display_names, file=output_file)

        # Write data
        for values in peptide_table_data:
            if args.add_reversed:
                seq_number = int(values[seq_num_index])
                # Every original input sequence is followed by its reversed sequence,
                # so we know even sequence numbers are reversed.
                reversed_val = str(seq_number % 2 == 0).lower()
            else:
                reversed_val = 'false'
            row = '\t'.join([str(v) for v in values] + [reversed_val])
            print(row, file=output_file)


if __name__ == "__main__":
    __main__()
