diff utils/get_msa.py @ 4:af383638de66 draft

planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
author nick
date Mon, 23 Nov 2015 18:44:23 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/get_msa.py	Mon Nov 23 18:44:23 2015 -0500
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+from __future__ import division
+import os
+import sys
+import argparse
+import tempfile
+import subprocess
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+import consensus
+import seqtools
+
+OPT_DEFAULTS = {'format':'plain', 'qual':20, 'qual_format':'sanger'}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """"""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.set_defaults(**OPT_DEFAULTS)
+
+  parser.add_argument('seqs', metavar='sequence', nargs='*',
+    help='The alignment.')
+  parser.add_argument('-i', '--input',
+    help='Provide the sequences in this input file instead of as command-line arguments. '
+         'Give "-" to use stdin.')
+  parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
+    help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
+         'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
+         'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
+         'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
+         'possible alignments to output with --mate and --order.')
+  parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
+  parser.add_argument('-o', '--order', choices=('ab', 'ba'))
+  parser.add_argument('-F', '--qual-format', choices=('sanger',))
+  parser.add_argument('-q', '--qual', type=int,
+    help='Quality threshold: Default: %(default)s')
+
+  args = parser.parse_args(argv[1:])
+
+  qual_thres = ' '
+  if args.qual_format == 'sanger':
+    qual_thres = chr(args.qual + 33)
+  else:
+    fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))
+  # Check arguments.
+  if not (args.seqs or args.input):
+    fail('Error: You must provide sequences either in a file with --input or as arguments.')
+  elif args.seqs and args.input:
+    fail('Error: You cannot provide sequences in both a file and command-line arguments.')
+  if args.format == 'duplex' and not (args.mate and args.order):
+    fail('Error: If the --format is duplex, you must specify a --mate and --order.')
+
+  # Read input.
+  quals = []
+  if args.input:
+    if args.format == 'plain':
+      if args.input == '-':
+        seqs = [line.strip() for line in sys.stdin]
+      else:
+        with open(args.input) as infile:
+          seqs = [line.strip() for line in infile]
+    elif args.format == 'duplex':
+      if args.input == '-':
+        (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
+      else:
+        with open(args.input) as infile:
+          (seqs, quals) = parse_duplex(infile, args.mate, args.order)
+  else:
+    seqs = args.seqs
+
+  align = make_msa(seqs)
+  if quals:
+    quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
+  cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)
+
+  output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))
+
+  for seq in output:
+    print seq
+
+
+def parse_duplex(infile, mate, order):
+  seqs = []
+  quals = []
+  for line in infile:
+    (bar, this_order, name1, seq1, qual1, name2, seq2, qual2) = line.rstrip('\r\n').split('\t')
+    if this_order == order:
+      if mate == 1:
+        seqs.append(seq1)
+        quals.append(qual1)
+      elif mate == 2:
+        seqs.append(seq2)
+        quals.append(qual2)
+  return seqs, quals
+
+
+def make_msa(seqs):
+  """Perform a multiple sequence alignment on a set of sequences.
+  Uses MAFFT."""
+  i = 0
+  #TODO: Replace with tempfile.mkstemp()?
+  with tempfile.NamedTemporaryFile('w', delete=False, prefix='msa.') as family_file:
+    for seq in seqs:
+      i+=1
+      header = '>{}\n'.format(i)
+      family_file.write(header)
+      family_file.write(seq+'\n')
+  with open(os.devnull, 'w') as devnull:
+    try:
+      command = ['mafft', '--nuc', '--quiet', family_file.name]
+      output = subprocess.check_output(command, stderr=devnull)
+    except (OSError, subprocess.CalledProcessError):
+      return None
+  os.remove(family_file.name)
+  return read_fasta(output)
+
+
+def read_fasta(fasta):
+  """Quick and dirty FASTA parser. Return only the list of sequences (no names).
+  Warning: Reads the entire contents of the file into memory at once."""
+  sequences = []
+  sequence = ''
+  for line in fasta.splitlines():
+    if line.startswith('>'):
+      if sequence:
+        sequences.append(sequence)
+      sequence = ''
+      continue
+    sequence += line.strip()
+  if sequence:
+    sequences.append(sequence)
+  return sequences
+
+
+def format_alignment(cons, seqs, quals=(), qual_thres=32, id_char='.'):
+  output = [cons.upper()]
+  for i, seq in enumerate(seqs):
+    outseq = ''
+    for j, seq_base in enumerate(seq.upper()):
+      if quals and seq_base != '-' and ord(quals[i][j]) < qual_thres:
+        outseq += ' '
+      elif cons[j] == seq_base:
+        outseq += id_char
+      else:
+        outseq += seq_base
+    output.append(outseq)
+  return output
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))