Mercurial > repos > nick > duplex
comparison utils/get_msa.py @ 4:af383638de66 draft
planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
| author | nick |
|---|---|
| date | Mon, 23 Nov 2015 18:44:23 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:13bcc2f459b0 | 4:af383638de66 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 import os | |
| 4 import sys | |
| 5 import argparse | |
| 6 import tempfile | |
| 7 import subprocess | |
| 8 sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) | |
| 9 import consensus | |
| 10 import seqtools | |
| 11 | |
| 12 OPT_DEFAULTS = {'format':'plain', 'qual':20, 'qual_format':'sanger'} | |
| 13 USAGE = "%(prog)s [options]" | |
| 14 DESCRIPTION = """""" | |
| 15 | |
| 16 | |
| 17 def main(argv): | |
| 18 | |
| 19 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 20 parser.set_defaults(**OPT_DEFAULTS) | |
| 21 | |
| 22 parser.add_argument('seqs', metavar='sequence', nargs='*', | |
| 23 help='The alignment.') | |
| 24 parser.add_argument('-i', '--input', | |
| 25 help='Provide the sequences in this input file instead of as command-line arguments. ' | |
| 26 'Give "-" to use stdin.') | |
| 27 parser.add_argument('-f', '--format', choices=('plain', 'duplex'), | |
| 28 help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is ' | |
| 29 'the 8-column format of the family-sorted read data from the duplex pipeline. It must be ' | |
| 30 'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and ' | |
| 31 'beta-alpha strands). If "duplex" is given, you must also specify which of the four ' | |
| 32 'possible alignments to output with --mate and --order.') | |
| 33 parser.add_argument('-m', '--mate', type=int, choices=(1, 2)) | |
| 34 parser.add_argument('-o', '--order', choices=('ab', 'ba')) | |
| 35 parser.add_argument('-F', '--qual-format', choices=('sanger',)) | |
| 36 parser.add_argument('-q', '--qual', type=int, | |
| 37 help='Quality threshold: Default: %(default)s') | |
| 38 | |
| 39 args = parser.parse_args(argv[1:]) | |
| 40 | |
| 41 qual_thres = ' ' | |
| 42 if args.qual_format == 'sanger': | |
| 43 qual_thres = chr(args.qual + 33) | |
| 44 else: | |
| 45 fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format)) | |
| 46 # Check arguments. | |
| 47 if not (args.seqs or args.input): | |
| 48 fail('Error: You must provide sequences either in a file with --input or as arguments.') | |
| 49 elif args.seqs and args.input: | |
| 50 fail('Error: You cannot provide sequences in both a file and command-line arguments.') | |
| 51 if args.format == 'duplex' and not (args.mate and args.order): | |
| 52 fail('Error: If the --format is duplex, you must specify a --mate and --order.') | |
| 53 | |
| 54 # Read input. | |
| 55 quals = [] | |
| 56 if args.input: | |
| 57 if args.format == 'plain': | |
| 58 if args.input == '-': | |
| 59 seqs = [line.strip() for line in sys.stdin] | |
| 60 else: | |
| 61 with open(args.input) as infile: | |
| 62 seqs = [line.strip() for line in infile] | |
| 63 elif args.format == 'duplex': | |
| 64 if args.input == '-': | |
| 65 (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order) | |
| 66 else: | |
| 67 with open(args.input) as infile: | |
| 68 (seqs, quals) = parse_duplex(infile, args.mate, args.order) | |
| 69 else: | |
| 70 seqs = args.seqs | |
| 71 | |
| 72 align = make_msa(seqs) | |
| 73 if quals: | |
| 74 quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ') | |
| 75 cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True) | |
| 76 | |
| 77 output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres)) | |
| 78 | |
| 79 for seq in output: | |
| 80 print seq | |
| 81 | |
| 82 | |
| 83 def parse_duplex(infile, mate, order): | |
| 84 seqs = [] | |
| 85 quals = [] | |
| 86 for line in infile: | |
| 87 (bar, this_order, name1, seq1, qual1, name2, seq2, qual2) = line.rstrip('\r\n').split('\t') | |
| 88 if this_order == order: | |
| 89 if mate == 1: | |
| 90 seqs.append(seq1) | |
| 91 quals.append(qual1) | |
| 92 elif mate == 2: | |
| 93 seqs.append(seq2) | |
| 94 quals.append(qual2) | |
| 95 return seqs, quals | |
| 96 | |
| 97 | |
| 98 def make_msa(seqs): | |
| 99 """Perform a multiple sequence alignment on a set of sequences. | |
| 100 Uses MAFFT.""" | |
| 101 i = 0 | |
| 102 #TODO: Replace with tempfile.mkstemp()? | |
| 103 with tempfile.NamedTemporaryFile('w', delete=False, prefix='msa.') as family_file: | |
| 104 for seq in seqs: | |
| 105 i+=1 | |
| 106 header = '>{}\n'.format(i) | |
| 107 family_file.write(header) | |
| 108 family_file.write(seq+'\n') | |
| 109 with open(os.devnull, 'w') as devnull: | |
| 110 try: | |
| 111 command = ['mafft', '--nuc', '--quiet', family_file.name] | |
| 112 output = subprocess.check_output(command, stderr=devnull) | |
| 113 except (OSError, subprocess.CalledProcessError): | |
| 114 return None | |
| 115 os.remove(family_file.name) | |
| 116 return read_fasta(output) | |
| 117 | |
| 118 | |
| 119 def read_fasta(fasta): | |
| 120 """Quick and dirty FASTA parser. Return only the list of sequences (no names). | |
| 121 Warning: Reads the entire contents of the file into memory at once.""" | |
| 122 sequences = [] | |
| 123 sequence = '' | |
| 124 for line in fasta.splitlines(): | |
| 125 if line.startswith('>'): | |
| 126 if sequence: | |
| 127 sequences.append(sequence) | |
| 128 sequence = '' | |
| 129 continue | |
| 130 sequence += line.strip() | |
| 131 if sequence: | |
| 132 sequences.append(sequence) | |
| 133 return sequences | |
| 134 | |
| 135 | |
| 136 def format_alignment(cons, seqs, quals=(), qual_thres=32, id_char='.'): | |
| 137 output = [cons.upper()] | |
| 138 for i, seq in enumerate(seqs): | |
| 139 outseq = '' | |
| 140 for j, seq_base in enumerate(seq.upper()): | |
| 141 if quals and seq_base != '-' and ord(quals[i][j]) < qual_thres: | |
| 142 outseq += ' ' | |
| 143 elif cons[j] == seq_base: | |
| 144 outseq += id_char | |
| 145 else: | |
| 146 outseq += seq_base | |
| 147 output.append(outseq) | |
| 148 return output | |
| 149 | |
| 150 | |
| 151 def fail(message): | |
| 152 sys.stderr.write(message+"\n") | |
| 153 sys.exit(1) | |
| 154 | |
| 155 if __name__ == '__main__': | |
| 156 sys.exit(main(sys.argv)) |
