annotate convert_bc_to_binary_RY.py @ 58:bbbae1ee87e0 draft default tip

fix for flexbar with small data issue
author rnateam
date Tue, 16 Feb 2016 10:08:58 -0500
parents 0b9aab6aaebf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
5 from string import maketrans
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
6 from sys import stdout
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
7 from Bio import SeqIO
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
8 from Bio.Seq import Seq
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
9 from Bio.Alphabet import IUPAC
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 8
diff changeset
10
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
# Text shown verbatim at the top of --help (RawDescriptionHelpFormatter).
tool_description = """
Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.

A and G are converted to nucleotide code R. T, U and C are converted to Y. By
default output is written to stdout.

Example usage:
- write converted sequences from file in.fa to file out.fa:
convert_bc_to_binary_RY.py in.fa --fasta-format --outfile out.fa
"""

# Text shown verbatim at the bottom of --help.
epilog = """
Author: Daniel Maticzka
Copyright: 2015
License: Apache
Email: maticzkd@informatik.uni-freiburg.de
Status: Testing
"""
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
# parse command line arguments
# RawDescriptionHelpFormatter keeps the line breaks of description/epilog.
parser = argparse.ArgumentParser(
    description=tool_description,
    epilog=epilog,
    formatter_class=argparse.RawDescriptionHelpFormatter)
# positional arguments
parser.add_argument(
    "infile",
    help="Path to fastq input file (fasta if --fasta-format is given).")
# optional arguments
parser.add_argument(
    "-o", "--outfile",
    help="Write results to this file.")
parser.add_argument(
    "--fasta-format",
    dest="fasta_format",
    help="Read and write fasta instead of fastq format.",
    action="store_true")
parser.add_argument(
    "-v", "--verbose",
    help="Be verbose.",
    action="store_true")
parser.add_argument(
    "-d", "--debug",
    help="Print lots of debugging information",
    action="store_true")
parser.add_argument(
    '--version',
    action='version',
    version='0.1.0')
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
60
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
def translate_nt_to_RY(seq):
    """Translate a nucleotide string to the binary RY alphabet.

    Upper-case A and G become R; upper-case C, U and T become Y. Every
    other character (including lower-case letters) is passed through
    unchanged.

    Args:
        seq: nucleotide sequence as a plain string.

    Returns:
        The translated string, same length as ``seq``.

    >>> translate_nt_to_RY("ACGUTACGUT")
    'RYRYYRYRYY'
    """
    try:
        # Python 3: maketrans is a static method of str.
        trans_table = str.maketrans("AGCUT", "RRYYY")
    except AttributeError:
        # Python 2: str has no maketrans; fall back to the string module.
        from string import maketrans as _py2_maketrans
        trans_table = _py2_maketrans("AGCUT", "RRYYY")
    trans_seq = seq.translate(trans_table)
    # Lazy %-formatting: the message is only built when debug is enabled.
    logging.debug("%s -> %s", seq, trans_seq)
    return trans_seq
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
def translate_nt_to_RY_iterator(robj):
    """Translate the sequences of SeqRecords to the RY alphabet.

    Each record is modified in place (its ``seq`` is replaced by the
    RY-translated sequence) and then yielded, so records are processed
    lazily one at a time.

    Args:
        robj: iterable of SeqRecord objects, e.g. from ``SeqIO.parse``.

    Yields:
        The same record objects with translated sequences; per-letter
        annotations (e.g. fastq qualities) are kept.
    """
    for record in robj:
        # Detach the per-letter annotations while the sequence is replaced,
        # then re-attach them. Presumably needed because Biopython rejects
        # assigning .seq on a record that still carries them -- confirm
        # against the SeqRecord docs. Done unconditionally so the function
        # does not depend on the script-level `args`; for records without
        # annotations (fasta) this just swaps one empty dict for another.
        saved_letter_annotations = record.letter_annotations
        record.letter_annotations = {}
        # NOTE(review): R/Y are ambiguity codes, so IUPAC.unambiguous_dna
        # looks like the wrong alphabet label; kept as in the original.
        record.seq = Seq(translate_nt_to_RY(str(record.seq)),
                         IUPAC.unambiguous_dna)
        record.letter_annotations = saved_letter_annotations
        yield record
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
# handle arguments
args = parser.parse_args()
# Pick the log level from the flags; --debug wins over --verbose. Without
# either flag the logging default level applies.
if args.debug:
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
elif args.verbose:
    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
logging.info("Parsed arguments:")
# The original logged the outfile twice and never the infile.
logging.info(" infile: '%s'", args.infile)
if args.outfile:
    logging.info(" outfile: enabled writing to file")
    logging.info(" outfile: '%s'", args.outfile)
logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
99
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
# get input iterator
# Input and output always share one format, chosen by --fasta-format.
seq_format = "fasta" if args.fasta_format else "fastq"
# NOTE(review): "rU" is a Python 2 idiom for universal newlines; newer
# Python 3 releases reject it. Kept because the file targets Python 2.
input_handle = open(args.infile, "rU")
try:
    input_seq_iterator = SeqIO.parse(input_handle, seq_format)
    convert_seq_iterator = translate_nt_to_RY_iterator(input_seq_iterator)
    output_handle = (open(args.outfile, "w") if args.outfile is not None else stdout)
    try:
        # Records are parsed, translated and written one at a time.
        SeqIO.write(convert_seq_iterator, output_handle, seq_format)
    finally:
        # Only close a file this script opened; stdout is just flushed.
        if output_handle is stdout:
            output_handle.flush()
        else:
            output_handle.close()
finally:
    # Release the input file even if parsing or writing failed.
    input_handle.close()