annotate trimmer.py @ 0:c824894e0827 draft

planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
author nick
date Tue, 01 Dec 2015 21:11:51 -0500
parents
children 7ef568cbf13b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
1 #!/usr/bin/env python
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
2 from __future__ import division
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
3 import sys
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
4 import argparse
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
5 import getreads
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
6
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
7 OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'}
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
10 N's."""
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
11
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
12
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
13 def main(argv):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
14
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
15 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
16 parser.set_defaults(**OPT_DEFAULTS)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
17
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
18 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
19 default=sys.stdin,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
20 help='Input reads (mate 1). Omit to read from stdin.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
21 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
22 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
23 'entirely, the other will also be lost).')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
24 parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
25 default=sys.stdout,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
26 help='Output file for mate 1. WARNING: Will overwrite.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
27 parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
28 help='Output file for mate 2. WARNING: Will overwrite.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
29 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
30 help='Input read format.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
31 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
32 help='Output read format. Default: whatever the input format is.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
33 parser.add_argument('-b', '--filt-bases',
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
34 help='The bases to filter on. Case-insensitive. Default: %(default)s.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
35 parser.add_argument('-t', '--thres', type=float,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
36 help='The threshold. The read will be trimmed once the proportion of filter bases in the '
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
37 'window exceed this fraction (not a percentage). Default: %(default)s.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
38 parser.add_argument('-w', '--window', dest='win_len', type=int,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
39 help='Window size for trimming. Default: %(default)s.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
40 parser.add_argument('-i', '--invert', action='store_true',
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
41 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
42 parser.add_argument('-m', '--min-length', type=int,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
43 help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
44 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
45 'pair must exceed this length to be kept. Set to 0 to only omit empty reads.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
46 parser.add_argument('--error',
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
47 help='Fail with this error message (useful for Galaxy tool).')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
48 parser.add_argument('-A', '--acgt', action='store_true',
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
49 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
50 parser.add_argument('-I', '--iupac', action='store_true',
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
51 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
52
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
53 args = parser.parse_args(argv[1:])
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
54
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
55 if args.error:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
56 fail('Error: '+args.error)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
57
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
58 # Catch invalid argument combinations.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
59 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
60 fail('Error: If giving two input files (paired end), must specify both output files.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
61 # Determine filetypes, open input file parsers.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
62 filetype1 = get_filetype(args.infile1, args.filetype)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
63 file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
64 if args.infile2:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
65 paired = True
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
66 filetype2 = get_filetype(args.infile2, args.filetype)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
67 file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
68 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
69 filetype2 = None
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
70 file2_parser = None
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
71 paired = False
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
72 # Override output filetypes if it was specified on the command line.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
73 if args.out_filetype:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
74 filetype1 = args.out_filetype
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
75 filetype2 = args.out_filetype
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
76
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
77 # Determine the filter bases and whether to invert the selection.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
78 filt_bases = args.filt_bases
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
79 invert = args.invert
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
80 if args.acgt:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
81 filt_bases = 'ACGT'
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
82 invert = True
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
83 elif args.iupac:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
84 filt_bases = 'ACGTUWSMKRYBDHVN-'
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
85 invert = True
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
86
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
87 # Do the actual trimming.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
88 try:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
89 trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
90 paired, args.win_len, args.thres, filt_bases, invert, args.min_length)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
91 finally:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
92 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
93 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
94 filehandle.close()
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
95
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
96
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
97 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
98 win_len, thres, filt_bases, invert, min_length):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
99 """Trim all the reads in the input file(s), writing to the output file(s)."""
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
100 read1 = None
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
101 read2 = None
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
102 while True:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
103 # Read in the reads.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
104 try:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
105 read1 = next(file1_parser)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
106 if paired:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
107 read2 = next(file2_parser)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
108 except StopIteration:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
109 break
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
110 # Do trimming.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
111 read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
112 if filetype1 == 'fastq':
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
113 # If the output filetype is FASTQ, trim the quality scores too.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
114 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
115 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
116 qual1 = read1.qual or 'z' * len(read1.seq)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
117 read1.qual = qual1[:len(read1.seq)]
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
118 if paired:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
119 read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
120 if filetype2 == 'fastq':
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
121 qual2 = read2.qual or 'z' * len(read2.seq)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
122 read2.qual = qual2[:len(read2.seq)]
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
123 # Output reads if they both pass the minimum length threshold (if any was given).
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
124 if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
125 write_read(outfile1, read1, filetype1)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
126 write_read(outfile2, read2, filetype2)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
127 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
128 # Output read if it passes the minimum length threshold (if any was given).
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
129 if min_length is None or len(read1.seq) >= min_length:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
130 write_read(outfile1, read1, filetype1)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
131
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
132
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
133 def get_filetype(infile, filetype_arg):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
134 if infile is sys.stdin:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
135 if filetype_arg:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
136 filetype = filetype_arg
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
137 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
138 fail('Error: You must specify the --format if reading from stdin.')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
139 elif infile:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
140 if filetype_arg:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
141 filetype = filetype_arg
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
142 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
143 if infile.name.endswith('.fa') or infile.name.endswith('.fasta'):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
144 filetype = 'fasta'
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
145 elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
146 filetype = 'fastq'
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
147 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
148 fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
149 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
150 fail('Error: infile is {}'.format(infile))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
151 return filetype
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
152
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
153
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
154 def write_read(filehandle, read, filetype):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
155 if filetype == 'fasta':
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
156 filehandle.write('>{name}\n{seq}\n'.format(**vars(read)))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
157 elif filetype == 'fastq':
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
158 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
159
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
160
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
161 def trim_read(seq, win_len, thres, filt_bases, invert):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
162 """Trim an individual read and return its trimmed sequence.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
163 This will track the frequency of bad bases in a window of length win_len, and trim once the
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
164 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
165 the window (the first window with a frequency below thres). The "bad" bases are the ones in
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
166 filt_bases if invert is False, or any base NOT in filt_bases if invert is True."""
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
167 # Algorithm:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
168 # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5')
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
169 # end, we append new bases to the right end of the window and pop them from the left end.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
170 # Each base is only examined twice: when it enters the window and when it leaves it.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
171 # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
172 # bases enter the window and decrementing it when they leave.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
173 # We also track the location of bad bases in the window with bad_bases_coords so we can figure out
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
174 # where to cut if we have to trim.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
175 max_bad_bases = win_len * thres
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
176 window = []
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
177 bad_bases_count = 0
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
178 bad_bases_coords = []
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
179 for coord, base in enumerate(seq.upper()):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
180 # Shift window, adjust bad_bases_count and bad_bases_coords list.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
181 window.append(base)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
182 # Is the new base we're adding to the window a bad base?
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
183 if invert:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
184 bad_base = base not in filt_bases
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
185 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
186 bad_base = base in filt_bases
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
187 # If so, increment the total and add its coordinate to the window.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
188 if bad_base:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
189 bad_bases_count += 1
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
190 bad_bases_coords.append(coord)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
191 if len(window) > win_len:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
192 first_base = window.pop(0)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
193 # Is the base we're removing (the first base in the window) a bad base?
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
194 if invert:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
195 bad_base = first_base not in filt_bases
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
196 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
197 bad_base = first_base in filt_bases
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
198 # If so, decrement the total and remove its coordinate from the window.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
199 if bad_base:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
200 bad_bases_count -= 1
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
201 bad_bases_coords.pop(0)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
202 # print bad_bases_coords
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
203 # Are we over the threshold?
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
204 if bad_bases_count > max_bad_bases:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
205 break
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
206 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window.
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
207 if bad_bases_count > max_bad_bases:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
208 first_bad_base = bad_bases_coords[0]
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
209 return seq[0:first_bad_base]
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
210 else:
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
211 return seq
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
212
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
213
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
214 def fail(message):
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
215 sys.stderr.write(message+"\n")
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
216 sys.exit(1)
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
217
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
218
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
219 if __name__ == '__main__':
c824894e0827 planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
nick
parents:
diff changeset
220 sys.exit(main(sys.argv))