Mercurial > repos > nick > sequence_content_trimmer
comparison trimmer.py @ 0:c824894e0827 draft
planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
| author | nick |
|---|---|
| date | Tue, 01 Dec 2015 21:11:51 -0500 |
| parents | |
| children | 7ef568cbf13b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c824894e0827 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 import sys | |
| 4 import argparse | |
| 5 import getreads | |
| 6 | |
| 7 OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'} | |
| 8 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" | |
| 9 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of | |
| 10 N's.""" | |
| 11 | |
| 12 | |
| 13 def main(argv): | |
| 14 | |
| 15 parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) | |
| 16 parser.set_defaults(**OPT_DEFAULTS) | |
| 17 | |
| 18 parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), | |
| 19 default=sys.stdin, | |
| 20 help='Input reads (mate 1). Omit to read from stdin.') | |
| 21 parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), | |
| 22 help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' | |
| 23 'entirely, the other will also be lost).') | |
| 24 parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'), | |
| 25 default=sys.stdout, | |
| 26 help='Output file for mate 1. WARNING: Will overwrite.') | |
| 27 parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'), | |
| 28 help='Output file for mate 2. WARNING: Will overwrite.') | |
| 29 parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), | |
| 30 help='Input read format.') | |
| 31 parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), | |
| 32 help='Output read format. Default: whatever the input format is.') | |
| 33 parser.add_argument('-b', '--filt-bases', | |
| 34 help='The bases to filter on. Case-insensitive. Default: %(default)s.') | |
| 35 parser.add_argument('-t', '--thres', type=float, | |
| 36 help='The threshold. The read will be trimmed once the proportion of filter bases in the ' | |
| 37 'window exceed this fraction (not a percentage). Default: %(default)s.') | |
| 38 parser.add_argument('-w', '--window', dest='win_len', type=int, | |
| 39 help='Window size for trimming. Default: %(default)s.') | |
| 40 parser.add_argument('-i', '--invert', action='store_true', | |
| 41 help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') | |
| 42 parser.add_argument('-m', '--min-length', type=int, | |
| 43 help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' | |
| 44 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' | |
| 45 'pair must exceed this length to be kept. Set to 0 to only omit empty reads.') | |
| 46 parser.add_argument('--error', | |
| 47 help='Fail with this error message (useful for Galaxy tool).') | |
| 48 parser.add_argument('-A', '--acgt', action='store_true', | |
| 49 help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') | |
| 50 parser.add_argument('-I', '--iupac', action='store_true', | |
| 51 help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') | |
| 52 | |
| 53 args = parser.parse_args(argv[1:]) | |
| 54 | |
| 55 if args.error: | |
| 56 fail('Error: '+args.error) | |
| 57 | |
| 58 # Catch invalid argument combinations. | |
| 59 if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): | |
| 60 fail('Error: If giving two input files (paired end), must specify both output files.') | |
| 61 # Determine filetypes, open input file parsers. | |
| 62 filetype1 = get_filetype(args.infile1, args.filetype) | |
| 63 file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1)) | |
| 64 if args.infile2: | |
| 65 paired = True | |
| 66 filetype2 = get_filetype(args.infile2, args.filetype) | |
| 67 file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2)) | |
| 68 else: | |
| 69 filetype2 = None | |
| 70 file2_parser = None | |
| 71 paired = False | |
| 72 # Override output filetypes if it was specified on the command line. | |
| 73 if args.out_filetype: | |
| 74 filetype1 = args.out_filetype | |
| 75 filetype2 = args.out_filetype | |
| 76 | |
| 77 # Determine the filter bases and whether to invert the selection. | |
| 78 filt_bases = args.filt_bases | |
| 79 invert = args.invert | |
| 80 if args.acgt: | |
| 81 filt_bases = 'ACGT' | |
| 82 invert = True | |
| 83 elif args.iupac: | |
| 84 filt_bases = 'ACGTUWSMKRYBDHVN-' | |
| 85 invert = True | |
| 86 | |
| 87 # Do the actual trimming. | |
| 88 try: | |
| 89 trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2, | |
| 90 paired, args.win_len, args.thres, filt_bases, invert, args.min_length) | |
| 91 finally: | |
| 92 for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): | |
| 93 if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: | |
| 94 filehandle.close() | |
| 95 | |
| 96 | |
| 97 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, | |
| 98 win_len, thres, filt_bases, invert, min_length): | |
| 99 """Trim all the reads in the input file(s), writing to the output file(s).""" | |
| 100 read1 = None | |
| 101 read2 = None | |
| 102 while True: | |
| 103 # Read in the reads. | |
| 104 try: | |
| 105 read1 = next(file1_parser) | |
| 106 if paired: | |
| 107 read2 = next(file2_parser) | |
| 108 except StopIteration: | |
| 109 break | |
| 110 # Do trimming. | |
| 111 read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert) | |
| 112 if filetype1 == 'fastq': | |
| 113 # If the output filetype is FASTQ, trim the quality scores too. | |
| 114 # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. | |
| 115 # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. | |
| 116 qual1 = read1.qual or 'z' * len(read1.seq) | |
| 117 read1.qual = qual1[:len(read1.seq)] | |
| 118 if paired: | |
| 119 read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert) | |
| 120 if filetype2 == 'fastq': | |
| 121 qual2 = read2.qual or 'z' * len(read2.seq) | |
| 122 read2.qual = qual2[:len(read2.seq)] | |
| 123 # Output reads if they both pass the minimum length threshold (if any was given). | |
| 124 if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length): | |
| 125 write_read(outfile1, read1, filetype1) | |
| 126 write_read(outfile2, read2, filetype2) | |
| 127 else: | |
| 128 # Output read if it passes the minimum length threshold (if any was given). | |
| 129 if min_length is None or len(read1.seq) >= min_length: | |
| 130 write_read(outfile1, read1, filetype1) | |
| 131 | |
| 132 | |
| 133 def get_filetype(infile, filetype_arg): | |
| 134 if infile is sys.stdin: | |
| 135 if filetype_arg: | |
| 136 filetype = filetype_arg | |
| 137 else: | |
| 138 fail('Error: You must specify the --format if reading from stdin.') | |
| 139 elif infile: | |
| 140 if filetype_arg: | |
| 141 filetype = filetype_arg | |
| 142 else: | |
| 143 if infile.name.endswith('.fa') or infile.name.endswith('.fasta'): | |
| 144 filetype = 'fasta' | |
| 145 elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'): | |
| 146 filetype = 'fastq' | |
| 147 else: | |
| 148 fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile)) | |
| 149 else: | |
| 150 fail('Error: infile is {}'.format(infile)) | |
| 151 return filetype | |
| 152 | |
| 153 | |
| 154 def write_read(filehandle, read, filetype): | |
| 155 if filetype == 'fasta': | |
| 156 filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) | |
| 157 elif filetype == 'fastq': | |
| 158 filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) | |
| 159 | |
| 160 | |
| 161 def trim_read(seq, win_len, thres, filt_bases, invert): | |
| 162 """Trim an individual read and return its trimmed sequence. | |
| 163 This will track the frequency of bad bases in a window of length win_len, and trim once the | |
| 164 frequency goes below thres. The trim point will be just before the first (leftmost) bad base in | |
| 165 the window (the first window with a frequency below thres). The "bad" bases are the ones in | |
| 166 filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" | |
| 167 # Algorithm: | |
| 168 # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5') | |
| 169 # end, we append new bases to the right end of the window and pop them from the left end. | |
| 170 # Each base is only examined twice: when it enters the window and when it leaves it. | |
| 171 # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad | |
| 172 # bases enter the window and decrementing it when they leave. | |
| 173 # We also track the location of bad bases in the window with bad_bases_coords so we can figure out | |
| 174 # where to cut if we have to trim. | |
| 175 max_bad_bases = win_len * thres | |
| 176 window = [] | |
| 177 bad_bases_count = 0 | |
| 178 bad_bases_coords = [] | |
| 179 for coord, base in enumerate(seq.upper()): | |
| 180 # Shift window, adjust bad_bases_count and bad_bases_coords list. | |
| 181 window.append(base) | |
| 182 # Is the new base we're adding to the window a bad base? | |
| 183 if invert: | |
| 184 bad_base = base not in filt_bases | |
| 185 else: | |
| 186 bad_base = base in filt_bases | |
| 187 # If so, increment the total and add its coordinate to the window. | |
| 188 if bad_base: | |
| 189 bad_bases_count += 1 | |
| 190 bad_bases_coords.append(coord) | |
| 191 if len(window) > win_len: | |
| 192 first_base = window.pop(0) | |
| 193 # Is the base we're removing (the first base in the window) a bad base? | |
| 194 if invert: | |
| 195 bad_base = first_base not in filt_bases | |
| 196 else: | |
| 197 bad_base = first_base in filt_bases | |
| 198 # If so, decrement the total and remove its coordinate from the window. | |
| 199 if bad_base: | |
| 200 bad_bases_count -= 1 | |
| 201 bad_bases_coords.pop(0) | |
| 202 # print bad_bases_coords | |
| 203 # Are we over the threshold? | |
| 204 if bad_bases_count > max_bad_bases: | |
| 205 break | |
| 206 # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. | |
| 207 if bad_bases_count > max_bad_bases: | |
| 208 first_bad_base = bad_bases_coords[0] | |
| 209 return seq[0:first_bad_base] | |
| 210 else: | |
| 211 return seq | |
| 212 | |
| 213 | |
| 214 def fail(message): | |
| 215 sys.stderr.write(message+"\n") | |
| 216 sys.exit(1) | |
| 217 | |
| 218 | |
| 219 if __name__ == '__main__': | |
| 220 sys.exit(main(sys.argv)) |
