Mercurial > repos > mvdbeek > generate_sliding_windows
comparison generate_sliding_windows.py @ 0:559cf4ca1f2d draft
Uploaded
| author | mvdbeek |
|---|---|
| date | Wed, 15 Apr 2015 06:34:23 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:559cf4ca1f2d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from Bio import SeqIO | |
| 3 import argparse | |
| 4 import sys | |
| 5 | |
def generate_windows(seq, window, step):
    '''
    Generates windows of a sequence, with the distance of windows
    defined by *step*.

    seq -- string to split into windows.
    window -- integer specifying the size of the generated fragments.
    step -- integer specifying the distance between adjacent fragments.

    Yields (fragment, start, stop) tuples, where start and stop are the
    1-based, inclusive coordinates of the fragment within *seq*. Only
    full-length windows are produced; a trailing remainder shorter than
    *window* is skipped.
    '''
    # Iterate over the exclusive 0-based end offset of every window. The upper
    # bound is len(seq) + 1 so that a window ending exactly on the last
    # character of the sequence is still generated.
    for stop in range(window, len(seq) + 1, step):
        start = stop - window
        # start + 1 turns the 0-based slice start into a 1-based coordinate;
        # the exclusive slice end already equals the 1-based inclusive stop.
        yield (seq[start:stop], start + 1, stop)
| 23 | |
| 24 | |
def write_fragment(description, output_handle, fragment, start, stop):
    '''Emit one fragment as a fasta record; the header joins description and start/stop coordinates'''
    # Header line first, then the fragment sequence on its own line.
    record_template = ">{0}_start:{1}_stop:{2}\n{3}\n"
    fields = (description, start, stop, fragment)
    output_handle.write(record_template.format(*fields))
| 29 | |
| 30 | |
def handle_io(input, output, window=21, step=21):
    '''
    Read fasta records from *input* and write their windows to *output*.

    Keyword arguments:
    input -- file handle for fasta file containing sequences for which you wish to generate fragments.
    output -- file handle for the multi-fasta that will contain the generated fragments.
    window -- integer specifying the size of the fragments.
    step -- integer specifying the distance between adjacent fragments.

    Both handles are closed before returning, even if parsing or writing
    raises.
    '''
    try:
        for entry in SeqIO.parse(input, "fasta"):
            seq = str(entry.seq)
            description = str(entry.description)
            # Plain loop instead of a list comprehension: the writes are side
            # effects, so there is no reason to collect a list of None values.
            for fragment, start, stop in generate_windows(seq, window, step):
                write_fragment(description, output, fragment, start, stop)
    finally:
        # Close the input even when closing the output fails.
        try:
            output.close()
        finally:
            input.close()
| 47 | |
def positive_int(val):
    '''
    argparse type converter that accepts only integers greater than zero.

    val -- the raw value to convert (argparse passes the command line string).

    Returns *val* converted to int.
    Raises argparse.ArgumentTypeError if *val* cannot be converted to an int
    or if the resulting int is not greater than zero.
    '''
    try:
        number = int(val)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError("'%s' is not a valid positive int" % val)
    # Explicit check rather than assert, which is stripped when running with -O.
    if number <= 0:
        raise argparse.ArgumentTypeError("'%s' is not a valid positive int" % val)
    return number
| 54 | |
if __name__ == "__main__":

    # Command line interface: one required input fasta, an optional output
    # file (stdout when omitted) and two positive-integer window settings.
    cli = argparse.ArgumentParser(
        description='Generate fixed size windows in fasta format from multi-fasta sequence.')
    cli.add_argument(
        '--input',
        type=argparse.FileType('r'),
        required=True,
        help='supply an input multi-fasta file.')
    cli.add_argument(
        '--output',
        type=argparse.FileType('w'),
        default=sys.stdout,
        help='supply an output multi-fasta file. If not specified use stdout.')
    # --window and --step share the same type and default; only the flag and
    # the help text differ.
    for flag, help_text in (('--window', 'Set the size of the generated windows'),
                            ('--step', 'Set distance between the windows')):
        cli.add_argument(flag, type=positive_int, default=21, help=help_text)
    options = cli.parse_args()

    handle_io(options.input, options.output, options.window, options.step)
