Mercurial > repos > greg > fasta_extract
comparison fasta_extract.py @ 0:bc3f2a5c7b53 draft
Uploaded
author | greg |
---|---|
date | Sun, 10 Jan 2016 13:03:12 -0500 |
parents | |
children | 3fb7f36c2c8a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bc3f2a5c7b53 |
---|---|
1 import argparse | |
2 import csv | |
3 import os | |
4 import sys | |
5 | |
6 from fasta_extract_utils import Fasta | |
7 | |
8 | |
9 def reverse_complement(bases): | |
10 complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} | |
11 return ''.join(complements[b.upper()] for b in reversed(bases)) | |
12 | |
13 | |
14 def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False): | |
15 attrs = 'u%dd%d' % (subtract_from_start, add_to_end) | |
16 if extend_existing: | |
17 attrs += 'x' | |
18 if consider_strand: | |
19 attrs += '_s' | |
20 if orphan: | |
21 attrs += '_orphan' | |
22 format = 'gff' | |
23 else: | |
24 format = 'fasta' | |
25 return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format)) | |
26 | |
27 | |
28 def stop_err(msg): | |
29 sys.stderr.write(msg) | |
30 sys.exit(1) | |
31 | |
32 | |
33 parser = argparse.ArgumentParser() | |
34 parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.') | |
35 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.') | |
36 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.') | |
37 parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.') | |
38 parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.') | |
39 args = parser.parse_args() | |
40 | |
41 fasta = Fasta(args.genome_file) | |
42 | |
43 for (input_filename, hid) in args.inputs: | |
44 extend_existing = args.extend_existing == 'existing' | |
45 consider_strand = args.strand == 'yes' | |
46 reader = csv.reader(open(input_filename, 'rU'), delimiter='\t') | |
47 fasta_output_path = get_output_path(args.hid, | |
48 args.subtract_from_start, | |
49 args.add_to_end, | |
50 extend_existing, | |
51 consider_strand) | |
52 output = open(fasta_output_path, 'wb') | |
53 gff_output_path = get_output_path(args.hid, | |
54 args.subtract_from_start, | |
55 args.add_to_end, | |
56 extend_existing, | |
57 consider_strand, | |
58 orphan=True) | |
59 orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t') | |
60 for row in reader: | |
61 if len(row) != 9 or row[0].startswith('#'): | |
62 continue | |
63 try: | |
64 cname = row[0] | |
65 start = int(row[3]) | |
66 end = int(row[4]) | |
67 strand = row[6] | |
68 if extend_existing: | |
69 start -= args.subtract_from_start | |
70 end += args.add_to_end | |
71 else: | |
72 midpoint = (start + end) // 2 | |
73 start = midpoint - args.subtract_from_start | |
74 end = midpoint + args.add_to_end | |
75 if 1 <= start and end <= len(fasta[cname]): | |
76 output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand)) | |
77 bases = fasta[cname][start-1:end] | |
78 if consider_strand and strand == '-': | |
79 bases = reverse_complement(bases) | |
80 output.write('%s\n' % bases) | |
81 else: | |
82 orphan_writer.writerow(row) | |
83 except Exception, e: | |
84 stop_err(str(e)) |