comparison fasta_extract.py @ 0:bc3f2a5c7b53 draft

Uploaded
author greg
date Sun, 10 Jan 2016 13:03:12 -0500
parents
children 3fb7f36c2c8a
comparison
equal deleted inserted replaced
-1:000000000000 0:bc3f2a5c7b53
1 import argparse
2 import csv
3 import os
4 import sys
5
6 from fasta_extract_utils import Fasta
7
8
9 def reverse_complement(bases):
10 complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
11 return ''.join(complements[b.upper()] for b in reversed(bases))
12
13
14 def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False):
15 attrs = 'u%dd%d' % (subtract_from_start, add_to_end)
16 if extend_existing:
17 attrs += 'x'
18 if consider_strand:
19 attrs += '_s'
20 if orphan:
21 attrs += '_orphan'
22 format = 'gff'
23 else:
24 format = 'fasta'
25 return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format))
26
27
28 def stop_err(msg):
29 sys.stderr.write(msg)
30 sys.exit(1)
31
32
33 parser = argparse.ArgumentParser()
34 parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.')
35 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
36 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
37 parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.')
38 parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.')
39 args = parser.parse_args()
40
41 fasta = Fasta(args.genome_file)
42
43 for (input_filename, hid) in args.inputs:
44 extend_existing = args.extend_existing == 'existing'
45 consider_strand = args.strand == 'yes'
46 reader = csv.reader(open(input_filename, 'rU'), delimiter='\t')
47 fasta_output_path = get_output_path(args.hid,
48 args.subtract_from_start,
49 args.add_to_end,
50 extend_existing,
51 consider_strand)
52 output = open(fasta_output_path, 'wb')
53 gff_output_path = get_output_path(args.hid,
54 args.subtract_from_start,
55 args.add_to_end,
56 extend_existing,
57 consider_strand,
58 orphan=True)
59 orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t')
60 for row in reader:
61 if len(row) != 9 or row[0].startswith('#'):
62 continue
63 try:
64 cname = row[0]
65 start = int(row[3])
66 end = int(row[4])
67 strand = row[6]
68 if extend_existing:
69 start -= args.subtract_from_start
70 end += args.add_to_end
71 else:
72 midpoint = (start + end) // 2
73 start = midpoint - args.subtract_from_start
74 end = midpoint + args.add_to_end
75 if 1 <= start and end <= len(fasta[cname]):
76 output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand))
77 bases = fasta[cname][start-1:end]
78 if consider_strand and strand == '-':
79 bases = reverse_complement(bases)
80 output.write('%s\n' % bases)
81 else:
82 orphan_writer.writerow(row)
83 except Exception, e:
84 stop_err(str(e))