0
|
1 import argparse
|
|
2 import csv
|
|
3 import os
|
|
4 import sys
|
|
5
|
|
6 from fasta_extract_utils import Fasta
|
|
7
|
|
8
|
|
9 def reverse_complement(bases):
|
|
10 complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
|
|
11 return ''.join(complements[b.upper()] for b in reversed(bases))
|
|
12
|
|
13
|
|
14 def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False):
|
|
15 attrs = 'u%dd%d' % (subtract_from_start, add_to_end)
|
|
16 if extend_existing:
|
|
17 attrs += 'x'
|
|
18 if consider_strand:
|
|
19 attrs += '_s'
|
|
20 if orphan:
|
|
21 attrs += '_orphan'
|
|
22 format = 'gff'
|
|
23 else:
|
|
24 format = 'fasta'
|
|
25 return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format))
|
|
26
|
|
27
|
|
28 def stop_err(msg):
|
|
29 sys.stderr.write(msg)
|
|
30 sys.exit(1)
|
|
31
|
|
32
|
|
33 parser = argparse.ArgumentParser()
|
|
34 parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.')
|
|
35 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
|
|
36 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
|
|
37 parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.')
|
|
38 parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.')
|
|
39 args = parser.parse_args()
|
|
40
|
|
41 fasta = Fasta(args.genome_file)
|
|
42
|
|
43 for (input_filename, hid) in args.inputs:
|
|
44 extend_existing = args.extend_existing == 'existing'
|
|
45 consider_strand = args.strand == 'yes'
|
|
46 reader = csv.reader(open(input_filename, 'rU'), delimiter='\t')
|
|
47 fasta_output_path = get_output_path(args.hid,
|
|
48 args.subtract_from_start,
|
|
49 args.add_to_end,
|
|
50 extend_existing,
|
|
51 consider_strand)
|
|
52 output = open(fasta_output_path, 'wb')
|
|
53 gff_output_path = get_output_path(args.hid,
|
|
54 args.subtract_from_start,
|
|
55 args.add_to_end,
|
|
56 extend_existing,
|
|
57 consider_strand,
|
|
58 orphan=True)
|
|
59 orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t')
|
|
60 for row in reader:
|
|
61 if len(row) != 9 or row[0].startswith('#'):
|
|
62 continue
|
|
63 try:
|
|
64 cname = row[0]
|
|
65 start = int(row[3])
|
|
66 end = int(row[4])
|
|
67 strand = row[6]
|
|
68 if extend_existing:
|
|
69 start -= args.subtract_from_start
|
|
70 end += args.add_to_end
|
|
71 else:
|
|
72 midpoint = (start + end) // 2
|
|
73 start = midpoint - args.subtract_from_start
|
|
74 end = midpoint + args.add_to_end
|
|
75 if 1 <= start and end <= len(fasta[cname]):
|
|
76 output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand))
|
|
77 bases = fasta[cname][start-1:end]
|
|
78 if consider_strand and strand == '-':
|
|
79 bases = reverse_complement(bases)
|
|
80 output.write('%s\n' % bases)
|
|
81 else:
|
|
82 orphan_writer.writerow(row)
|
|
83 except Exception, e:
|
|
84 stop_err(str(e))
|