annotate fasta_extract.py @ 14:a234ed464674 draft default tip

Uploaded
author greg
date Sun, 10 Jan 2016 15:01:05 -0500
parents a5d7ed2680c3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
1 import argparse
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
2 import csv
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
3 import os
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
4 import sys
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
5
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
6 from fasta_extract_utils import Fasta
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
7
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
8
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
9 def reverse_complement(bases):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
10 complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
11 return ''.join(complements[b.upper()] for b in reversed(bases))
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
12
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
13
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
14 def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
15 attrs = 'u%dd%d' % (subtract_from_start, add_to_end)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
16 if extend_existing:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
17 attrs += 'x'
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
18 if consider_strand:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
19 attrs += '_s'
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
20 if orphan:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
21 attrs += '_orphan'
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
22 format = 'gff'
7
4f6eba6beef5 Uploaded
greg
parents: 5
diff changeset
23 output_dir = 'output_orphan_dir'
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
24 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
25 format = 'fasta'
7
4f6eba6beef5 Uploaded
greg
parents: 5
diff changeset
26 output_dir = 'output_dir'
12
da6ab598f025 Uploaded
greg
parents: 11
diff changeset
27 return os.path.join(output_dir, '%s_on_data_%s.%s' % (attrs, hid, format))
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
28
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
29
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
30 def stop_err(msg):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
31 sys.stderr.write(msg)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
32 sys.exit(1)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
33
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
34
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
35 parser = argparse.ArgumentParser()
1
3fb7f36c2c8a Uploaded
greg
parents: 0
diff changeset
36 parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets")
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
37 parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
38 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
39 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
40 parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
41 parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
42 args = parser.parse_args()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
43
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
44 fasta = Fasta(args.genome_file)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
45
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
46 for (input_filename, hid) in args.inputs:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
47 extend_existing = args.extend_existing == 'existing'
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
48 consider_strand = args.strand == 'yes'
12
da6ab598f025 Uploaded
greg
parents: 11
diff changeset
49
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
50 reader = csv.reader(open(input_filename, 'rU'), delimiter='\t')
3
79167132946d Uploaded
greg
parents: 1
diff changeset
51 fasta_output_path = get_output_path(hid,
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
52 args.subtract_from_start,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
53 args.add_to_end,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
54 extend_existing,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
55 consider_strand)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
56 output = open(fasta_output_path, 'wb')
3
79167132946d Uploaded
greg
parents: 1
diff changeset
57 gff_output_path = get_output_path(hid,
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
58 args.subtract_from_start,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
59 args.add_to_end,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
60 extend_existing,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
61 consider_strand,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
62 orphan=True)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
63 orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t')
12
da6ab598f025 Uploaded
greg
parents: 11
diff changeset
64
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
65 for row in reader:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
66 if len(row) != 9 or row[0].startswith('#'):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
67 continue
14
a234ed464674 Uploaded
greg
parents: 13
diff changeset
68 cname = row[0]
a234ed464674 Uploaded
greg
parents: 13
diff changeset
69 start = int(row[3])
a234ed464674 Uploaded
greg
parents: 13
diff changeset
70 end = int(row[4])
a234ed464674 Uploaded
greg
parents: 13
diff changeset
71 strand = row[6]
a234ed464674 Uploaded
greg
parents: 13
diff changeset
72 if extend_existing:
a234ed464674 Uploaded
greg
parents: 13
diff changeset
73 start -= args.subtract_from_start
a234ed464674 Uploaded
greg
parents: 13
diff changeset
74 end += args.add_to_end
a234ed464674 Uploaded
greg
parents: 13
diff changeset
75 else:
a234ed464674 Uploaded
greg
parents: 13
diff changeset
76 midpoint = (start + end) // 2
a234ed464674 Uploaded
greg
parents: 13
diff changeset
77 start = midpoint - args.subtract_from_start
a234ed464674 Uploaded
greg
parents: 13
diff changeset
78 end = midpoint + args.add_to_end
a234ed464674 Uploaded
greg
parents: 13
diff changeset
79 if 1 <= start and end <= len(fasta[cname]):
a234ed464674 Uploaded
greg
parents: 13
diff changeset
80 output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand))
a234ed464674 Uploaded
greg
parents: 13
diff changeset
81 bases = fasta[cname][start-1:end]
a234ed464674 Uploaded
greg
parents: 13
diff changeset
82 if consider_strand and strand == '-':
a234ed464674 Uploaded
greg
parents: 13
diff changeset
83 bases = reverse_complement(bases)
a234ed464674 Uploaded
greg
parents: 13
diff changeset
84 output.write('%s\n' % bases)
a234ed464674 Uploaded
greg
parents: 13
diff changeset
85 else:
a234ed464674 Uploaded
greg
parents: 13
diff changeset
86 orphan_writer.writerow(row)
13
a5d7ed2680c3 Uploaded
greg
parents: 12
diff changeset
87 output.close()