annotate getalleleseq.py @ 4:8f63bfc151e9 draft

Updated script comments
author boris
date Tue, 25 Jun 2013 00:46:02 -0400
parents
children 8d0a0c488a8e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
1 #!/usr/bin/env python
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
2 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
3 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
4 #usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
5 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
6 #Given a table with minor and major alleles per position, it generates the
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
7 #minor and major allele sequences in FASTA format
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
8 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
9 #positional arguments:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
10 # alleles Table containing minor and major allele base per
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
11 # position. cols: [id, chr, pos, A, C, G, T, cvrg,
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
12 # plody, major, minor, freq_minor]
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
13 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
14 #optional arguments:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
15 # -h, --help show this help message and exit
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
16 # -l INT, --seq-length INT
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
17 # Background sequence length. Bases in an artifical
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
18 # all-N-sequence of length INT will be replaced by
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
19 # either the major or minor allele base accordingly
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
20 # -j FILE, --major-seq FILE
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
21 # File to write major allele sequences in FASTA multiple
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
22 # alignment format.
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
23 # -d DIR, --minor-dir DIR
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
24 # Per sample minor allele sequences will be written to
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
25 # this directory
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
26 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
27 # The expected columns in the alleles table follow Nicholas Stoler's
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
28 # Count alleles tool format. See Count alleles in Galaxy's tool shed
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
29 # http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
30 #
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
31 # Expected columns:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
32 # 1. sample_id
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
33 # 2. chr
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
34 # 3. position
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
35 # 4 counts for A's
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
36 # 5. counts for C's
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
37 # 6. counts for G's
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
38 # 7. counts for T's
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
39 # 8. Coverage
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
40 # 9. Number of alleles passing a given criteria
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
41 # 10. Major allele
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
42 # 11. Minor allele
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
43 # 12. Minor allele frequency in position
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
44
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
45 import sys
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
46 import os
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
47 import argparse
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
48
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
49 def createseq(sample, allele, seq_size, table):
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
50 """Generate major or minor allele sequence"""
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
51 out_sequence = ['N' for i in range(seq_size)]
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
52 sample_data = [line for line in table if line[0] == sample]
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
53
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
54 for entry in sample_data:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
55 position = int(entry[2])
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
56 number_of_alleles = int(entry[8])
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
57 major_allele = entry[9].strip()
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
58 minor_allele = entry[10].strip()
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
59
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
60 if allele == 'major':
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
61 out_sequence[position-1] = major_allele
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
62 elif allele == 'minor':
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
63 if number_of_alleles == 2:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
64 out_sequence[position-1] = minor_allele
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
65 else:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
66 out_sequence[position-1] = major_allele
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
67 return out_sequence
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
68
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
69 def printseq(sample,allele,seq,output):
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
70 """Print out sequence"""
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
71 print >> output, '>{0}_{1}'.format(sample,allele)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
72 for i in range(0,len(seq),70):
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
73 print >> output, ''.join(seq[i:i+70])
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
74
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
75 def main():
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
76 parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
77 parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
78 parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
79 parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
80 parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
81 parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
82 args = parser.parse_args()
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
83
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
84
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
85 try:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
86 table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
87 samples = sorted(list(set([ line[0] for line in table ])))
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
88 except:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
89 sys.exit('\nERROR: Could not open %s\n' % args.alleles)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
90 try:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
91 major_out = open(args.major_seq, 'w+')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
92 except:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
93 sys.exit('\nCould not create %s\n' % args.major_seq)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
94
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
95 # Single file for all major allele sequences in FASTA multiple alignment
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
96 for sample in samples:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
97 sequence = createseq(sample,'major',args.seq_length,table)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
98 printseq(sample,'major',sequence,major_out)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
99 major_out.close()
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
100
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
101 # Sample specific minor allele sequence in FASTA format
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
102 try:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
103 os.makedirs(args.minor_dir)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
104 except:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
105 pass
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
106
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
107 for sample in samples:
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
108 if args.minor_prefix: # to fit Galaxy requirements
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
109 name = sample.replace('_','')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
110 minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
111 else: # for non-Galaxy
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
112 minor_name = sample+'-minor.fa'
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
113 minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
114 sequence = createseq(sample,'minor',args.seq_length,table)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
115 printseq(sample,'minor',sequence,minor_out)
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
116 minor_out.close()
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
117
8f63bfc151e9 Updated script comments
boris
parents:
diff changeset
118 if __name__ == "__main__": main()