0
|
1 #!/usr/bin/env python
|
|
2 #guruprasad Ananda
|
|
3 """
|
|
4 Estimates substitution rates from pairwise alignments using JC69 model.
|
|
5 """
|
|
6
|
|
7 from galaxy import eggs
|
|
8 from galaxy.tools.util.galaxyops import *
|
|
9 from galaxy.tools.util import maf_utilities
|
|
10 import bx.align.maf
|
|
11 import fileinput
|
|
12 import sys
|
|
13
|
|
14 def stop_err(msg):
|
|
15 sys.stderr.write(msg)
|
|
16 sys.exit()
|
|
17
|
|
18
|
|
19 if len(sys.argv) < 3:
|
|
20 stop_err("Incorrect number of arguments.")
|
|
21
|
|
22 inp_file = sys.argv[1]
|
|
23 out_file = sys.argv[2]
|
|
24 fout = open(out_file, 'w')
|
|
25 int_file = sys.argv[3]
|
|
26 if int_file != "None": #The user has specified an interval file
|
|
27 dbkey_i = sys.argv[4]
|
|
28 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
|
|
29
|
|
30
|
|
31 def rateEstimator(block):
|
|
32 global alignlen, mismatches
|
|
33
|
|
34 src1 = block.components[0].src
|
|
35 sequence1 = block.components[0].text
|
|
36 start1 = block.components[0].start
|
|
37 end1 = block.components[0].end
|
|
38 len1 = int(end1)-int(start1)
|
|
39 len1_withgap = len(sequence1)
|
|
40 mismatch = 0.0
|
|
41
|
|
42 for seq in range (1, len(block.components)):
|
|
43 src2 = block.components[seq].src
|
|
44 sequence2 = block.components[seq].text
|
|
45 start2 = block.components[seq].start
|
|
46 end2 = block.components[seq].end
|
|
47 len2 = int(end2)-int(start2)
|
|
48 for nt in range(len1_withgap):
|
|
49 if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': # Not a gap or masked character
|
|
50 if sequence1[nt].upper() != sequence2[nt].upper():
|
|
51 mismatch += 1
|
|
52
|
|
53 if int_file == "None":
|
|
54 p = mismatch/min(len1, len2)
|
|
55 print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p )
|
|
56 else:
|
|
57 mismatches += mismatch
|
|
58 alignlen += min(len1, len2)
|
|
59
|
|
60
|
|
61 def main():
|
|
62 skipped = 0
|
|
63 not_pairwise = 0
|
|
64
|
|
65 if int_file == "None":
|
|
66 try:
|
|
67 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
|
|
68 except:
|
|
69 stop_err("Your MAF file appears to be malformed.")
|
|
70 print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
|
|
71 for block in maf_reader:
|
|
72 if len(block.components) != 2:
|
|
73 not_pairwise += 1
|
|
74 continue
|
|
75 try:
|
|
76 rateEstimator(block)
|
|
77 except:
|
|
78 skipped += 1
|
|
79 else:
|
|
80 index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
|
|
81 if index is None:
|
|
82 print >> sys.stderr, "Your MAF file appears to be malformed."
|
|
83 sys.exit()
|
|
84 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
|
|
85 chrom_col=chr_col_i,
|
|
86 start_col=start_col_i,
|
|
87 end_col=end_col_i,
|
|
88 strand_col=strand_col_i,
|
|
89 fix_strand=True)
|
|
90 species = None
|
|
91 mincols = 0
|
|
92 global alignlen, mismatches
|
|
93
|
|
94 for interval in win:
|
|
95 alignlen = 0
|
|
96 mismatches = 0.0
|
|
97 src = "%s.%s" % ( dbkey_i, interval.chrom )
|
|
98 for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
|
|
99 if len(block.components) != 2:
|
|
100 not_pairwise += 1
|
|
101 continue
|
|
102 try:
|
|
103 rateEstimator(block)
|
|
104 except:
|
|
105 skipped += 1
|
|
106 if alignlen:
|
|
107 p = mismatches/alignlen
|
|
108 else:
|
|
109 p = 'NA'
|
|
110 interval.fields.append(str(alignlen))
|
|
111 interval.fields.append(str(mismatches))
|
|
112 interval.fields.append(str(p))
|
|
113 print >> fout, "\t".join(interval.fields)
|
|
114 #num_blocks += 1
|
|
115
|
|
116 if not_pairwise:
|
|
117 print "Skipped %d non-pairwise blocks" % (not_pairwise)
|
|
118 if skipped:
|
|
119 print "Skipped %d blocks as invalid" % (skipped)
|
|
120
|
|
121
|
|
122 if __name__ == "__main__":
|
|
123 main()
|