annotate lca.py @ 0:e1dea768b4c1 draft default tip

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:30:52 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
2 #Guruprasad Ananda
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
3 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
4 Least Common Ancestor tool.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
5 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
6 import sys, string, re, commands, tempfile, random
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
7
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
8 def stop_err(msg):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
9 sys.stderr.write(msg)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
10 sys.exit()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
11
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
12 def main():
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
13 try:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
14 inputfile = sys.argv[1]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
15 outfile = sys.argv[2]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
16 rank_bound = int( sys.argv[3] )
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
17 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
18 Mapping of ranks:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
19 root :2,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
20 superkingdom:3,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
21 kingdom :4,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
22 subkingdom :5,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
23 superphylum :6,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
24 phylum :7,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
25 subphylum :8,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
26 superclass :9,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
27 class :10,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
28 subclass :11,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
29 superorder :12,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
30 order :13,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
31 suborder :14,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
32 superfamily :15,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
33 family :16,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
34 subfamily :17,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
35 tribe :18,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
36 subtribe :19,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
37 genus :20,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
38 subgenus :21,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
39 species :22,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
40 subspecies :23,
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
41 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
42 except:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
43 stop_err("Syntax error: Use correct syntax: program infile outfile")
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
44
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
45 fin = open(sys.argv[1],'r')
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
46 for j, line in enumerate( fin ):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
47 elems = line.strip().split('\t')
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
48 if len(elems) < 24:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
49 stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
50 if j > 30:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
51 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
52 cols = range(1,len(elems))
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
53 fin.close()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
54
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
55 group_col = 0
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
56 tmpfile = tempfile.NamedTemporaryFile()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
57
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
58 try:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
59 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
60 The -k option for the Posix sort command is as follows:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
61 -k, --key=POS1[,POS2]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
62 start a key at POS1, end it at POS2 (origin 1)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
63 In other words, column positions start at 1 rather than 0, so
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
64 we need to add 1 to group_col.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
65 if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
66 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
67 command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
68 except Exception, exc:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
69 stop_err( 'Initialization error -> %s' %str(exc) )
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
70
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
71 error_code, stdout = commands.getstatusoutput(command_line)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
72
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
73 if error_code != 0:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
74 stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
75
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
76 prev_item = ""
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
77 prev_vals = []
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
78 remaining_vals = []
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
79 skipped_lines = 0
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
80 fout = open(outfile, "w")
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
81 block_valid = False
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
82
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
83
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
84 for ii, line in enumerate( file( tmpfile.name )):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
85 if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
86 line = line.rstrip( '\r\n' )
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
87 try:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
88 fields = line.split("\t")
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
89 item = fields[group_col]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
90 if prev_item != "":
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
91 # At this level, we're grouping on values (item and prev_item) in group_col
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
92 if item == prev_item:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
93 # Keep iterating and storing values until a new value is encountered.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
94 if block_valid:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
95 for i, col in enumerate(cols):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
96 if col >= 3:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
97 prev_vals[i].append(fields[col].strip())
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
98 if len(set(prev_vals[i])) > 1:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
99 block_valid = False
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
100 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
101
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
102 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
103 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
104 When a new value is encountered, write the previous value and the
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
105 corresponding aggregate values into the output file. This works
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
106 due to the sort on group_col we've applied to the data above.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
107 """
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
108 out_list = ['']*24
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
109 out_list[0] = str(prev_item)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
110 out_list[1] = str(prev_vals[0][0])
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
111 out_list[2] = str(prev_vals[1][0])
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
112
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
113 for k, col in enumerate(cols):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
114 if col >= 3 and col < 24:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
115 if len(set(prev_vals[k])) == 1:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
116 out_list[col] = prev_vals[k][0]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
117 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
118 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
119 while k < 23:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
120 out_list[k+1] = 'n'
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
121 k += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
122
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
123 j = 0
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
124 while True:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
125 try:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
126 out_list.append(str(prev_vals[23+j][0]))
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
127 j += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
128 except:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
129 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
130
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
131 if rank_bound == 0:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
132 print >>fout, '\t'.join(out_list).strip()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
133 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
134 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
135 print >>fout, '\t'.join(out_list).strip()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
136
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
137 block_valid = True
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
138 prev_item = item
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
139 prev_vals = []
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
140 for col in cols:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
141 val_list = []
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
142 val_list.append(fields[col].strip())
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
143 prev_vals.append(val_list)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
144
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
145 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
146 # This only occurs once, right at the start of the iteration.
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
147 block_valid = True
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
148 prev_item = item #groupby item
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
149 for col in cols: #everyting else
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
150 val_list = []
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
151 val_list.append(fields[col].strip())
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
152 prev_vals.append(val_list)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
153
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
154 except:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
155 skipped_lines += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
156 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
157 skipped_lines += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
158
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
159 # Handle the last grouped value
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
160 out_list = ['']*24
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
161 out_list[0] = str(prev_item)
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
162 out_list[1] = str(prev_vals[0][0])
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
163 out_list[2] = str(prev_vals[1][0])
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
164
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
165 for k, col in enumerate(cols):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
166 if col >= 3 and col < 24:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
167 if len(set(prev_vals[k])) == 1:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
168 out_list[col] = prev_vals[k][0]
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
169 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
170 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
171 while k < 23:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
172 out_list[k+1] = 'n'
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
173 k += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
174
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
175 j = 0
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
176 while True:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
177 try:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
178 out_list.append(str(prev_vals[23+j][0]))
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
179 j += 1
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
180 except:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
181 break
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
182
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
183 if rank_bound == 0:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
184 print >>fout, '\t'.join(out_list).strip()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
185 else:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
186 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
187 print >>fout, '\t'.join(out_list).strip()
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
188
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
189 if skipped_lines > 0:
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
190 print "Skipped %d invalid lines." % ( skipped_lines )
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
191
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
192 if __name__ == "__main__":
e1dea768b4c1 Imported from capsule None
devteam
parents:
diff changeset
193 main()