Mercurial > repos > rico > no_tests_test
comparison pca.py @ 0:580da578c5e6 default tip
Uploaded
| author | rico |
|---|---|
| date | Thu, 05 Apr 2012 15:56:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:580da578c5e6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import errno | |
| 4 import os | |
| 5 import shutil | |
| 6 import subprocess | |
| 7 import sys | |
| 8 from BeautifulSoup import BeautifulSoup | |
| 9 import gd_composite | |
| 10 | |
| 11 ################################################################################ | |
| 12 | |
| 13 def mkdir_p(path): | |
| 14 try: | |
| 15 os.makedirs(path) | |
| 16 except OSError, e: | |
| 17 if e.errno <> errno.EEXIST: | |
| 18 raise | |
| 19 | |
| 20 ################################################################################ | |
| 21 | |
| 22 def run_program(prog, args, stdout_file=None): | |
| 23 #print "args: ", ' '.join(args) | |
| 24 p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
| 25 (stdoutdata, stderrdata) = p.communicate() | |
| 26 rc = p.returncode | |
| 27 | |
| 28 if stdout_file is not None: | |
| 29 with open(stdout_file, 'w') as ofh: | |
| 30 print >> ofh, stdoutdata | |
| 31 | |
| 32 if rc != 0: | |
| 33 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 34 print >> sys.stderr, stderrdata | |
| 35 sys.exit(1) | |
| 36 | |
| 37 ################################################################################ | |
| 38 | |
| 39 def do_ped2geno(input, output): | |
| 40 lines = [] | |
| 41 with open(input) as fh: | |
| 42 for line in fh: | |
| 43 line = line.rstrip('\r\n') | |
| 44 lines.append(line.split()) | |
| 45 | |
| 46 pair_map = { | |
| 47 '0':{ '0':'9', '1':'9', '2':'9' }, | |
| 48 '1':{ '0':'1', '1':'2', '2':'1' }, | |
| 49 '2':{ '0':'1', '1':'1', '2':'0' } | |
| 50 } | |
| 51 with open(output, 'w') as ofh: | |
| 52 for a_idx in xrange(6, len(lines[0]), 2): | |
| 53 b_idx = a_idx + 1 | |
| 54 print >> ofh, ''.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines)) | |
| 55 | |
| 56 def do_map2snp(input, output): | |
| 57 with open(output, 'w') as ofh: | |
| 58 with open(input) as fh: | |
| 59 for line in fh: | |
| 60 elems = line.split() | |
| 61 print >> ofh, ' {0} 11 0.002 2000 A T'.format(elems[1]) | |
| 62 | |
| 63 def make_ind_file(ind_file, input): | |
| 64 pops = [] | |
| 65 | |
| 66 ofh = open(ind_file, 'w') | |
| 67 | |
| 68 with open(input) as fh: | |
| 69 soup = BeautifulSoup(fh) | |
| 70 misc = soup.find('div', {'id': 'gd_misc'}) | |
| 71 populations = misc('ul')[0] | |
| 72 | |
| 73 i = 0 | |
| 74 for entry in populations: | |
| 75 if i % 2 == 1: | |
| 76 population_name = entry.contents[0].encode('utf8').strip().replace(' ', '_') | |
| 77 pops.append(population_name) | |
| 78 individuals = entry.ol('li') | |
| 79 for individual in individuals: | |
| 80 individual_name = individual.string.encode('utf8').strip() | |
| 81 print >> ofh, individual_name, 'M', population_name | |
| 82 i += 1 | |
| 83 | |
| 84 ofh.close() | |
| 85 return pops | |
| 86 | |
| 87 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file): | |
| 88 with open(par_file, 'w') as fh: | |
| 89 print >> fh, 'genotypename: {0}'.format(geno_file) | |
| 90 print >> fh, 'snpname: {0}'.format(snp_file) | |
| 91 print >> fh, 'indivname: {0}'.format(ind_file) | |
| 92 print >> fh, 'evecoutname: {0}'.format(evec_file) | |
| 93 print >> fh, 'evaloutname: {0}'.format(eval_file) | |
| 94 print >> fh, 'altnormstyle: NO' | |
| 95 print >> fh, 'numoutevec: 2' | |
| 96 | |
| 97 def do_smartpca(par_file): | |
| 98 prog = 'smartpca' | |
| 99 | |
| 100 args = [ prog ] | |
| 101 args.append('-p') | |
| 102 args.append(par_file) | |
| 103 | |
| 104 #print "args: ", ' '.join(args) | |
| 105 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr) | |
| 106 (stdoutdata, stderrdata) = p.communicate() | |
| 107 rc = p.returncode | |
| 108 | |
| 109 if rc != 0: | |
| 110 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 111 print >> sys.stderr, stderrdata | |
| 112 sys.exit(1) | |
| 113 | |
| 114 stats = [] | |
| 115 | |
| 116 save_line = False | |
| 117 for line in stdoutdata.split('\n'): | |
| 118 if line.startswith(('## Average divergence', '## Anova statistics', '## Statistical significance')): | |
| 119 stats.append('') | |
| 120 save_line = True | |
| 121 if line.strip() == '': | |
| 122 save_line = False | |
| 123 if save_line: | |
| 124 stats.append(line) | |
| 125 | |
| 126 return '\n'.join(stats[1:]) | |
| 127 | |
| 128 def do_ploteig(evec_file, population_names): | |
| 129 prog = 'gd_ploteig' | |
| 130 | |
| 131 args = [ prog ] | |
| 132 args.append('-i') | |
| 133 args.append(evec_file) | |
| 134 args.append('-c') | |
| 135 args.append('1:2') | |
| 136 args.append('-p') | |
| 137 args.append(':'.join(population_names)) | |
| 138 args.append('-x') | |
| 139 | |
| 140 run_program(None, args) | |
| 141 | |
| 142 def do_eval2pct(eval_file, explained_file): | |
| 143 prog = 'eval2pct' | |
| 144 | |
| 145 args = [ prog ] | |
| 146 args.append(eval_file) | |
| 147 | |
| 148 with open(explained_file, 'w') as ofh: | |
| 149 #print "args:", ' '.join(args) | |
| 150 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
| 151 (stdoutdata, stderrdata) = p.communicate() | |
| 152 rc = p.returncode | |
| 153 | |
| 154 if rc != 0: | |
| 155 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 156 print >> sys.stderr, stderrdata | |
| 157 sys.exit(1) | |
| 158 | |
| 159 def do_coords2admix(coords_file): | |
| 160 prog = 'coords2admix' | |
| 161 | |
| 162 args = [ prog ] | |
| 163 args.append(coords_file) | |
| 164 | |
| 165 with open('fake', 'w') as ofh: | |
| 166 #print "args:", ' '.join(args) | |
| 167 p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE) | |
| 168 (stdoutdata, stderrdata) = p.communicate() | |
| 169 rc = p.returncode | |
| 170 | |
| 171 if rc != 0: | |
| 172 print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args)) | |
| 173 print >> sys.stderr, stderrdata | |
| 174 sys.exit(1) | |
| 175 | |
| 176 shutil.copy2('fake', coords_file) | |
| 177 | |
| 178 ################################################################################ | |
| 179 | |
| 180 if len(sys.argv) != 5: | |
| 181 print "usage" | |
| 182 sys.exit(1) | |
| 183 | |
| 184 input, input_files_path, output, output_files_path = sys.argv[1:5] | |
| 185 | |
| 186 mkdir_p(output_files_path) | |
| 187 | |
| 188 ped_file = os.path.join(input_files_path, 'admix.ped') | |
| 189 geno_file = os.path.join(output_files_path, 'admix.geno') | |
| 190 do_ped2geno(ped_file, geno_file) | |
| 191 | |
| 192 map_file = os.path.join(input_files_path, 'admix.map') | |
| 193 snp_file = os.path.join(output_files_path, 'admix.snp') | |
| 194 do_map2snp(map_file, snp_file) | |
| 195 | |
| 196 ind_file = os.path.join(output_files_path, 'admix.ind') | |
| 197 population_names = make_ind_file(ind_file, input) | |
| 198 | |
| 199 par_file = os.path.join(output_files_path, 'par.admix') | |
| 200 evec_file = os.path.join(output_files_path, 'coordinates.txt') | |
| 201 eval_file = os.path.join(output_files_path, 'admix.eval') | |
| 202 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file) | |
| 203 | |
| 204 smartpca_stats = do_smartpca(par_file) | |
| 205 | |
| 206 do_ploteig(evec_file, population_names) | |
| 207 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names)) | |
| 208 output_plot_file = os.path.join(output_files_path, 'PCA.pdf') | |
| 209 shutil.copy2(plot_file, output_plot_file) | |
| 210 os.unlink(plot_file) | |
| 211 | |
| 212 do_eval2pct(eval_file, os.path.join(output_files_path, 'explained.txt')) | |
| 213 os.unlink(eval_file) | |
| 214 | |
| 215 do_coords2admix(evec_file) | |
| 216 | |
| 217 ################################################################################ | |
| 218 | |
| 219 info_page = gd_composite.InfoPage() | |
| 220 info_page.set_title('PCA Galaxy Composite Dataset') | |
| 221 | |
| 222 display_file = gd_composite.DisplayFile() | |
| 223 display_value = gd_composite.DisplayValue() | |
| 224 | |
| 225 out_pdf = gd_composite.Parameter(name='PCA.pdf', value='PCA.pdf', display_type=display_file) | |
| 226 out_evec = gd_composite.Parameter(name='coordinates.txt', value='coordinates.txt', display_type=display_file) | |
| 227 out_explained = gd_composite.Parameter(name='explained.txt', value='explained.txt', display_type=display_file) | |
| 228 | |
| 229 evec_prefix = 'coordinates.txt.1:2.{0}'.format(':'.join(population_names)) | |
| 230 ps_file = '{0}.ps'.format(evec_prefix) | |
| 231 xtxt_file = '{0}.xtxt'.format(evec_prefix) | |
| 232 | |
| 233 os.unlink(os.path.join(output_files_path, ps_file)) | |
| 234 os.unlink(os.path.join(output_files_path, xtxt_file)) | |
| 235 | |
| 236 info_page.add_output_parameter(out_pdf) | |
| 237 info_page.add_output_parameter(out_evec) | |
| 238 info_page.add_output_parameter(out_explained) | |
| 239 | |
| 240 in_admix = gd_composite.Parameter(name='par.admix', value='par.admix', display_type=display_file) | |
| 241 in_geno = gd_composite.Parameter(name='admix.geno', value='admix.geno', display_type=display_file) | |
| 242 in_snp = gd_composite.Parameter(name='admix.snp', value='admix.snp', display_type=display_file) | |
| 243 in_ind = gd_composite.Parameter(name='admix.ind', value='admix.ind', display_type=display_file) | |
| 244 | |
| 245 info_page.add_input_parameter(in_admix) | |
| 246 info_page.add_input_parameter(in_geno) | |
| 247 info_page.add_input_parameter(in_snp) | |
| 248 info_page.add_input_parameter(in_ind) | |
| 249 | |
| 250 misc_stats = gd_composite.Parameter(description='Stats<p/><pre>\n{0}\n</pre>'.format(smartpca_stats), display_type=display_value) | |
| 251 | |
| 252 info_page.add_misc(misc_stats) | |
| 253 | |
| 254 with open (output, 'w') as ofh: | |
| 255 print >> ofh, info_page.render() | |
| 256 | |
| 257 sys.exit(0) | |
| 258 |
