Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison design_matrix_creator.py @ 31:9e9b98a1cb12 draft
Uploaded
| author | yhoogstrate |
|---|---|
| date | Tue, 20 May 2014 05:28:50 -0400 |
| parents | |
| children | 7fc17dc83777 |
comparison
equal
deleted
inserted
replaced
| 30:171f3cf6233d | 31:9e9b98a1cb12 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import argparse, os, shutil, sys, tempfile, subprocess | |
| 3 | |
| 4 | |
| 5 class sampleContainer: | |
| 6 def __init__(self): | |
| 7 self.samples = [] | |
| 8 self.treatments = {} | |
| 9 self.treatment_types = {} | |
| 10 | |
| 11 def do_decode(self,encoded_str): | |
| 12 return encoded_str.decode("base64").strip().replace("\t",'') | |
| 13 | |
| 14 def add_samples(self,argument): | |
| 15 print " - Adding samples" | |
| 16 for sample in argument: | |
| 17 self.add_sample(self.do_decode(sample)) | |
| 18 | |
| 19 def add_sample(self,sample): | |
| 20 if(sample in self.samples): | |
| 21 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") | |
| 22 sys.exit(1) | |
| 23 else: | |
| 24 self.samples.append(sample) | |
| 25 print " - Added: "+sample | |
| 26 | |
| 27 def add_blocking(self,argument): | |
| 28 print " - Adding paired samples" | |
| 29 pair = [] | |
| 30 for block in argument: | |
| 31 self.add_block(block) | |
| 32 | |
| 33 def add_block(self,blocks): | |
| 34 blocks = blocks.split(":") | |
| 35 as_treatment = blocks[0] | |
| 36 blocks = blocks[1:] | |
| 37 | |
| 38 used_samples = [] | |
| 39 indexed_samples = {} | |
| 40 | |
| 41 for i in range(len(blocks)): | |
| 42 block = blocks[i] | |
| 43 samples = self.get_samples_from_block(block) | |
| 44 indexed_samples[i+1] = [] | |
| 45 for sample in samples: | |
| 46 if(sample in used_samples): | |
| 47 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") | |
| 48 sys.exit(0) | |
| 49 else: | |
| 50 indexed_samples[i+1] = block | |
| 51 used_samples.append(sample) | |
| 52 | |
| 53 for sample in self.samples: | |
| 54 if(sample not in used_samples): | |
| 55 i = i + 1 | |
| 56 indexed_samples[i+1] = str(sample).encode('base64').strip() | |
| 57 | |
| 58 for index in indexed_samples.keys(): | |
| 59 key = str(index).encode('base64').strip() | |
| 60 as_treatment += ":"+key+":"+indexed_samples[index] | |
| 61 | |
| 62 self.add_treatment(as_treatment) | |
| 63 | |
| 64 def get_samples_from_block(self,decoded_block): | |
| 65 return [ self.do_decode(x) for x in decoded_block.split(",")] | |
| 66 | |
| 67 def add_treatments(self,argument): | |
| 68 print " - Adding treatments" | |
| 69 for treatment in argument: | |
| 70 self.add_treatment(treatment) | |
| 71 | |
| 72 def add_treatment(self,treatment_argument): | |
| 73 print " - Parsing treatment" | |
| 74 | |
| 75 | |
| 76 treatment_argument = treatment_argument.split(":") | |
| 77 name = self.do_decode(treatment_argument[0]) | |
| 78 treatment_argument = treatment_argument[1:] | |
| 79 | |
| 80 | |
| 81 treatment = {"factor_index":{},"sample_index":{}} | |
| 82 only_integers = True | |
| 83 | |
| 84 i = 1 | |
| 85 for item in treatment_argument: | |
| 86 if(i % 2): | |
| 87 factor = self.do_decode(item) | |
| 88 | |
| 89 if(treatment['factor_index'].has_key(factor)): | |
| 90 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") | |
| 91 sys.exit(0) | |
| 92 else: | |
| 93 print " - Adding factor: "+factor | |
| 94 treatment["factor_index"][factor] = [] | |
| 95 if(not factor.isdigit()): | |
| 96 only_integers = False | |
| 97 else: | |
| 98 for sample in item.split(","): | |
| 99 sample = self.do_decode(sample) | |
| 100 | |
| 101 if(not sample in self.samples): | |
| 102 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") | |
| 103 sys.exit(0) | |
| 104 | |
| 105 treatment["factor_index"][factor].append(sample) | |
| 106 if(treatment["sample_index"].has_key(sample)): | |
| 107 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") | |
| 108 sys.exit(0) | |
| 109 else: | |
| 110 treatment["sample_index"][sample] = factor | |
| 111 i += 1 | |
| 112 | |
| 113 treatment_factors = sorted(treatment["factor_index"].keys()) | |
| 114 | |
| 115 if(name == None): | |
| 116 treatment["name"] = "_vs_".join(treatment_factors) | |
| 117 else: | |
| 118 treatment["name"] = str(name) | |
| 119 | |
| 120 if(len(treatment["sample_index"]) != len(self.samples)): | |
| 121 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") | |
| 122 | |
| 123 if(only_integers): | |
| 124 treatment_type = "integer" | |
| 125 else: | |
| 126 treatment_type = "string" | |
| 127 | |
| 128 if(self.treatments.has_key(treatment["name"])): | |
| 129 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") | |
| 130 else: | |
| 131 self.treatments[treatment["name"]] = treatment | |
| 132 self.treatment_types[treatment["name"]] = treatment_type | |
| 133 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" | |
| 134 | |
| 135 def export(self,output): | |
| 136 # Open file stream | |
| 137 if(args.output == "-"): | |
| 138 fh = sys.stdout | |
| 139 else: | |
| 140 fh = open(args.output,"w") | |
| 141 | |
| 142 # Write header: | |
| 143 fh.write("sample-name\t"+"\t".join(self.treatments.keys())+"\n") | |
| 144 | |
| 145 # Write body: | |
| 146 for sample in self.samples: | |
| 147 fh.write(sample) | |
| 148 for treatment in self.treatments.keys(): | |
| 149 fh.write("\t"+self.treatments[treatment]["sample_index"][sample]) | |
| 150 fh.write("\n") | |
| 151 | |
| 152 fh.close() | |
| 153 | |
| 154 if __name__=="__main__": | |
| 155 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") | |
| 156 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) | |
| 157 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') | |
| 158 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') | |
| 159 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) | |
| 160 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') | |
| 161 | |
| 162 args = parser.parse_args() | |
| 163 | |
| 164 columns = None | |
| 165 if(args.columns_file): | |
| 166 with open(args.columns_file, "r") as f: | |
| 167 listed_columns = [None] + f.readline().strip("\n").split("\t") | |
| 168 for i in range(1,len(listed_columns)): | |
| 169 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') | |
| 170 | |
| 171 s = sampleContainer() | |
| 172 | |
| 173 if(listed_columns): | |
| 174 columns = [] | |
| 175 for sample in args.sample_names: | |
| 176 columns.append(listed_columns[int(sample)]) | |
| 177 | |
| 178 | |
| 179 treatments = [] | |
| 180 for treatment in args.treatments: | |
| 181 treatment = treatment.split(":") | |
| 182 for i in range(1,len(treatment)): | |
| 183 if(i%2 == 0): | |
| 184 treatment_tmp = treatment[i].split(",") | |
| 185 for j in range(len(treatment_tmp)): | |
| 186 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] | |
| 187 treatment[i] = ",".join(treatment_tmp) | |
| 188 | |
| 189 treatments.append(":".join(treatment)) | |
| 190 | |
| 191 blockings = [] | |
| 192 if(args.blocking): | |
| 193 for blocking in args.blocking: | |
| 194 blocking = blocking.split(":") | |
| 195 for i in range(1,len(blocking)): | |
| 196 block = blocking[i].split(",") | |
| 197 for j in range(len(block)): | |
| 198 block[j] = listed_columns[int(block[j])] | |
| 199 blocking[i] = ",".join(block) | |
| 200 blockings.append(":".join(blocking)) | |
| 201 | |
| 202 s.add_samples(columns) | |
| 203 s.add_treatments(treatments) | |
| 204 s.add_blocking(blockings) | |
| 205 | |
| 206 else: | |
| 207 s.add_samples(args.sample_names) | |
| 208 s.add_treatments(args.treatments) | |
| 209 if(args.blocking): | |
| 210 s.add_blocking(args.blocking) | |
| 211 | |
| 212 s.export(args.output) |
