Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison design_matrix_creator @ 79:c86f1f86b3c1 draft
Uploaded
| author | yhoogstrate |
|---|---|
| date | Thu, 13 Nov 2014 10:30:43 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 78:398963a4827a | 79:c86f1f86b3c1 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse, os, shutil, sys, tempfile, subprocess | |
| 4 | |
| 5 __version_info__ = ('1', '0', '0')#, 'beta') | |
| 6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] | |
| 7 __author__ = 'Youri Hoogstrate' | |
| 8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' | |
| 9 __license__ = 'GNU General Public License v3 (GPLv3)' | |
| 10 | |
| 11 | |
| 12 class sampleContainer: | |
| 13 def __init__(self): | |
| 14 self.samples = [] | |
| 15 self.treatments = {} | |
| 16 self.treatment_index = [] | |
| 17 self.treatment_types = {} | |
| 18 | |
| 19 def do_decode(self,encoded_str): | |
| 20 return encoded_str.decode("base64").strip().replace("\t",'') | |
| 21 | |
| 22 def add_samples(self,argument): | |
| 23 print " - Adding samples" | |
| 24 for sample in argument: | |
| 25 self.add_sample(self.do_decode(sample)) | |
| 26 | |
| 27 def add_sample(self,sample): | |
| 28 if(sample in self.samples): | |
| 29 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") | |
| 30 sys.exit(1) | |
| 31 else: | |
| 32 self.samples.append(sample) | |
| 33 print " - Added: "+sample | |
| 34 | |
| 35 def add_blocking(self,argument): | |
| 36 print " - Adding paired samples" | |
| 37 pair = [] | |
| 38 for block in argument: | |
| 39 self.add_block(block) | |
| 40 | |
| 41 def add_block(self,blocks): | |
| 42 blocks = blocks.split(":") | |
| 43 as_treatment = blocks[0] | |
| 44 blocks = blocks[1:] | |
| 45 | |
| 46 used_samples = [] | |
| 47 indexed_samples = {} | |
| 48 | |
| 49 for i in range(len(blocks)): | |
| 50 block = blocks[i] | |
| 51 samples = self.get_samples_from_block(block) | |
| 52 indexed_samples[i+1] = [] | |
| 53 for sample in samples: | |
| 54 if(sample in used_samples): | |
| 55 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") | |
| 56 sys.exit(0) | |
| 57 else: | |
| 58 indexed_samples[i+1] = block | |
| 59 used_samples.append(sample) | |
| 60 | |
| 61 for sample in self.samples: | |
| 62 if(sample not in used_samples): | |
| 63 i = i + 1 | |
| 64 indexed_samples[i+1] = str(sample).encode('base64').strip() | |
| 65 | |
| 66 for index in indexed_samples.keys(): | |
| 67 key = str(index).encode('base64').strip() | |
| 68 as_treatment += ":"+key+":"+indexed_samples[index] | |
| 69 | |
| 70 self.add_treatment(as_treatment) | |
| 71 | |
| 72 def get_samples_from_block(self,decoded_block): | |
| 73 return [ self.do_decode(x) for x in decoded_block.split(",")] | |
| 74 | |
| 75 def add_treatments(self,argument): | |
| 76 print " - Adding treatments" | |
| 77 for treatment in argument: | |
| 78 self.add_treatment(treatment) | |
| 79 | |
| 80 def add_treatment(self,treatment_argument): | |
| 81 print " - Parsing treatment" | |
| 82 | |
| 83 | |
| 84 treatment_argument = treatment_argument.split(":") | |
| 85 name = self.do_decode(treatment_argument[0]) | |
| 86 treatment_argument = treatment_argument[1:] | |
| 87 | |
| 88 | |
| 89 treatment = {"factor_index":{},"sample_index":{}} | |
| 90 only_integers = True | |
| 91 | |
| 92 i = 1 | |
| 93 for item in treatment_argument: | |
| 94 if(i % 2): | |
| 95 factor = self.do_decode(item) | |
| 96 | |
| 97 if(treatment['factor_index'].has_key(factor)): | |
| 98 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") | |
| 99 sys.exit(0) | |
| 100 else: | |
| 101 print " - Adding factor: "+factor | |
| 102 treatment["factor_index"][factor] = [] | |
| 103 if(not factor.isdigit()): | |
| 104 only_integers = False | |
| 105 else: | |
| 106 for sample in item.split(","): | |
| 107 sample = self.do_decode(sample) | |
| 108 | |
| 109 if(not sample in self.samples): | |
| 110 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") | |
| 111 sys.exit(0) | |
| 112 | |
| 113 treatment["factor_index"][factor].append(sample) | |
| 114 if(treatment["sample_index"].has_key(sample)): | |
| 115 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") | |
| 116 sys.exit(0) | |
| 117 else: | |
| 118 treatment["sample_index"][sample] = factor | |
| 119 i += 1 | |
| 120 | |
| 121 treatment_factors = sorted(treatment["factor_index"].keys()) | |
| 122 | |
| 123 if(name == None): | |
| 124 treatment["name"] = "_vs_".join(treatment_factors) | |
| 125 else: | |
| 126 treatment["name"] = str(name) | |
| 127 | |
| 128 if(len(treatment["sample_index"]) != len(self.samples)): | |
| 129 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") | |
| 130 | |
| 131 if(only_integers): | |
| 132 treatment_type = "integer" | |
| 133 else: | |
| 134 treatment_type = "string" | |
| 135 | |
| 136 if(self.treatments.has_key(treatment["name"])): | |
| 137 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") | |
| 138 else: | |
| 139 self.treatments[treatment["name"]] = treatment | |
| 140 self.treatment_index.append(treatment["name"]) | |
| 141 self.treatment_types[treatment["name"]] = treatment_type | |
| 142 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" | |
| 143 | |
| 144 def export(self,output): | |
| 145 # Open file stream | |
| 146 if(args.output == "-"): | |
| 147 fh = sys.stdout | |
| 148 else: | |
| 149 fh = open(args.output,"w") | |
| 150 | |
| 151 # Write header: | |
| 152 fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") | |
| 153 | |
| 154 # Write body: | |
| 155 for sample in self.samples: | |
| 156 fh.write(sample) | |
| 157 for treatment_id in self.treatment_index: | |
| 158 treatment = self.treatments[treatment_id] | |
| 159 fh.write("\t"+treatment["sample_index"][sample]) | |
| 160 fh.write("\n") | |
| 161 | |
| 162 fh.close() | |
| 163 | |
| 164 if __name__=="__main__": | |
| 165 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") | |
| 166 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) | |
| 167 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') | |
| 168 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') | |
| 169 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) | |
| 170 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') | |
| 171 | |
| 172 args = parser.parse_args() | |
| 173 | |
| 174 columns = None | |
| 175 if(args.columns_file): | |
| 176 with open(args.columns_file, "r") as f: | |
| 177 listed_columns = [None] + f.readline().strip("\n").split("\t") | |
| 178 for i in range(1,len(listed_columns)): | |
| 179 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') | |
| 180 | |
| 181 s = sampleContainer() | |
| 182 | |
| 183 if(listed_columns): | |
| 184 columns = [] | |
| 185 for sample in args.sample_names: | |
| 186 columns.append(listed_columns[int(sample)]) | |
| 187 | |
| 188 | |
| 189 treatments = [] | |
| 190 for treatment in args.treatments: | |
| 191 treatment = treatment.split(":") | |
| 192 for i in range(1,len(treatment)): | |
| 193 if(i%2 == 0): | |
| 194 treatment_tmp = treatment[i].split(",") | |
| 195 for j in range(len(treatment_tmp)): | |
| 196 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] | |
| 197 treatment[i] = ",".join(treatment_tmp) | |
| 198 | |
| 199 treatments.append(":".join(treatment)) | |
| 200 | |
| 201 blockings = [] | |
| 202 if(args.blocking): | |
| 203 for blocking in args.blocking: | |
| 204 blocking = blocking.split(":") | |
| 205 for i in range(1,len(blocking)): | |
| 206 block = blocking[i].split(",") | |
| 207 for j in range(len(block)): | |
| 208 block[j] = listed_columns[int(block[j])] | |
| 209 blocking[i] = ",".join(block) | |
| 210 blockings.append(":".join(blocking)) | |
| 211 | |
| 212 s.add_samples(columns) | |
| 213 s.add_treatments(treatments) | |
| 214 s.add_blocking(blockings) | |
| 215 | |
| 216 else: | |
| 217 s.add_samples(args.sample_names) | |
| 218 s.add_treatments(args.treatments) | |
| 219 if(args.blocking): | |
| 220 s.add_blocking(args.blocking) | |
| 221 | |
| 222 s.export(args.output) |
