Mercurial > repos > iuc > multigps
comparison multigps.py @ 2:91127c200437 draft
Uploaded
| author | iuc |
|---|---|
| date | Mon, 03 Apr 2017 11:21:10 -0400 |
| parents | |
| children | 0bec3b2df784 |
comparison
equal
deleted
inserted
replaced
| 1:944dcc240ab2 | 2:91127c200437 |
|---|---|
| 1 import argparse | |
| 2 import os | |
| 3 import shutil | |
| 4 import subprocess | |
| 5 import sys | |
| 6 import tempfile | |
| 7 | |
| 8 BUFF_SIZE = 1048576 | |
| 9 DESIGN_FILE = 'design.tabular' | |
| 10 | |
| 11 parser = argparse.ArgumentParser() | |
| 12 parser.add_argument('--all_events_table', dest='all_events_table', help='Output all events table file') | |
| 13 parser.add_argument('--alphascale', dest='alphascale', type=float, default=None, help='Alpha scaling factor') | |
| 14 parser.add_argument('--chrom_len_file', dest='chrom_len_file', help='File listing the lengths of all chromosomes') | |
| 15 parser.add_argument('--control', dest='control', default=None, help='Input control files and data formats') | |
| 16 parser.add_argument('--diffp', dest='diffp', type=float, default=None, help='Minimum p-value for reporting differential enrichment') | |
| 17 parser.add_argument('--edgerod', dest='edgerod', type=float, default=None, help='EdgeR over-dispersion parameter value') | |
| 18 parser.add_argument('--exclude', dest='exclude', default=None, help='File containing a set of regions to ignore during MultiGPS training') | |
| 19 parser.add_argument('--expt', dest='expt', default=None, help="Input expt files and data formats") | |
| 20 parser.add_argument('--eventsaretxt', dest='eventsaretxt', default=None, help='Append a .txt extension to the events file for browser rendering') | |
| 21 parser.add_argument('--fixedalpha', dest='fixedalpha', type=int, default=None, help='Impose this alpha') | |
| 22 parser.add_argument('--fixedmodelrange', dest='fixedmodelrange', default=None, help='Keep binding model range fixed to inital size') | |
| 23 parser.add_argument('--fixedpb', dest='fixedpb', type=int, default=None, help='Fixed per-base limit') | |
| 24 parser.add_argument('--fixedscaling', dest='fixedscaling', type=float, default=None, help='Multiply control counts by total tag count ratio and then by this factor') | |
| 25 parser.add_argument('--format', dest='format', default=None, help='Input expt file data format') | |
| 26 parser.add_argument('--gaussmodelsmoothing', dest='gaussmodelsmoothing', default=None, help='Use Gaussian model smoothing') | |
| 27 parser.add_argument('--gausssmoothparam', dest='gausssmoothparam', type=int, default=None, help='Smoothing factor') | |
| 28 parser.add_argument('--input_item', dest='input_items', action='append', nargs=7, default=None, help="Input files, attributes and options") | |
| 29 parser.add_argument('--jointinmodel', dest='jointinmodel', default=None, help='Allow joint events in model updates') | |
| 30 parser.add_argument('--mappability', dest='mappability', type=float, default=None, help='Fraction of the genome that is mappable for these experiments') | |
| 31 parser.add_argument('--maxtrainingrounds', dest='maxtrainingrounds', type=int, default=None, help='Maximum number of training rounds for updating binding event read distributions') | |
| 32 parser.add_argument('--medianscale', dest='medianscale', default=None, help='Use the median signal/control ratio as the scaling factor') | |
| 33 parser.add_argument('--meme1proc', dest='meme1proc', default=None, help='Do not run the parallel version of meme') | |
| 34 parser.add_argument('--mememaxw', dest='mememaxw', type=int, default=None, help='Maximum motif width for MEME') | |
| 35 parser.add_argument('--mememinw', dest='mememinw', type=int, default=None, help='Minimum motif width for MEME') | |
| 36 parser.add_argument('--memenmotifs', dest='memenmotifs', type=int, default=None, help='Number of motifs MEME should find for each condition') | |
| 37 parser.add_argument('--minfold', dest='minfold', type=float, default=None, help='Minimum event fold-change vs scaled control') | |
| 38 parser.add_argument('--minqvalue', dest='minqvalue', type=float, default=None, help='Minimum Q-value (corrected p-value) of reported binding events') | |
| 39 parser.add_argument('--minmodelupdateevents', dest='minmodelupdateevents', type=int, default=None, help='Minimum number of events to support an update of the read distribution') | |
| 40 parser.add_argument('--mlconfignotshared', dest='mlconfignotshared', default=None, help='Share component configs in the ML step') | |
| 41 parser.add_argument('--nocache', dest='nocache', default=None, help='Turn off caching of the entire set of experiments') | |
| 42 parser.add_argument('--nodifftests', dest='nodifftests', default=None, help='Run differential enrichment tests') | |
| 43 parser.add_argument('--nomodelsmoothing', dest='nomodelsmoothing', default=None, help='Perform binding model smoothing') | |
| 44 parser.add_argument('--nomodelupdate', dest='nomodelupdate', default=None, help='Perform binding model updates') | |
| 45 parser.add_argument('--nomotifprior', dest='nomotifprior', default=None, help='Perform motif-finding only') | |
| 46 parser.add_argument('--nomotifs', dest='nomotifs', default=None, help='Perform motif-finding and motif priors') | |
| 47 parser.add_argument('--nonunique', dest='nonunique', default=None, help='Use non-unique reads') | |
| 48 parser.add_argument('--noposprior', dest='noposprior', default=None, help='Perform inter-experiment positional prior') | |
| 49 parser.add_argument('--noscaling', dest='noscaling', default=None, help='Do not use signal vs control scaling') | |
| 50 parser.add_argument('--output_bed', dest='output_bed', help='Output bed results file') | |
| 51 parser.add_argument('--output_html', dest='output_html', help='Output html results file') | |
| 52 parser.add_argument('--output_html_files_path', dest='output_html_files_path', help='Output html extra files') | |
| 53 parser.add_argument('--plotscaling', dest='plotscaling', default=None, help='Plot diagnostic information for the chosen scaling method') | |
| 54 parser.add_argument('--poissongausspb', dest='poissongausspb', type=int, default=None, help='Poisson threshold for filtering per base') | |
| 55 parser.add_argument('--prlogconf', dest='prlogconf', type=int, default=None, help='Poisson log threshold for potential region scanning') | |
| 56 parser.add_argument('--probshared', dest='probshared', type=float, default=None, help='Probability that events are shared across conditions') | |
| 57 parser.add_argument('--readdistributionfile', dest='readdistributionfile', default=None, help='Optional binding event read distribution file for initializing models') | |
| 58 parser.add_argument('--regressionscale', dest='regressionscale', default=None, help='Use scaling by regression on binned tag counts') | |
| 59 parser.add_argument('--replicates_counts', dest='replicates_counts', help='Output replicates counts file') | |
| 60 parser.add_argument('--scalewin', dest='scalewin', type=int, default=None, help='Window size for estimating scaling ratios') | |
| 61 parser.add_argument('--seq', dest='seq', default=None, help='Reference genome path') | |
| 62 parser.add_argument('--sesscale', dest='sesscale', default=None, help='Estimate scaling factor by SES') | |
| 63 parser.add_argument('--splinesmoothparam', dest='splinesmoothparam', type=int, default=None, help='Spline smoothing parameter') | |
| 64 parser.add_argument('--threads', dest='threads', type=int, default=4, help='The number of threads to run') | |
| 65 args = parser.parse_args() | |
| 66 | |
| 67 | |
| 68 def generate_design_file(input_items): | |
| 69 design_file = open(DESIGN_FILE, 'w') | |
| 70 for item in input_items: | |
| 71 file_name, label, file_format, condition_name, replicate_name, experiment_type, fixed_read_count = item | |
| 72 file_format = file_format.upper() | |
| 73 items = [file_name, label, file_format, condition_name] | |
| 74 if replicate_name not in ['None', None, '']: | |
| 75 items.append(replicate_name) | |
| 76 if experiment_type not in ['None', None, '']: | |
| 77 items.append(experiment_type) | |
| 78 if fixed_read_count not in ['None', None, '']: | |
| 79 items.append(fixed_read_count) | |
| 80 design_file.write('%s\n' % '\t'.join(items)) | |
| 81 design_file.close() | |
| 82 | |
| 83 | |
| 84 def get_file_with_extension(dir, ext): | |
| 85 file_list = [f for f in os.listdir(dir) if f.endswith(ext)] | |
| 86 if len(file_list) == 1: | |
| 87 return file_list[0] | |
| 88 stop_err('Error running MultiGPS: output file with extension "%s" not generated.' % ext) | |
| 89 | |
| 90 | |
| 91 def get_stderr_exception(tmp_err, tmp_stderr, tmp_out, tmp_stdout, include_stdout=False): | |
| 92 tmp_stderr.close() | |
| 93 # Get stderr, allowing for case where it's very large. | |
| 94 tmp_stderr = open(tmp_err, 'rb') | |
| 95 stderr_str = '' | |
| 96 buffsize = BUFF_SIZE | |
| 97 try: | |
| 98 while True: | |
| 99 stderr_str += tmp_stderr.read(buffsize) | |
| 100 if not stderr_str or len(stderr_str) % buffsize != 0: | |
| 101 break | |
| 102 except OverflowError: | |
| 103 pass | |
| 104 tmp_stderr.close() | |
| 105 if include_stdout: | |
| 106 tmp_stdout = open(tmp_out, 'rb') | |
| 107 stdout_str = '' | |
| 108 buffsize = BUFF_SIZE | |
| 109 try: | |
| 110 while True: | |
| 111 stdout_str += tmp_stdout.read(buffsize) | |
| 112 if not stdout_str or len(stdout_str) % buffsize != 0: | |
| 113 break | |
| 114 except OverflowError: | |
| 115 pass | |
| 116 tmp_stdout.close() | |
| 117 if include_stdout: | |
| 118 return 'STDOUT\n%s\n\nSTDERR\n%s\n' % (stdout_str, stderr_str) | |
| 119 return stderr_str | |
| 120 | |
| 121 | |
| 122 def stop_err(msg): | |
| 123 sys.stderr.write(msg) | |
| 124 sys.exit(1) | |
| 125 | |
| 126 | |
| 127 # Preparation. | |
| 128 tmp_dir = tempfile.mkdtemp(prefix="tmp-multigps-") | |
| 129 os.makedirs(args.output_html_files_path) | |
| 130 # Build the command line. | |
| 131 cmd = 'multigps' | |
| 132 # General options | |
| 133 cmd += ' --threads %s' % args.threads | |
| 134 if args.eventsaretxt is not None: | |
| 135 # Append .txt extensions to events hrefs | |
| 136 # in output dataset so files will render | |
| 137 # in the browser. | |
| 138 cmd += ' --eventsaretxt' | |
| 139 if args.meme1proc is not None: | |
| 140 # Do not run the parallel version of meme. | |
| 141 cmd += ' --meme1proc' | |
| 142 # Experiment. | |
| 143 if args.input_items is not None: | |
| 144 generate_design_file(args.input_items) | |
| 145 cmd += ' --design %s' % DESIGN_FILE | |
| 146 else: | |
| 147 if args.expt is not None: | |
| 148 cmd += ' --expt %s' % args.expt | |
| 149 if args.format is not None: | |
| 150 cmd += ' --format %s' % args.format | |
| 151 if args.control is not None: | |
| 152 cmd += ' --ctrl %s' % args.control | |
| 153 cmd += ' --geninfo %s' % args.chrom_len_file | |
| 154 # Advanced options. | |
| 155 if args.seq is not None: | |
| 156 cmd += ' --seq %s' % args.seq | |
| 157 # Limits on how many reads | |
| 158 if args.fixedpb is not None: | |
| 159 cmd += ' --fixedpb %d' % args.fixedpb | |
| 160 if args.poissongausspb is not None: | |
| 161 cmd += ' --poissongausspb %d' % args.poissongausspb | |
| 162 if args.nonunique is not None: | |
| 163 cmd += ' --nonunique' | |
| 164 if args.mappability is not None: | |
| 165 cmd += ' --mappability %4f' % args.mappability | |
| 166 if args.nocache is not None: | |
| 167 cmd += ' --nocache' | |
| 168 # Scaling data.' | |
| 169 if args.noscaling is not None: | |
| 170 cmd += ' --noscaling %s' % args.noscaling | |
| 171 if args.medianscale is not None: | |
| 172 cmd += ' --medianscale %s' % args.medianscale | |
| 173 if args.regressionscale is not None: | |
| 174 cmd += ' --regressionscale %s' % args.regressionscale | |
| 175 if args.sesscale is not None: | |
| 176 cmd += ' --sesscale %s' % args.sesscale | |
| 177 if args.fixedscaling is not None: | |
| 178 cmd += ' --fixedscaling %4f' % args.fixedscaling | |
| 179 if args.scalewin is not None: | |
| 180 cmd += ' --scalewin %d' % args.scalewin | |
| 181 if args.plotscaling is not None: | |
| 182 cmd += ' --plotscaling %s' % args.plotscaling | |
| 183 # Running MultiGPS. | |
| 184 if args.readdistributionfile is not None: | |
| 185 cmd += ' --d %s' % args.readdistributionfile | |
| 186 if args.maxtrainingrounds is not None: | |
| 187 cmd += ' --r %s' % args.maxtrainingrounds | |
| 188 if args.nomodelupdate is not None: | |
| 189 cmd += ' --nomodelupdate' | |
| 190 if args.minmodelupdateevents is not None: | |
| 191 cmd += ' --minmodelupdateevents %d' % args.minmodelupdateevents | |
| 192 if args.nomodelsmoothing is not None: | |
| 193 cmd += ' --nomodelsmoothing' | |
| 194 if args.splinesmoothparam is not None: | |
| 195 cmd += ' --splinesmoothparam %d' % args.splinesmoothparam | |
| 196 if args.gaussmodelsmoothing is not None: | |
| 197 cmd += ' --gaussmodelsmoothing' | |
| 198 if args.gausssmoothparam is not None: | |
| 199 cmd += ' --gausssmoothparam %s' % args.gausssmoothparam | |
| 200 if args.jointinmodel is not None: | |
| 201 cmd += ' --jointinmodel' | |
| 202 if args.fixedmodelrange is not None: | |
| 203 cmd += ' --fixedmodelrange' | |
| 204 if args.prlogconf is not None: | |
| 205 cmd += ' --prlogconf %d' % args.prlogconf | |
| 206 if args.fixedalpha is not None: | |
| 207 cmd += ' --fixedalpha %d' % args.fixedalpha | |
| 208 if args.alphascale is not None: | |
| 209 cmd += ' --alphascale %4f' % args.alphascale | |
| 210 if args.mlconfignotshared is not None: | |
| 211 cmd += ' --mlconfignotshared' | |
| 212 if args.exclude not in [None, 'None']: | |
| 213 cmd += ' --exclude %s' % args.exclude_file | |
| 214 # MultiGPS priors | |
| 215 if args.noposprior is not None: | |
| 216 cmd += ' --noposprior' | |
| 217 if args.probshared is not None: | |
| 218 cmd += ' --probshared %4f' % args.probshared | |
| 219 if args.memenmotifs is not None: | |
| 220 cmd += ' --memenmotifs %d' % args.memenmotifs | |
| 221 if args.mememinw is not None: | |
| 222 cmd += ' --mememinw %d' % args.mememinw | |
| 223 if args.mememaxw is not None: | |
| 224 cmd += ' --mememaxw %d' % args.mememaxw | |
| 225 if args.nomotifs is not None: | |
| 226 cmd += ' --nomotifs' | |
| 227 if args.nomotifprior is not None: | |
| 228 cmd += ' --nomotifprior' | |
| 229 # Reporting binding events | |
| 230 if args.minqvalue is not None: | |
| 231 cmd += ' --q %4f' % args.minqvalue | |
| 232 if args.minfold is not None: | |
| 233 cmd += ' --minfold %4f' % args.minfold | |
| 234 if args.nodifftests is not None: | |
| 235 cmd += ' --nodifftests' | |
| 236 if args.edgerod is not None: | |
| 237 cmd += ' --edgerod %4f' % args.edgerod | |
| 238 if args.diffp is not None: | |
| 239 cmd += ' --diffp %4f' % args.diffp | |
| 240 # Output directory. | |
| 241 cmd += ' --out %s' % args.output_html_files_path | |
| 242 # Define command response buffers. | |
| 243 tmp_out = tempfile.NamedTemporaryFile(dir=tmp_dir).name | |
| 244 tmp_stdout = open(tmp_out, 'wb') | |
| 245 tmp_err = tempfile.NamedTemporaryFile(dir=tmp_dir).name | |
| 246 tmp_stderr = open(tmp_err, 'wb') | |
| 247 tmp_filename = tempfile.NamedTemporaryFile(dir=tmp_dir).name | |
| 248 # Execute the command. | |
| 249 proc = subprocess.Popen(args=cmd, stderr=tmp_stderr, stdout=tmp_stdout, shell=True) | |
| 250 rc = proc.wait() | |
| 251 if rc != 0: | |
| 252 error_message = get_stderr_exception(tmp_err, tmp_stderr, tmp_out, tmp_stdout) | |
| 253 stop_err(error_message) | |
| 254 # Move each output file to the approapriate output dataset path. | |
| 255 output_bed = get_file_with_extension(args.output_html_files_path, 'bed') | |
| 256 shutil.move(os.path.join(args.output_html_files_path, output_bed), args.output_bed) | |
| 257 output_html = get_file_with_extension(args.output_html_files_path, 'html') | |
| 258 shutil.move(os.path.join(args.output_html_files_path, output_html), args.output_html) | |
| 259 replicates_counts = get_file_with_extension(args.output_html_files_path, 'counts') | |
| 260 shutil.move(os.path.join(args.output_html_files_path, replicates_counts), args.replicates_counts) | |
| 261 all_events_table = get_file_with_extension(args.output_html_files_path, 'table.txt') | |
| 262 shutil.move(os.path.join(args.output_html_files_path, all_events_table), args.all_events_table) | |
| 263 # Clean up. | |
| 264 if os.path.exists(tmp_dir): | |
| 265 shutil.rmtree(tmp_dir) |
