diff utils/sim-genome.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/sim-genome.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+from __future__ import division
+from __future__ import print_function
+import sys
+import random
+import argparse
+import numpy
+import sim
+import fastareader
+
+ARG_DEFAULTS = {'spacing':600, 'indel_rate':0.15, 'ext_rate':0.3}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """Generate a version of the input reference with randomly added variants."""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.set_defaults(**ARG_DEFAULTS)
+
+  parser.add_argument('ref', type=fastareader.FastaLineGenerator,
+    help='The original reference genome (FASTA).')
+  parser.add_argument('-m', '--mutations', type=argparse.FileType('w'),
+    help='Write inserted mutations here.')
+  parser.add_argument('-s', '--spacing', type=int,
+    help='Average (or exact) spacing between variants, in bp. Note: must be less than the line '
+         'width in the reference file. Default: %(default)s')
+  parser.add_argument('-r', '--random', action='store_true',
+    help='Randomly distribute the variants instead of evenly spacing them.')
+  parser.add_argument('-N', '--ref-name',
+    help='Name the output sequence this.')
+  parser.add_argument('-i', '--indel-rate', type=float,
+    help='Default: %(default)s')
+  parser.add_argument('-E', '--extension-rate', dest='ext_rate', type=float,
+    help='Default: %(default)s')
+  parser.add_argument('-S', '--seed', type=int,
+    help='Default: random.')
+
+  args = parser.parse_args(argv[1:])
+
+  if args.seed is None:
+    seed = random.randint(0, 2**31-1)
+    sys.stderr.write('seed: {}\n'.format(seed))
+  else:
+    seed = args.seed
+  random.seed(seed)
+
+  var_prob = 1/args.spacing
+
+  next_var = args.spacing
+  coord = 0
+  chr_name = None
+  for line in args.ref:
+    if args.ref.name != chr_name:
+      chr_name = args.ref.name
+      if args.ref_name:
+        chr_id = args.ref_name.split()[0]
+        print('>'+args.ref_name)
+      else:
+        chr_id = args.ref.id
+        print('>'+chr_name)
+    end_coord = coord + len(line)
+    new_line = line
+    if args.random:
+      n_vars = numpy.random.binomial(len(line), var_prob)
+      for i in range(n_vars):
+        vcoord = random.randint(coord+1, end_coord)
+        vtype, alt = sim.make_mutation(args.indel_rate, args.ext_rate)
+        var = {'coord':vcoord-coord-1, 'type':vtype, 'alt':alt}
+        # sys.stderr.write('Adding var at {}: {} ({})\n'.format(vcoord, vtype, alt))
+        new_line = sim.apply_mutation(var, new_line)
+        if args.mutations:
+          var['coord'] = vcoord
+          sim.log_mutations(args.mutations, [var], '.', chr_id, '.', '.')
+    else:
+      if next_var <= end_coord:
+        vtype, alt = sim.make_mutation(args.indel_rate, args.ext_rate)
+        var = {'coord':next_var-coord-1, 'type':vtype, 'alt':alt}
+        # sys.stderr.write('Adding var at {}: {} ({})\n'.format(next_var, vtype, alt))
+        new_line = sim.apply_mutation(var, new_line)
+        if args.mutations:
+          var['coord'] = next_var
+          sim.log_mutations(args.mutations, [var], '.', chr_id, '.', '.')
+        next_var += args.spacing
+    print(new_line)
+    coord += len(line)
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))