Mercurial > repos > soranzo > sequel
comparison sequel_wrapper.py @ 0:33f282b9dafe draft
Uploaded
| author | soranzo |
|---|---|
| date | Fri, 18 Jul 2014 07:07:31 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:33f282b9dafe |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 SEQuel | |
| 4 version 0.2 (andrea.pinna@crs4.it) | |
| 5 """ | |
| 6 | |
| 7 import optparse | |
| 8 import os | |
| 9 import shutil | |
| 10 import subprocess | |
| 11 import sys | |
| 12 import tempfile | |
| 13 import uuid | |
| 14 | |
| 15 def external_insert_size(prep_file): | |
| 16 """ Retrieve average external insert-size from pre-processing log """ | |
| 17 with open(prep_file, 'r') as f: | |
| 18 for line in f: | |
| 19 if line[0:30] == '\tAverage external insert-size:': | |
| 20 ext_int_size = line.split(':')[1][1:-1] | |
| 21 break | |
| 22 else: | |
| 23 sys.exit('Average external insert-size not found in %s' % prep_file) | |
| 24 return ext_int_size | |
| 25 | |
| 26 | |
| 27 def __main__(): | |
| 28 # load arguments | |
| 29 print 'Parsing SEQuel input options...' | |
| 30 parser = optparse.OptionParser() | |
| 31 parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='') | |
| 32 parser.add_option('--read1', dest='read1', help='') | |
| 33 parser.add_option('--read2', dest='read2', help='') | |
| 34 parser.add_option('--contigs', dest='contigs', help='') | |
| 35 parser.add_option('--bases_length', dest='bases_length', type='int', help='') | |
| 36 parser.add_option('-t', dest='n_threads', type='int', help='') | |
| 37 parser.add_option('-p', dest='max_threads', type='int', help='') | |
| 38 parser.add_option('-u', dest='min_threads', type='int', help='') | |
| 39 parser.add_option('--kmer_size', dest='kmer_size', type='int', help='') | |
| 40 parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='') | |
| 41 parser.add_option('--min_fraction', dest='min_fraction', type='float', help='') | |
| 42 parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='') | |
| 43 parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='') | |
| 44 parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='') | |
| 45 parser.add_option('--discard_positional', dest='discard_positional', type='int', help='') | |
| 46 parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='') | |
| 47 parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='') | |
| 48 parser.add_option('--report_changes', action='store_true', dest='report_changes', help='') | |
| 49 parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='') | |
| 50 parser.add_option('--reference_genome', dest='reference_genome', help='') | |
| 51 parser.add_option('--contigs_refined', dest='contigs_refined', help='') | |
| 52 parser.add_option('--logprep', dest='logprep', help='') | |
| 53 parser.add_option('--logseq', dest='logseq', help='') | |
| 54 parser.add_option('--logfile_prep', dest='logfile_prep', help='') | |
| 55 parser.add_option('--logfile_seq', dest='logfile_seq', help='') | |
| 56 (options, args) = parser.parse_args() | |
| 57 if len(args) > 0: | |
| 58 parser.error('Wrong number of arguments') | |
| 59 | |
| 60 prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name | |
| 61 seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name | |
| 62 | |
| 63 # Build SEQuel (pre-processing) command to be executed | |
| 64 # basic preprocessing | |
| 65 prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs) | |
| 66 bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else '' | |
| 67 n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else '' | |
| 68 cmd_prep_dir = "-o %s" % (prep_directory) | |
| 69 cmd_seq_dir = "-o %s" % (seq_directory) | |
| 70 # -i INT (external insert size of paired-end reads (from prep.log) | |
| 71 # -p INT | |
| 72 max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else '' | |
| 73 # -u INT | |
| 74 min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else '' | |
| 75 # -A DIR | |
| 76 input_directory = "-A %s" % (prep_directory) | |
| 77 # -k INT | |
| 78 kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else '' | |
| 79 # -d INT | |
| 80 max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else '' | |
| 81 # -f FLOAT | |
| 82 min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else '' | |
| 83 # -l INT | |
| 84 min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else '' | |
| 85 # -v FLOAT | |
| 86 min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else '' | |
| 87 # -m INT | |
| 88 discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else '' | |
| 89 # -n INT | |
| 90 discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else '' | |
| 91 # -q INT | |
| 92 min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else '' | |
| 93 # -s | |
| 94 single_cell_mode = '-s' if options.single_cell_mode else '' | |
| 95 # -r | |
| 96 report_changes = '-r' if options.report_changes else '' | |
| 97 # -e | |
| 98 extend_contig = '-e' if options.extend_contig else '' | |
| 99 # -g FILE | |
| 100 reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else '' | |
| 101 # contigs_refined.fa | |
| 102 contigs_refined = options.contigs_refined | |
| 103 # logprep & logseq | |
| 104 logprep = options.logprep | |
| 105 logseq = options.logseq | |
| 106 # x.refined-fa | |
| 107 # x.stdout | |
| 108 # logfile | |
| 109 logfile_prep = options.logfile_prep | |
| 110 logfile_seq = options.logfile_seq | |
| 111 | |
| 112 # Build SEQuel (pre-processing) command | |
| 113 cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir]) | |
| 114 print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1) | |
| 115 | |
| 116 # Execution of SEQuel (pre-processing) | |
| 117 print 'Executing SEQuel (pre-processing)...' | |
| 118 logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout | |
| 119 try: | |
| 120 try: | |
| 121 subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True) | |
| 122 finally: | |
| 123 if logfile_prep_file != sys.stdout: | |
| 124 logfile_prep_file.close() | |
| 125 print 'SEQuel (pre-processing) executed!' | |
| 126 | |
| 127 prep_file = os.path.join(prep_directory, 'prep.log') | |
| 128 ext_ins_size = external_insert_size(prep_file) | |
| 129 | |
| 130 # Build SEQuel command | |
| 131 cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir) | |
| 132 print '\nSEQuel command to be executed:\n %s' % (cmd2) | |
| 133 | |
| 134 # Execution of SEQuel | |
| 135 print 'Executing SEQuel...' | |
| 136 logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout | |
| 137 try: | |
| 138 subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True) | |
| 139 except subprocess.CalledProcessError, e: | |
| 140 if e.retcode == 24: | |
| 141 sys.exit("Too many contigs in the assembly!") | |
| 142 else: | |
| 143 raise | |
| 144 finally: | |
| 145 if logfile_seq_file != sys.stdout: | |
| 146 logfile_seq_file.close() | |
| 147 print 'SEQuel executed!' | |
| 148 | |
| 149 shutil.move(prep_file, logprep) | |
| 150 shutil.move(os.path.join(seq_directory, 'Log'), logseq) | |
| 151 shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined) | |
| 152 finally: | |
| 153 shutil.rmtree(prep_directory, ignore_errors=True) | |
| 154 shutil.rmtree(seq_directory, ignore_errors=True) | |
| 155 | |
| 156 | |
| 157 if __name__ == "__main__": | |
| 158 __main__() |
