Mercurial > repos > soranzo > sequel
diff sequel_wrapper.py @ 0:33f282b9dafe draft
Uploaded
| author | soranzo |
|---|---|
| date | Fri, 18 Jul 2014 07:07:31 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequel_wrapper.py Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +""" +SEQuel +version 0.2 (andrea.pinna@crs4.it) +""" + +import optparse +import os +import shutil +import subprocess +import sys +import tempfile +import uuid + +def external_insert_size(prep_file): + """ Retrieve average external insert-size from pre-processing log """ + with open(prep_file, 'r') as f: + for line in f: + if line[0:30] == '\tAverage external insert-size:': + ext_int_size = line.split(':')[1][1:-1] + break + else: + sys.exit('Average external insert-size not found in %s' % prep_file) + return ext_int_size + + +def __main__(): + # load arguments + print 'Parsing SEQuel input options...' + parser = optparse.OptionParser() + parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='') + parser.add_option('--read1', dest='read1', help='') + parser.add_option('--read2', dest='read2', help='') + parser.add_option('--contigs', dest='contigs', help='') + parser.add_option('--bases_length', dest='bases_length', type='int', help='') + parser.add_option('-t', dest='n_threads', type='int', help='') + parser.add_option('-p', dest='max_threads', type='int', help='') + parser.add_option('-u', dest='min_threads', type='int', help='') + parser.add_option('--kmer_size', dest='kmer_size', type='int', help='') + parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='') + parser.add_option('--min_fraction', dest='min_fraction', type='float', help='') + parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='') + parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='') + parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='') + parser.add_option('--discard_positional', dest='discard_positional', type='int', help='') + parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='') + parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='') + parser.add_option('--report_changes', action='store_true', dest='report_changes', help='') + parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='') + parser.add_option('--reference_genome', dest='reference_genome', help='') + parser.add_option('--contigs_refined', dest='contigs_refined', help='') + parser.add_option('--logprep', dest='logprep', help='') + parser.add_option('--logseq', dest='logseq', help='') + parser.add_option('--logfile_prep', dest='logfile_prep', help='') + parser.add_option('--logfile_seq', dest='logfile_seq', help='') + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error('Wrong number of arguments') + + prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name + seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name + + # Build SEQuel (pre-processing) command to be executed + # basic preprocessing + prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs) + bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else '' + n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else '' + cmd_prep_dir = "-o %s" % (prep_directory) + cmd_seq_dir = "-o %s" % (seq_directory) + # -i INT (external insert size of paired-end reads (from prep.log) + # -p INT + max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else '' + # -u INT + min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else '' + # -A DIR + input_directory = "-A %s" % (prep_directory) + # -k INT + kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else '' + # -d INT + max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else '' + # -f FLOAT + min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else '' + # -l INT + min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else '' + # -v FLOAT + min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else '' + # -m INT + discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else '' + # -n INT + discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else '' + # -q INT + min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else '' + # -s + single_cell_mode = '-s' if options.single_cell_mode else '' + # -r + report_changes = '-r' if options.report_changes else '' + # -e + extend_contig = '-e' if options.extend_contig else '' + # -g FILE + reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else '' + # contigs_refined.fa + contigs_refined = options.contigs_refined + # logprep & logseq + logprep = options.logprep + logseq = options.logseq + # x.refined-fa + # x.stdout + # logfile + logfile_prep = options.logfile_prep + logfile_seq = options.logfile_seq + + # Build SEQuel (pre-processing) command + cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir]) + print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1) + + # Execution of SEQuel (pre-processing) + print 'Executing SEQuel (pre-processing)...' + logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout + try: + try: + subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True) + finally: + if logfile_prep_file != sys.stdout: + logfile_prep_file.close() + print 'SEQuel (pre-processing) executed!' + + prep_file = os.path.join(prep_directory, 'prep.log') + ext_ins_size = external_insert_size(prep_file) + + # Build SEQuel command + cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir) + print '\nSEQuel command to be executed:\n %s' % (cmd2) + + # Execution of SEQuel + print 'Executing SEQuel...' + logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout + try: + subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError, e: + if e.retcode == 24: + sys.exit("Too many contigs in the assembly!") + else: + raise + finally: + if logfile_seq_file != sys.stdout: + logfile_seq_file.close() + print 'SEQuel executed!' + + shutil.move(prep_file, logprep) + shutil.move(os.path.join(seq_directory, 'Log'), logseq) + shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined) + finally: + shutil.rmtree(prep_directory, ignore_errors=True) + shutil.rmtree(seq_directory, ignore_errors=True) + + +if __name__ == "__main__": + __main__()
