# HG changeset patch # User soranzo # Date 1405681651 14400 # Node ID 33f282b9dafe642eba8d0ca6d2b15cd1d902337f Uploaded diff -r 000000000000 -r 33f282b9dafe COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,23 @@ +Copyright © 2013-2014 CRS4 Srl. http://www.crs4.it/ +Created by: +Andrea Pinna +Nicola Soranzo + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff -r 000000000000 -r 33f282b9dafe readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,29 @@ +SEQuel wrapper +============== + +Configuration +------------- + +sequel_wrapper tool may be configured to use more than one CPU core by selecting an appropriate destination for this tool in Galaxy job_conf.xml file (see https://wiki.galaxyproject.org/Admin/Config/Jobs and https://wiki.galaxyproject.org/Admin/Config/Performance/Cluster ). + +If you are using Galaxy release_2013.11.04 or later, this tool will automatically use the number of CPU cores allocated by the job runner according to the configuration of the destination selected for this tool. + +If instead you are using an older Galaxy release, you should also add a line + + GALAXY_SLOTS=N; export GALAXY_SLOTS + +(where N is the number of CPU cores allocated by the job runner for this tool) to the file + + /sequel/1.0.2/crs4/sequel//env.sh + +Version history +--------------- + +- Release 2: Fix version for blat requirement (reported by Björn Grüning). Update Orione citation. +- Release 1: Use $GALAXY_SLOTS instead of $SEQUEL_SITE_OPTIONS. Depend on package_blat_35x1 . Add readme.rst . Update Orione citation. +- Release 0: Initial release in the Tool Shed. + +Development +----------- + +Development is hosted at https://bitbucket.org/crs4/orione-tools . Contributions and bug reports are very welcome! diff -r 000000000000 -r 33f282b9dafe sequel_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequel_wrapper.py Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +""" +SEQuel +version 0.2 (andrea.pinna@crs4.it) +""" + +import optparse +import os +import shutil +import subprocess +import sys +import tempfile +import uuid + +def external_insert_size(prep_file): + """ Retrieve average external insert-size from pre-processing log """ + with open(prep_file, 'r') as f: + for line in f: + if line[0:30] == '\tAverage external insert-size:': + ext_int_size = line.split(':')[1][1:-1] + break + else: + sys.exit('Average external insert-size not found in %s' % prep_file) + return ext_int_size + + +def __main__(): + # load arguments + print 'Parsing SEQuel input options...' + parser = optparse.OptionParser() + parser.add_option('--sequel_jar_path', dest='sequel_jar_path', help='') + parser.add_option('--read1', dest='read1', help='') + parser.add_option('--read2', dest='read2', help='') + parser.add_option('--contigs', dest='contigs', help='') + parser.add_option('--bases_length', dest='bases_length', type='int', help='') + parser.add_option('-t', dest='n_threads', type='int', help='') + parser.add_option('-p', dest='max_threads', type='int', help='') + parser.add_option('-u', dest='min_threads', type='int', help='') + parser.add_option('--kmer_size', dest='kmer_size', type='int', help='') + parser.add_option('--max_positional_error', dest='max_positional_error', type='int', help='') + parser.add_option('--min_fraction', dest='min_fraction', type='float', help='') + parser.add_option('--min_aln_length', dest='min_aln_length', type='int', help='') + parser.add_option('--min_avg_coverage', dest='min_avg_coverage', type='float', help='') + parser.add_option('--discard_kmers', dest='discard_kmers', type='int', help='') + parser.add_option('--discard_positional', dest='discard_positional', type='int', help='') + parser.add_option('--min_aln_score', dest='min_aln_score', type='int', help='') + parser.add_option('--single_cell_mode', action='store_true', dest='single_cell_mode', help='') + parser.add_option('--report_changes', action='store_true', dest='report_changes', help='') + parser.add_option('--extend_contig', action='store_true', dest='extend_contig', help='') + parser.add_option('--reference_genome', dest='reference_genome', help='') + parser.add_option('--contigs_refined', dest='contigs_refined', help='') + parser.add_option('--logprep', dest='logprep', help='') + parser.add_option('--logseq', dest='logseq', help='') + parser.add_option('--logfile_prep', dest='logfile_prep', help='') + parser.add_option('--logfile_seq', dest='logfile_seq', help='') + (options, args) = parser.parse_args() + if len(args) > 0: + parser.error('Wrong number of arguments') + + prep_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # prep.pl dies if the directory already exists, so just define a unique name + seq_directory = os.path.join(tempfile.gettempdir(), str(uuid.uuid4())) # SEQuel.jar would work in another directory (with '_1' suffix) if the directory already exists, so just define a unique name + + # Build SEQuel (pre-processing) command to be executed + # basic preprocessing + prep_input = "-r1 %s -r2 %s -c %s" % (options.read1, options.read2, options.contigs) + bases_length = "-l %d" % (options.bases_length) if options.bases_length is not None else '' + n_threads = "-t %d" % (options.n_threads) if options.n_threads is not None else '' + cmd_prep_dir = "-o %s" % (prep_directory) + cmd_seq_dir = "-o %s" % (seq_directory) + # -i INT (external insert size of paired-end reads (from prep.log) + # -p INT + max_threads = "-p %d" % (options.max_threads) if options.max_threads is not None else '' + # -u INT + min_threads = "-u %d" % (options.min_threads) if options.min_threads is not None else '' + # -A DIR + input_directory = "-A %s" % (prep_directory) + # -k INT + kmer_size = "-k %d" % (options.kmer_size) if options.kmer_size is not None else '' + # -d INT + max_positional_error = "-d %d" % (options.max_positional_error) if options.max_positional_error is not None else '' + # -f FLOAT + min_fraction = "-f %s" % (options.min_fraction) if options.min_fraction is not None else '' + # -l INT + min_aln_length = "-l %d" % (options.min_aln_length) if options.min_aln_length is not None else '' + # -v FLOAT + min_avg_coverage = "-v %s" % (options.min_avg_coverage) if options.min_avg_coverage is not None else '' + # -m INT + discard_kmers = "-m %d" % (options.discard_kmers) if options.discard_kmers is not None else '' + # -n INT + discard_positional = "-n %d" % (options.discard_positional) if options.discard_positional is not None else '' + # -q INT + min_aln_score = "-q %d" % (options.min_aln_score) if options.min_aln_score is not None else '' + # -s + single_cell_mode = '-s' if options.single_cell_mode else '' + # -r + report_changes = '-r' if options.report_changes else '' + # -e + extend_contig = '-e' if options.extend_contig else '' + # -g FILE + reference_genome = "-g %s" % (options.reference_genome) if options.reference_genome else '' + # contigs_refined.fa + contigs_refined = options.contigs_refined + # logprep & logseq + logprep = options.logprep + logseq = options.logseq + # x.refined-fa + # x.stdout + # logfile + logfile_prep = options.logfile_prep + logfile_seq = options.logfile_seq + + # Build SEQuel (pre-processing) command + cmd1 = ' '.join(['prep.pl', prep_input, bases_length, n_threads, cmd_prep_dir]) + print '\nSEQuel (pre-processing) command to be executed:\n %s' % (cmd1) + + # Execution of SEQuel (pre-processing) + print 'Executing SEQuel (pre-processing)...' + logfile_prep_file = open(logfile_prep, 'w') if logfile_prep else sys.stdout + try: + try: + subprocess.check_call(cmd1, stdout=logfile_prep_file, stderr=subprocess.STDOUT, shell=True) + finally: + if logfile_prep_file != sys.stdout: + logfile_prep_file.close() + print 'SEQuel (pre-processing) executed!' + + prep_file = os.path.join(prep_directory, 'prep.log') + ext_ins_size = external_insert_size(prep_file) + + # Build SEQuel command + cmd2 = 'java -Xmx12g -jar %s %s -i %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s' % (os.path.join(options.sequel_jar_path, 'SEQuel.jar'), input_directory, ext_ins_size, max_threads, min_threads, kmer_size, max_positional_error, min_fraction, min_aln_length, min_avg_coverage, discard_kmers, discard_positional, min_aln_score, single_cell_mode, report_changes, extend_contig, reference_genome, cmd_seq_dir) + print '\nSEQuel command to be executed:\n %s' % (cmd2) + + # Execution of SEQuel + print 'Executing SEQuel...' + logfile_seq_file = open(logfile_seq, 'w') if logfile_seq else sys.stdout + try: + subprocess.check_call(cmd2, stdout=logfile_seq_file, stderr=subprocess.STDOUT, shell=True) + except subprocess.CalledProcessError, e: + if e.retcode == 24: + sys.exit("Too many contigs in the assembly!") + else: + raise + finally: + if logfile_seq_file != sys.stdout: + logfile_seq_file.close() + print 'SEQuel executed!' + + shutil.move(prep_file, logprep) + shutil.move(os.path.join(seq_directory, 'Log'), logseq) + shutil.move(os.path.join(seq_directory, 'contigs_refined.fa'), contigs_refined) + finally: + shutil.rmtree(prep_directory, ignore_errors=True) + shutil.rmtree(seq_directory, ignore_errors=True) + + +if __name__ == "__main__": + __main__() diff -r 000000000000 -r 33f282b9dafe sequel_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequel_wrapper.xml Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,134 @@ + + + + bwa + blat + sequel + + + sequel_wrapper.py -t \${GALAXY_SLOTS:-8} -p \${GALAXY_SLOTS:-8} -u 1 + --sequel_jar_path=\$SEQUEL_JAR_PATH --read1=$read1 --read2=$read2 --contigs=$contigs + #if str($bases_length) + --bases_length=$bases_length + #end if + #if str($kmer_size) + --kmer_size=$kmer_size + #end if + #if str($max_positional_error) + --max_positional_error=$max_positional_error + #end if + #if str($min_fraction) + --min_fraction=$min_fraction + #end if + #if str($min_aln_length) + --min_aln_length=$min_aln_length + #end if + #if str($min_avg_coverage) + --min_avg_coverage=$min_avg_coverage + #end if + #if str($discard_kmers) + --discard_kmers=$discard_kmers + #end if + #if str($discard_positional) + --discard_positional=$discard_positional + #end if + #if str($min_aln_score) + --min_aln_score=$min_aln_score + #end if + #if $single_cell_mode + --single_cell_mode + #end if + #if $report_changes + --report_changes + #end if + #if $extend_contig + --extend_contig + #end if + #if $reference_genome + --reference_genome=$reference_genome + #end if + --contigs_refined=$contigs_refined + --logprep=$logprep + --logseq=$logseq + --logfile_prep=$logfile_prep + --logfile_seq=$logfile_seq + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +SEQuel is a tool for correcting errors (i.e., insertions, deletions, and substitutions) in contigs output from assembly. While assemblies of next generation sequencing (NGS) data are accurate, they still contain a substantial number of errors that need to be corrected after the assembly process. The algorithm behind SEQuel makes use of a graph structure called the positional de Bruijn graph, which models k-mers within reads while incorporating their approximate positions into the model. + +SEQuel substantially reduces the number of small insertions, deletions and substitutions errors in assemblies of both standard (multi-cell) and single-cell sequencing data. SEQuel was tested mainly on Illumina sequence data, in combination with multiple NGS assemblers, such as Euler-SR, Velvet, SoapDeNovo, ALLPATHS and SPAdes. + +**Known issues** + +.. class:: warningmark + +During the pre-processing stage, a SAM file per contig is created. Due to runtime considerations, these files are kept open simultaneously. The program will crash when the number of contigs in the assembly is too high. + +**License and citation** + +This Galaxy tool is Copyright © 2013-2014 `CRS4 Srl.`_ and is released under the `MIT license`_. + +.. _CRS4 Srl.: http://www.crs4.it/ +.. _MIT license: http://opensource.org/licenses/MIT + +You can use this tool only if you agree to the license terms of: `SEQuel`_. + +.. _SEQuel: http://bix.ucsd.edu/SEQuel/ + +If you use this tool, please cite: + +- |Cuccuru2014|_ +- |Ronen2012|_. + +.. |Cuccuru2014| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2014) Orione, a web-based framework for NGS analysis in microbiology. *Bioinformatics* 30(13), 1928-1929 +.. _Cuccuru2014: http://bioinformatics.oxfordjournals.org/content/30/13/1928 +.. |Ronen2012| replace:: Ronen R., *et al.* (2012) SEQuel: improving the accuracy of genome assemblies. *Bioinformatics* 28 (12), i188-i196 +.. _Ronen2012: http://bioinformatics.oxfordjournals.org/content/28/12/i188 + + diff -r 000000000000 -r 33f282b9dafe tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Fri Jul 18 07:07:31 2014 -0400 @@ -0,0 +1,29 @@ + + + + + + + + + + + + http://bix.ucsd.edu/SEQuel/download/SEQuel.v102.tar.gz + + . + $INSTALL_DIR + + + $INSTALL_DIR + + + $INSTALL_DIR + + + + +Configuration: Previously (until Release 0), the SEQUEL_SITE_OPTIONS variable in the installed env.sh file was used to adjust the number of threads to use in BWA alignment (-t) or the maximum number of threads for SEQuel (-p) or the minimum number of threads for SEQuel (-u). This is not used anymore and may be removed. + + +