# HG changeset patch # User peterjc # Date 1381480125 14400 # Node ID df86ed992a1bea4faf94f17a8d429cbfa4fce532 # Parent c7538ae82a249ec5bcab1dfa7f86291a54bec5de Uploaded preview 4, lots of work on mapping diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_contigs.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tvc_contigs.fasta Fri Oct 11 04:28:45 2013 -0400 @@ -0,0 +1,20 @@ +>mira_c1 +ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct +gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct +tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa +catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag +cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac +ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa +ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg +tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc +ggccgctcgaaa +>mira_c2 +tttcgagcggycgcccggscgaggtaccctscaccatgaaaccaggcttgggtccctcwg +gctgyctcttggtgctgataatcttwccytgtgccttkgcctcagccttcaacttatcrt +tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac +gaacatgaatcctcttccttatgattctgtttccaacctgyttgttgacctcaacaccaa +cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca +tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac +gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatccc +tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg +ggcggccgctcgaaa diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_map_ref_strain.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tvc_map_ref_strain.fasta Fri Oct 11 04:28:45 2013 -0400 @@ -0,0 +1,20 @@ +>mira_c1_bb +ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct +gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct +tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa +catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag +cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac +ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa +ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg +tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc +ggccgctcgaaa +>mira_c2_bb +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxaccatgaaaccaggcttgggtccctctg +gctgtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgt +tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac +gaacatgaatcctcttccttatgatyctgttwccaacctgyttgttgacctcaacaccaa +cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca +tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac +gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatcyc +tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg +ggcggccgctcgaaa diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_map_same_strain.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tvc_map_same_strain.fasta Fri Oct 11 04:28:45 2013 -0400 @@ -0,0 +1,20 @@ +>mira_c1_bb +ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct +gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct +tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa +catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag +cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac +ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa +ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg +tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc +ggccgctcgaaa +>mira_c2_bb +tttcgagcggncgcccggncgaggtaccctncaccatgaaaccaggcttgggtccctctg +gctgtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgt +tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac +gaacatgaatcctcttccttatgattctgtttccaacctgyttgttgacctcaacaccaa +cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca +tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac +gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatccc +tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg +ggcggccgctcgaaa diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/README.rst --- a/tools/mira4/README.rst Thu Sep 26 12:30:08 2013 -0400 +++ b/tools/mira4/README.rst Fri Oct 11 04:28:45 2013 -0400 @@ -37,7 +37,7 @@ * http://toolshed.g2.bx.psu.edu/view/peterjc/mira_datatypes -There are just two Galaxy files to install: +There are just three Galaxy files to install: * mira4.py (the Python script) * mira4_de_novo.xml (the Galaxy tool definition for de novo usage) @@ -50,7 +50,7 @@ -You will also need to install MIRA, we used version 4.0 RC2. See: +You will also need to install MIRA, we used version 4.0 RC3. See: * http://chevreux.org/projects_mira.html * http://sourceforge.net/projects/mira-assembler/ @@ -65,7 +65,7 @@ ======= ====================================================================== Version Changes ------- ---------------------------------------------------------------------- -v0.0.1 - Initial version (prototype using MIRA 4.0 RC2, and wrapper for v3.4) +v0.0.1 - Initial version (prototype for MIRA 4.0 RC3, based on wrapper for v3.4) ======= ====================================================================== @@ -73,12 +73,12 @@ ========== Development is on a dedicated GitHub repository: -https://github.com/peterjc/pico_galaxy/tree/master/tools/mira_4_0 +https://github.com/peterjc/pico_galaxy/tree/master/tools/mira4 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf mira4_wrapper.tar.gz tools/mira4/README.rst tools/mira4/mira4_de_novo.xml tools/mira4/mira4_mapping.xml tools/mira4/mira4.py tools/mira4/tool_dependencies.xml test-data/tvc_mini.fastq test-data/tvc_contigs_mira4.fasta + $ tar -czf mira4_wrapper.tar.gz tools/mira4/README.rst tools/mira4/mira4_de_novo.xml tools/mira4/mira4_mapping.xml tools/mira4/mira4.py tools/mira4/tool_dependencies.xml test-data/tvc_mini.fastq test-data/tvc_contigs.fasta test-data/tvc_map_ref_strain.fasta test-data/tvc_map_same_strain.fasta Check this worked:: @@ -89,7 +89,9 @@ tools/mira4/mira4.py tools/mira4/tool_dependencies.xml test-data/tvc_mini.fastq - test-data/tvc_contigs_mira4.fasta + test-data/tvc_contigs.fasta + test-data/tvc_map_ref_strain.fasta + test-data/tvc_map_same_strain.fasta Licence (MIT) diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4.py --- a/tools/mira4/mira4.py Thu Sep 26 12:30:08 2013 -0400 +++ b/tools/mira4/mira4.py Fri Oct 11 04:28:45 2013 -0400 @@ -31,7 +31,7 @@ return ver.split("\n", 1)[0] -os.environ["PATH"] = "/mnt/galaxy/downloads/mira_4.0rc2_linux-gnu_x86_64_static/bin/:%s" % os.environ["PATH"] +os.environ["PATH"] = "/mnt/galaxy/downloads/mira_4.0rc3_linux-gnu_x86_64_static/bin/:%s" % os.environ["PATH"] mira_binary = "mira" mira_ver = get_version(mira_binary) if not mira_ver.strip().startswith("4.0"): @@ -51,41 +51,7 @@ sys.stderr.write("\n%s\nEnd of manifest\n%s\n" % ("="*60, "="*60)) -def massage_symlinks(manifest): - """Create FASTQ aliases and edit the manifest to use them. - - Short term measure for MIRA 4.0RC2 which depends on data file - extensions to decide the file format, and doesn't like *.dat - as used in Galaxy. - """ - base = os.path.split(manifest)[0] - with open(manifest) as h: - lines = h.readlines() - f = 0 - for i, line in enumerate(lines): - if not line.startswith("data ="): - continue - #Assumes no spaces in filename, would they have to be escaped? - new_line = "data =" - for filename in line[6:].strip().split(): - if not filename: - continue - assert os.path.isfile(filename), filename - f += 1 - alias = os.path.join(base, "input%i.fastq" % f) - new_line += " " + alias - cmd = "ln -s %s %s" % (filename, alias) - if os.system(cmd): - stop_err("Problem creating FASTQ alias:\n%s" % cmd) - lines[i] = new_line + "\n" - with open(manifest, "w") as h: - for line in lines: - #sys.stderr.write(line) - h.write(line) - return True - - -def collect_output(temp, name): +def collect_output(temp, name, handle): n3 = (temp, name, name, name) f = "%s/%s_assembly/%s_d_results" % (temp, name, name) if not os.path.isdir(f): @@ -95,16 +61,34 @@ log_manifest(manifest) stop_err("Empty output folder") missing = [] - for old, new in [("%s/%s_out.maf" % (f, name), out_maf), - ("%s/%s_out.unpadded.fasta" % (f, name), out_fasta)]: + + old_maf = "%s/%s_out.maf" % (f, name) + if not os.path.isfile(old_maf): + #Triggered extractLargeContigs.sh? + old_maf = "%s/%s_LargeContigs_out.maf" % (f, name) + + #De novo or single strain mapping, + old_fasta = "%s/%s_out.unpadded.fasta" % (f, name) + if not os.path.isfile(old_fasta): + #Mapping (currently StrainX versus reference) + old_fasta = "%s/%s_out_StrainX.unpadded.fasta" % (f, name) + if not os.path.isfile(old_fasta): + #Triggered extractLargeContigs.sh? + old_fasta = "%s/%s_LargeContigs_out.fasta" % (f, name) + + missing = False + for old, new in [(old_maf, out_maf), + (old_fasta, out_fasta)]: if not os.path.isfile(old): - missing.append(os.path.splitext(old)[-1]) + missing = True else: + handle.write("Capturing %s\n" % old) shutil.move(old, new) if missing: log_manifest(manifest) - sys.stderr.write("Contents of %r: %r\n" % (f, os.listdir(f))) - stop_err("Missing output files: %s" % ", ".join(missing)) + sys.stderr.write("Contents of %r:\n" % f) + for filename in sorted(os.listdir(f)): + sys.stderr.write("%s\n" % filename) def clean_up(temp, name): folder = "%s/%s_assembly" % (temp, name) @@ -119,9 +103,6 @@ name = "MIRA" manifest, out_maf, out_fasta, out_log = sys.argv[1:5] -#Hack until MIRA v4 lets us specify file format explicitly, -massage_symlinks(manifest) - start_time = time.time() #cmd_list =sys.argv[8:] cmd_list = [mira_binary, manifest] @@ -142,6 +123,15 @@ #print cmd handle = open(out_log, "w") +handle.write("======================== MIRA manifest (instructions) ========================\n") +m = open(manifest, "rU") +for line in m: + handle.write(line) +m.close() +del m +handle.write("\n") +handle.write("============================ Starting MIRA now ===============================\n") +handle.flush() try: #Run MIRA child = subprocess.Popen(cmd_list, @@ -159,8 +149,10 @@ assert not stdout and not stderr #Should be empty as sent to handle run_time = time.time() - start_time return_code = child.returncode -handle.write("\n\nMIRA took %0.2f minutes\n" % (run_time / 60.0)) -print "MIRA took %0.2f minutes" % (run_time / 60.0) +handle.write("\n") +handle.write("============================ MIRA has finished ===============================\n") +handle.write("MIRA took %0.2f hours\n" % (run_time / 3600.0)) +print "MIRA took %0.2f hours" % (run_time / 3600.0) if return_code: handle.write("Return error code %i from command:\n" % return_code) handle.write(cmd + "\n") @@ -169,12 +161,30 @@ log_manifest(manifest) stop_err("Return error code %i from command:\n%s" % (return_code, cmd), return_code) -handle.close() +handle.flush() + +if os.path.isfile("MIRA_assembly/MIRA_d_results/ec.log"): + handle.write("\n") + handle.write("====================== Extract Large Contigs failed ==========================\n") + e = open("MIRA_assembly/MIRA_d_results/ec.log", "rU") + for line in e: + handle.write(line) + e.close() + handle.write("============================ (end of ec.log) =================================\n") + handle.flush() #print "Collecting output..." -collect_output(temp, name) +collect_output(temp, name, handle) + +if os.path.isfile("MIRA_assembly/MIRA_d_results/ec.log"): + #Treat as an error, but doing this AFTER collect_output + sys.stderr.write("Extract Large Contigs failed\n") + handle.write("Extract Large Contigs failed\n") + handle.close() + sys.exit(1) #print "Cleaning up..." clean_up(temp, name) -print "Done" +handle.close() +print("Done") diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4_de_novo.xml --- a/tools/mira4/mira4_de_novo.xml Thu Sep 26 12:30:08 2013 -0400 +++ b/tools/mira4/mira4_de_novo.xml Fri Oct 11 04:28:45 2013 -0400 @@ -1,5 +1,5 @@ - Takes Sanger, Roche, Illumina, Ion Torrent and PacBio data + Takes Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio reads Bio mira @@ -19,7 +19,7 @@ - + @@ -29,15 +29,14 @@ - - - + - - - + + + @@ -55,16 +54,26 @@ ## point to a local hard drive (not something like NFS on network). #for $rg in $read_group -#======================================================= + +##This bar goes into the manifest as a comment line +#------------------------------------------------------------------------------ + readgroup technology = ${rg.technology} ##MIRA will accept multiple filenames on one data line, or multiple data lines -#for f in $rg.reads -data = ${f.filename} +#for $f in $rg.filenames +##Must now map Galaxy datatypes to MIRA file types... +#if $f.ext.startswith("fastq") +##MIRA doesn't like fastqsanger etc, just plain old fastq: +data = fastq::$f +#elif $f.ext == "mira" +##We're calling *.maf the "mira" format in Galaxy (name space collision) +data = maf::$f +#else +##MIRA is happy with fasta as name, +data = ${f.ext}::$f +#end if #end for -### Cheetah doesn't want dollar sign on list comprehension intermediate variables -###set $files = ' '.join([str(f['filename']) for f in rg['reads']]) -##data = $files #end for diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4_mapping.xml --- a/tools/mira4/mira4_mapping.xml Thu Sep 26 12:30:08 2013 -0400 +++ b/tools/mira4/mira4_mapping.xml Fri Oct 11 04:28:45 2013 -0400 @@ -1,5 +1,5 @@ - Takes Sanger, Roche, Illumina, Ion Torrent and PacBio data + Maps Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio reads Bio mira @@ -18,8 +18,18 @@ + + + + + + + + + - + @@ -27,17 +37,15 @@ - - - - + - - - + + + @@ -54,21 +62,80 @@ ## -DI:trt is short for -DIRECTORY:tmp_redirected_to and should ## point to a local hard drive (not something like NFS on network). +##This bar goes into the manifest as a comment line +#------------------------------------------------------------------------------ + +readgroup +is_reference +#if str($strain_setup)=="same" +strain = StrainX +#end if +#for $f in $references +##Must now map Galaxy datatypes to MIRA file types... +#if $f.ext.startswith("fastq") +##MIRA doesn't like fastqsanger etc, just plain old fastq: +data = fastq::$f +#elif $f.ext == "mira" +##We're calling *.maf the "mira" format in Galaxy (name space collision) +data = maf::$f +#elif $f.ext == "fasta" +##We're calling MIRA with the file type as "fna" as otherwise it wants quals +data = fna::$f +#else +##Currently don't expect anything else... +data = ${f.ext}::$f +#end if +#end for #for $rg in $read_group -#======================================================= + +##This bar goes into the manifest as a comment line +#------------------------------------------------------------------------------ + readgroup technology = ${rg.technology} +#if str($strain_setup)=="same" +##This is perhaps redundant as MIRA defaults to StrainX for the reads: +strain = StrainX +#end if ##MIRA will accept multiple filenames on one data line, or multiple data lines -#for f in $rg.reads -data = ${f.filename} +#for $f in $rg.filenames +##Must now map Galaxy datatypes to MIRA file types... +#if $f.ext.startswith("fastq") +##MIRA doesn't like fastqsanger etc, just plain old fastq: +data = fastq::$f +#elif $f.ext == "mira" +##We're calling *.maf the "mira" format in Galaxy (name space collision) +data = maf::$f +#else +##Currently don't expect anything else... +data = ${f.ext}::$f +#end if #end for -### Cheetah doesn't want dollar sign on list comprehension intermediate variables -###set $files = ' '.join([str(f['filename']) for f in rg['reads']]) -##data = $files #end for + + diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/tool_dependencies.xml --- a/tools/mira4/tool_dependencies.xml Thu Sep 26 12:30:08 2013 -0400 +++ b/tools/mira4/tool_dependencies.xml Fri Oct 11 04:28:45 2013 -0400 @@ -3,10 +3,9 @@ - - https://downloads.sourceforge.net/project/mira-assembler/MIRA/stable/mira_4.0rc2_linux-gnu_x86_64_static.tar.bz2?r=&ts=1380039004&use_mirror=kent + https://downloads.sourceforge.net/project/mira-assembler/MIRA/stable/mira_4.0rc3_linux-gnu_x86_64_static.tar.bz2 - mira_4.0rc2_linux-gnu_x86_64_static/bin + mira_4.0rc3_linux-gnu_x86_64_static/bin $INSTALL_DIR @@ -17,7 +16,7 @@ Downloads MIRA v4.0 from Sourceforge, requesting Bastien's precompiled binaries for 64bit Linux (x86_64). He also has binaries for Mac OS X, which we could -use once the Galaxy installation framework allow that kind of flexibility. +use once the extensions to allow that are in the stable Galaxy releases. http://chevreux.org/projects_mira.html http://sourceforge.net/projects/mira-assembler/