# HG changeset patch
# User peterjc
# Date 1381480125 14400
# Node ID df86ed992a1bea4faf94f17a8d429cbfa4fce532
# Parent c7538ae82a249ec5bcab1dfa7f86291a54bec5de
Uploaded preview 4, lots of work on mapping
diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_contigs.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tvc_contigs.fasta Fri Oct 11 04:28:45 2013 -0400
@@ -0,0 +1,20 @@
+>mira_c1
+ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct
+gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct
+tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa
+catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag
+cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac
+ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa
+ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg
+tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc
+ggccgctcgaaa
+>mira_c2
+tttcgagcggycgcccggscgaggtaccctscaccatgaaaccaggcttgggtccctcwg
+gctgyctcttggtgctgataatcttwccytgtgccttkgcctcagccttcaacttatcrt
+tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac
+gaacatgaatcctcttccttatgattctgtttccaacctgyttgttgacctcaacaccaa
+cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca
+tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac
+gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatccc
+tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg
+ggcggccgctcgaaa
diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_map_ref_strain.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tvc_map_ref_strain.fasta Fri Oct 11 04:28:45 2013 -0400
@@ -0,0 +1,20 @@
+>mira_c1_bb
+ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct
+gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct
+tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa
+catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag
+cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac
+ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa
+ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg
+tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc
+ggccgctcgaaa
+>mira_c2_bb
+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxaccatgaaaccaggcttgggtccctctg
+gctgtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgt
+tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac
+gaacatgaatcctcttccttatgatyctgttwccaacctgyttgttgacctcaacaccaa
+cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca
+tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac
+gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatcyc
+tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg
+ggcggccgctcgaaa
diff -r c7538ae82a24 -r df86ed992a1b test-data/tvc_map_same_strain.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tvc_map_same_strain.fasta Fri Oct 11 04:28:45 2013 -0400
@@ -0,0 +1,20 @@
+>mira_c1_bb
+ttagcgtggtcgcggccgaggtaccctctaccatgaaaccaggcttgggtccctctggct
+gtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgttct
+tcttgatcctctccatgatctcctcatggcaccttgatggctggacatgttccacacgaa
+catgaatcctcttccttatgattctgtttccaacctgcttgttgacctcaacaccaacag
+cacgcttggtgacgttccatacccgacccgtgcggccatgatagaacttgtggggcatac
+ctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatacgaa
+ggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatctctgg
+tgcgcgaccccaaaccgtgacccgccggcattttgcggtgtttttcagacctgcccgggc
+ggccgctcgaaa
+>mira_c2_bb
+tttcgagcggncgcccggncgaggtaccctncaccatgaaaccaggcttgggtccctctg
+gctgtctcttggtgctgataatcttaccttgtgccttggcctcagccttcaacttatcgt
+tcttcttgatcctctccattatctcctcatggcamckagatggctggacatgttccacac
+gaacatgaatcctcttccttatgattctgtttccaacctgyttgttgacctcaacaccaa
+cagcgcgcttggtgacgttccatacccgacccgtgcggccatggtagaacttgtggggca
+tacctttgtggatcgacccgttaaccttgacatcaacatagtcgccgactttgaagatac
+gaaggtaagttgtgagatgggtaggacccttcttcctgaatgcccgagcaaatagatccc
+tggtgcgtgacctcaaaccgtgacccgccggcattttgaggtgtttttcagacctgcccg
+ggcggccgctcgaaa
diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/README.rst
--- a/tools/mira4/README.rst Thu Sep 26 12:30:08 2013 -0400
+++ b/tools/mira4/README.rst Fri Oct 11 04:28:45 2013 -0400
@@ -37,7 +37,7 @@
* http://toolshed.g2.bx.psu.edu/view/peterjc/mira_datatypes
-There are just two Galaxy files to install:
+There are just three Galaxy files to install:
* mira4.py (the Python script)
* mira4_de_novo.xml (the Galaxy tool definition for de novo usage)
@@ -50,7 +50,7 @@
-You will also need to install MIRA, we used version 4.0 RC2. See:
+You will also need to install MIRA; we used version 4.0 RC3. See:
* http://chevreux.org/projects_mira.html
* http://sourceforge.net/projects/mira-assembler/
@@ -65,7 +65,7 @@
======= ======================================================================
Version Changes
------- ----------------------------------------------------------------------
-v0.0.1 - Initial version (prototype using MIRA 4.0 RC2, and wrapper for v3.4)
+v0.0.1 - Initial version (prototype for MIRA 4.0 RC3, based on wrapper for v3.4)
======= ======================================================================
@@ -73,12 +73,12 @@
==========
Development is on a dedicated GitHub repository:
-https://github.com/peterjc/pico_galaxy/tree/master/tools/mira_4_0
+https://github.com/peterjc/pico_galaxy/tree/master/tools/mira4
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
the following command from the Galaxy root folder::
- $ tar -czf mira4_wrapper.tar.gz tools/mira4/README.rst tools/mira4/mira4_de_novo.xml tools/mira4/mira4_mapping.xml tools/mira4/mira4.py tools/mira4/tool_dependencies.xml test-data/tvc_mini.fastq test-data/tvc_contigs_mira4.fasta
+ $ tar -czf mira4_wrapper.tar.gz tools/mira4/README.rst tools/mira4/mira4_de_novo.xml tools/mira4/mira4_mapping.xml tools/mira4/mira4.py tools/mira4/tool_dependencies.xml test-data/tvc_mini.fastq test-data/tvc_contigs.fasta test-data/tvc_map_ref_strain.fasta test-data/tvc_map_same_strain.fasta
Check this worked::
@@ -89,7 +89,9 @@
tools/mira4/mira4.py
tools/mira4/tool_dependencies.xml
test-data/tvc_mini.fastq
- test-data/tvc_contigs_mira4.fasta
+ test-data/tvc_contigs.fasta
+ test-data/tvc_map_ref_strain.fasta
+ test-data/tvc_map_same_strain.fasta
Licence (MIT)
diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4.py
--- a/tools/mira4/mira4.py Thu Sep 26 12:30:08 2013 -0400
+++ b/tools/mira4/mira4.py Fri Oct 11 04:28:45 2013 -0400
@@ -31,7 +31,7 @@
return ver.split("\n", 1)[0]
-os.environ["PATH"] = "/mnt/galaxy/downloads/mira_4.0rc2_linux-gnu_x86_64_static/bin/:%s" % os.environ["PATH"]
+os.environ["PATH"] = "/mnt/galaxy/downloads/mira_4.0rc3_linux-gnu_x86_64_static/bin/:%s" % os.environ["PATH"]
mira_binary = "mira"
mira_ver = get_version(mira_binary)
if not mira_ver.strip().startswith("4.0"):
@@ -51,41 +51,7 @@
sys.stderr.write("\n%s\nEnd of manifest\n%s\n" % ("="*60, "="*60))
-def massage_symlinks(manifest):
- """Create FASTQ aliases and edit the manifest to use them.
-
- Short term measure for MIRA 4.0RC2 which depends on data file
- extensions to decide the file format, and doesn't like *.dat
- as used in Galaxy.
- """
- base = os.path.split(manifest)[0]
- with open(manifest) as h:
- lines = h.readlines()
- f = 0
- for i, line in enumerate(lines):
- if not line.startswith("data ="):
- continue
- #Assumes no spaces in filename, would they have to be escaped?
- new_line = "data ="
- for filename in line[6:].strip().split():
- if not filename:
- continue
- assert os.path.isfile(filename), filename
- f += 1
- alias = os.path.join(base, "input%i.fastq" % f)
- new_line += " " + alias
- cmd = "ln -s %s %s" % (filename, alias)
- if os.system(cmd):
- stop_err("Problem creating FASTQ alias:\n%s" % cmd)
- lines[i] = new_line + "\n"
- with open(manifest, "w") as h:
- for line in lines:
- #sys.stderr.write(line)
- h.write(line)
- return True
-
-
-def collect_output(temp, name):
+def collect_output(temp, name, handle):
n3 = (temp, name, name, name)
f = "%s/%s_assembly/%s_d_results" % (temp, name, name)
if not os.path.isdir(f):
@@ -95,16 +61,34 @@
log_manifest(manifest)
stop_err("Empty output folder")
missing = []
- for old, new in [("%s/%s_out.maf" % (f, name), out_maf),
- ("%s/%s_out.unpadded.fasta" % (f, name), out_fasta)]:
+
+ old_maf = "%s/%s_out.maf" % (f, name)
+ if not os.path.isfile(old_maf):
+ #Triggered extractLargeContigs.sh?
+ old_maf = "%s/%s_LargeContigs_out.maf" % (f, name)
+
+ #De novo or single strain mapping,
+ old_fasta = "%s/%s_out.unpadded.fasta" % (f, name)
+ if not os.path.isfile(old_fasta):
+ #Mapping (currently StrainX versus reference)
+ old_fasta = "%s/%s_out_StrainX.unpadded.fasta" % (f, name)
+ if not os.path.isfile(old_fasta):
+ #Triggered extractLargeContigs.sh?
+ old_fasta = "%s/%s_LargeContigs_out.fasta" % (f, name)
+
+ missing = False
+ for old, new in [(old_maf, out_maf),
+ (old_fasta, out_fasta)]:
if not os.path.isfile(old):
- missing.append(os.path.splitext(old)[-1])
+ missing = True
else:
+ handle.write("Capturing %s\n" % old)
shutil.move(old, new)
if missing:
log_manifest(manifest)
- sys.stderr.write("Contents of %r: %r\n" % (f, os.listdir(f)))
- stop_err("Missing output files: %s" % ", ".join(missing))
+ sys.stderr.write("Contents of %r:\n" % f)
+ for filename in sorted(os.listdir(f)):
+ sys.stderr.write("%s\n" % filename)
def clean_up(temp, name):
folder = "%s/%s_assembly" % (temp, name)
@@ -119,9 +103,6 @@
name = "MIRA"
manifest, out_maf, out_fasta, out_log = sys.argv[1:5]
-#Hack until MIRA v4 lets us specify file format explicitly,
-massage_symlinks(manifest)
-
start_time = time.time()
#cmd_list =sys.argv[8:]
cmd_list = [mira_binary, manifest]
@@ -142,6 +123,15 @@
#print cmd
handle = open(out_log, "w")
+handle.write("======================== MIRA manifest (instructions) ========================\n")
+m = open(manifest, "rU")
+for line in m:
+ handle.write(line)
+m.close()
+del m
+handle.write("\n")
+handle.write("============================ Starting MIRA now ===============================\n")
+handle.flush()
try:
#Run MIRA
child = subprocess.Popen(cmd_list,
@@ -159,8 +149,10 @@
assert not stdout and not stderr #Should be empty as sent to handle
run_time = time.time() - start_time
return_code = child.returncode
-handle.write("\n\nMIRA took %0.2f minutes\n" % (run_time / 60.0))
-print "MIRA took %0.2f minutes" % (run_time / 60.0)
+handle.write("\n")
+handle.write("============================ MIRA has finished ===============================\n")
+handle.write("MIRA took %0.2f hours\n" % (run_time / 3600.0))
+print "MIRA took %0.2f hours" % (run_time / 3600.0)
if return_code:
handle.write("Return error code %i from command:\n" % return_code)
handle.write(cmd + "\n")
@@ -169,12 +161,30 @@
log_manifest(manifest)
stop_err("Return error code %i from command:\n%s" % (return_code, cmd),
return_code)
-handle.close()
+handle.flush()
+
+if os.path.isfile("MIRA_assembly/MIRA_d_results/ec.log"):
+ handle.write("\n")
+ handle.write("====================== Extract Large Contigs failed ==========================\n")
+ e = open("MIRA_assembly/MIRA_d_results/ec.log", "rU")
+ for line in e:
+ handle.write(line)
+ e.close()
+ handle.write("============================ (end of ec.log) =================================\n")
+ handle.flush()
#print "Collecting output..."
-collect_output(temp, name)
+collect_output(temp, name, handle)
+
+if os.path.isfile("MIRA_assembly/MIRA_d_results/ec.log"):
+ #Treat as an error, but doing this AFTER collect_output
+ sys.stderr.write("Extract Large Contigs failed\n")
+ handle.write("Extract Large Contigs failed\n")
+ handle.close()
+ sys.exit(1)
#print "Cleaning up..."
clean_up(temp, name)
-print "Done"
+handle.close()
+print("Done")
diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4_de_novo.xml
--- a/tools/mira4/mira4_de_novo.xml Thu Sep 26 12:30:08 2013 -0400
+++ b/tools/mira4/mira4_de_novo.xml Fri Oct 11 04:28:45 2013 -0400
@@ -1,5 +1,5 @@
- Takes Sanger, Roche, Illumina, Ion Torrent and PacBio data
+ Takes Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio readsBiomira
@@ -19,7 +19,7 @@
-
+
@@ -29,15 +29,14 @@
-
-
-
+
-
-
-
+
+
+
@@ -55,16 +54,26 @@
## point to a local hard drive (not something like NFS on network).
#for $rg in $read_group
-#=======================================================
+
+##This bar goes into the manifest as a comment line
+#------------------------------------------------------------------------------
+
readgroup
technology = ${rg.technology}
##MIRA will accept multiple filenames on one data line, or multiple data lines
-#for f in $rg.reads
-data = ${f.filename}
+#for $f in $rg.filenames
+##Must now map Galaxy datatypes to MIRA file types...
+#if $f.ext.startswith("fastq")
+##MIRA doesn't like fastqsanger etc, just plain old fastq:
+data = fastq::$f
+#elif $f.ext == "mira"
+##We're calling *.maf the "mira" format in Galaxy (name space collision)
+data = maf::$f
+#else
+##MIRA is happy with names such as fasta as the file type, so pass the extension through:
+data = ${f.ext}::$f
+#end if
#end for
-### Cheetah doesn't want dollar sign on list comprehension intermediate variables
-###set $files = ' '.join([str(f['filename']) for f in rg['reads']])
-##data = $files
#end for
diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/mira4_mapping.xml
--- a/tools/mira4/mira4_mapping.xml Thu Sep 26 12:30:08 2013 -0400
+++ b/tools/mira4/mira4_mapping.xml Fri Oct 11 04:28:45 2013 -0400
@@ -1,5 +1,5 @@
- Takes Sanger, Roche, Illumina, Ion Torrent and PacBio data
+ Maps Sanger, Roche 454, Solexa/Illumina, Ion Torrent and PacBio readsBiomira
@@ -18,8 +18,18 @@
+
+
+
+
+
+
+
+
+
-
+
@@ -27,17 +37,15 @@
-
-
-
-
+
-
-
-
+
+
+
@@ -54,21 +62,80 @@
## -DI:trt is short for -DIRECTORY:tmp_redirected_to and should
## point to a local hard drive (not something like NFS on network).
+##This bar goes into the manifest as a comment line
+#------------------------------------------------------------------------------
+
+readgroup
+is_reference
+#if str($strain_setup)=="same"
+strain = StrainX
+#end if
+#for $f in $references
+##Must now map Galaxy datatypes to MIRA file types...
+#if $f.ext.startswith("fastq")
+##MIRA doesn't like fastqsanger etc, just plain old fastq:
+data = fastq::$f
+#elif $f.ext == "mira"
+##We're calling *.maf the "mira" format in Galaxy (name space collision)
+data = maf::$f
+#elif $f.ext == "fasta"
+##We're calling MIRA with the file type as "fna" as otherwise it wants quals
+data = fna::$f
+#else
+##Currently don't expect anything else...
+data = ${f.ext}::$f
+#end if
+#end for
#for $rg in $read_group
-#=======================================================
+
+##This bar goes into the manifest as a comment line
+#------------------------------------------------------------------------------
+
readgroup
technology = ${rg.technology}
+#if str($strain_setup)=="same"
+##This is perhaps redundant as MIRA defaults to StrainX for the reads:
+strain = StrainX
+#end if
##MIRA will accept multiple filenames on one data line, or multiple data lines
-#for f in $rg.reads
-data = ${f.filename}
+#for $f in $rg.filenames
+##Must now map Galaxy datatypes to MIRA file types...
+#if $f.ext.startswith("fastq")
+##MIRA doesn't like fastqsanger etc, just plain old fastq:
+data = fastq::$f
+#elif $f.ext == "mira"
+##We're calling *.maf the "mira" format in Galaxy (name space collision)
+data = maf::$f
+#else
+##Currently don't expect anything else...
+data = ${f.ext}::$f
+#end if
#end for
-### Cheetah doesn't want dollar sign on list comprehension intermediate variables
-###set $files = ' '.join([str(f['filename']) for f in rg['reads']])
-##data = $files
#end for
+
+
diff -r c7538ae82a24 -r df86ed992a1b tools/mira4/tool_dependencies.xml
--- a/tools/mira4/tool_dependencies.xml Thu Sep 26 12:30:08 2013 -0400
+++ b/tools/mira4/tool_dependencies.xml Fri Oct 11 04:28:45 2013 -0400
@@ -3,10 +3,9 @@
-
- https://downloads.sourceforge.net/project/mira-assembler/MIRA/stable/mira_4.0rc2_linux-gnu_x86_64_static.tar.bz2?r=&ts=1380039004&use_mirror=kent
+ https://downloads.sourceforge.net/project/mira-assembler/MIRA/stable/mira_4.0rc3_linux-gnu_x86_64_static.tar.bz2
- mira_4.0rc2_linux-gnu_x86_64_static/bin
+ mira_4.0rc3_linux-gnu_x86_64_static/bin$INSTALL_DIR
@@ -17,7 +16,7 @@
Downloads MIRA v4.0 from Sourceforge, requesting Bastien's precompiled binaries
for 64bit Linux (x86_64). He also has binaries for Mac OS X, which we could
-use once the Galaxy installation framework allow that kind of flexibility.
+use once the extensions to allow that are in the stable Galaxy releases.
http://chevreux.org/projects_mira.html
http://sourceforge.net/projects/mira-assembler/