changeset 0:a0c8173dbdc9 draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 10:59:29 -0400
parents
children 065a60688a90
files rmap_wrapper.py rmap_wrapper.xml test-data/rmap_wrapper_test1.bed test-data/rmap_wrapper_test1.fasta tool-data/faseq.loc.sample tool_dependencies.xml
diffstat 6 files changed, 234 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmap_wrapper.py	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err( msg ):
+    
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+
+def __main__():
+    
+    # I/O
+    target_path = sys.argv[1]
+    infile = sys.argv[2]
+    read_len = sys.argv[3]              # -w
+    align_len = sys.argv[4]             # -h
+    mismatch = sys.argv[5]              # -m
+    output_file = sys.argv[6]
+    
+    # first guess the read length
+    guess_read_len = 0
+    seq = ''
+    for i, line in enumerate(open(infile)):
+        line = line.rstrip('\r\n')
+        if line.startswith('>'):
+            if seq:
+                guess_read_len = len(seq)
+                break
+        else:
+            seq += line
+            
+    try: 
+        test = int(read_len)
+        if test == 0:
+            read_len = str(guess_read_len)
+        else:
+            assert test >= 20 and test <= 64
+    except:
+        stop_err('Invalid value for read length. Must be between 20 and 64.')
+    
+    try:
+        int(align_len)    
+    except:
+        stop_err('Invalid value for minimal length of a hit.')
+    
+    try:
+        int(mismatch)
+        #assert test >= 0 and test <= int(0.1*int(read_len))
+    except:
+        stop_err('Invalid value for mismatch numbers in an alignment.')
+    
+    all_files = []
+    if os.path.isdir(target_path):
+        
+        # check target genome
+        fa_files = os.listdir(target_path)
+            
+        for file in fa_files:
+            file = "%s/%s" % ( target_path, file )
+            file = os.path.normpath(file)
+            all_files.append(file)
+    else:
+        stop_err("No sequences for %s are available for search, please report this error." %(target_path))
+   
+    for detail_file_path in all_files:
+        output_tempfile = tempfile.NamedTemporaryFile().name
+        command = "rmap -h %s -w %s -m %s -c %s %s -o %s 2>&1" % ( align_len, read_len, mismatch, detail_file_path, infile, output_tempfile )
+        #print command
+        try:
+            os.system( command )
+        except Exception, e:
+            stop_err( str( e ) )
+
+        try:
+            os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) )
+        except Exception, e:
+            stop_err( str( e ) )
+        
+        try:
+            os.remove( output_tempfile )
+        except:
+            pass
+        
+        
+if __name__ == '__main__': __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmap_wrapper.xml	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,86 @@
+<tool id="rmap_wrapper" name="RMAP" version="1.0.0">
+    <description>for Solexa Short Reads Alignment</description>
+    <requirements>
+      <requirement type="package" version="2.05">rmap</requirement>
+    </requirements>
+    <command interpreter="python">
+    #if $trim.choice=="No":
+        rmap_wrapper.py $database $input_seq 0 $align_len $mismatch $output1
+    #else:
+        rmap_wrapper.py $database $input_seq $trim.read_len $align_len $mismatch $output1
+    #end if
+    </command>
+    <inputs>
+        <param name="database" type="select" display="radio" label="Target database">
+			<options from_file="faseq.loc">
+			  <column name="name" index="0"/>
+			  <column name="value" index="0"/>
+			</options>
+        </param>
+        <param name="input_seq" type="data" format="fasta" label="Sequence file"/>
+        <param name="align_len" type="integer" size="15" value="11" label="Minimal length of a hit (-h)" help="seed" />
+        <param name="mismatch" type="select" label="Number of mismatches allowed (-m)">
+            <option value="0">0</option>
+            <option value="1">1</option>
+            <option value="3">3</option>
+            <option value="5">5</option>
+        </param>
+        <conditional name="trim">
+            <param name="choice" type="select" label="To trim the reads">
+                <option value="No">No</option>
+                <option value="Yes">Yes</option>
+            </param>
+            <when value="No">
+            </when>
+            <when value="Yes">
+                <param name="read_len" type="integer" size="15" value="36" label="Read length (-w)"/> 
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="output1" format="bed"/>
+    </outputs>
+    <!--     
+    <tests>
+        <test>
+            <param name="database" value="/galaxy/data/faseq/test" />
+            <param name="input_seq" value="rmap_wrapper_test1.fasta" ftype="fasta"/>
+            <param name="read_len" value="36" />
+            <param name="align_len" value="36" />
+            <param name="mismatch" value="3" />
+            <output name="output1" file="rmap_wrapper_test1.bed"/> 
+        </test>
+    </tests>
+     -->
+    <help>
+    
+.. class:: warningmark
+
+ RMAP was developed for **Solexa** reads. 
+
+.. class:: infomark
+
+**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Reads length* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. 
+
+-----
+
+**What it does**
+
+This tool runs **rmap** (for more information, please see the reference below), mapping Solexa reads onto a genome build.   
+
+-----
+
+**Parameters**
+
+- *Minimal Length of a Hit* (**-h**) : this is the seed length or the minimal exact match length   
+- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment 
+- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end.
+
+-----
+
+**Reference**
+
+ **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rmap_wrapper_test1.bed	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,8 @@
+phix	360	396	seq1	1	-
+phix	4188	4224	seq2	1	+
+phix	4908	4944	seq4	0	-
+phix	2811	2847	seq5	2	+
+phix	3847	3883	seq6	0	-
+phix	91	127	seq7	0	+
+phix	2302	2338	seq8	2	+
+phix	2448	2484	seq9	0	+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rmap_wrapper_test1.fasta	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,20 @@
+>seq1
+GACTCATGATTTCTTACCTATTAGTGGTTGAACATC
+>seq2
+GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT
+>seq3
+GTTGTCGATAGAACTTCATGTGCCTGTAAAACAAGT
+>seq4
+ACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGC
+>seq5
+GTTTATGTTGGTTTCATGGTTTTGTCTAACTTTATC
+>seq6
+GCTTTACCGTCTTTCCAGAAATTGTTCCAAGTATCG
+>seq7
+GCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGC
+>seq8
+GTTATAACGCCGAAGCGGTAAAAATTTTTATTTTTT
+>seq9
+GTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCT
+>seq10
+GTGGCCTGTTGATTCTAAAGGTTAGTTTCTTCACGC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/faseq.loc.sample	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use genome fasta sequence files.  The faseq.loc file has this format 
+#(white space characters are TAB characters):
+#
+# <GenomeBuild> <dir>
+#
+# In the dir, each file is fasta format and contains only one sequence. So,
+#for example, if you had hg18 fasta sequences stored in /depot/data2/galaxy/faseq/hg18,
+#then your faseq.loc entry would look like this:
+#
+#hg18	/depot/data2/galaxy/faseq/hg18
+#
+#and your /depot/data2/galaxy/faseq/hg18 directory would contain all of 
+#your fasta sequence files (e.g.):
+#
+#-rw-r--r--  1 wychung galaxy 138082251 2008-04-16 11:57 chr10.fa
+#-rw-r--r--  1 wychung galaxy    115564 2008-04-16 11:57 chr10_random.fa
+#-rw-r--r--  1 wychung galaxy 137141451 2008-04-16 11:58 chr11.fa
+#...etc...
+#Your faseq.loc file should include an entry per line for each set of fasta 
+#sequence files you have stored.  For example:
+#
+#hg18			/depot/data2/galaxy/faseq/hg18
+#mm9			/depot/data2/galaxy/faseq/mm9
+#Arabidopsis	/depot/data2/galaxy/faseq/Arabidopsis
+#...etc...
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon May 19 10:59:29 2014 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="rmap" version="2.05">
+    <repository changeset_revision="19250a976d28" name="package_rmap_2_05" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>