Mercurial > repos > drosofff > msp_sr_bowtie_cascade

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieCascade.py	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+# small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation
+# version 0.9 13-6-2014
+# Usage sRbowtie_cascade.py see Parser() for valid arguments
+# Christophe Antoniewski <drosofff@gmail.com>
+
+import sys, os, subprocess, tempfile, shutil, argparse
+from collections import defaultdict
+
+def Parser():
+  the_parser = argparse.ArgumentParser()
+  the_parser.add_argument('--output', action="store", type=str, help="output file")
+  the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads")
+  the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed")
+  the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether the index should be generated or not by bowtie-buid")
+  the_parser.add_argument('--index',nargs='+', help="paths to indexed or fasta references")
+  the_parser.add_argument('--indexName',nargs='+', help="Names of the indexes")
+  the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
+  the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
+  args = the_parser.parse_args()
+  return args
+
+def stop_err( msg ):
+  sys.stderr.write( '%s\n' % msg )
+  sys.exit()
+
+def bowtie_squash(fasta):
+  tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes
+  ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
+  ref_file_name = ref_file.name
+  ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name
+  os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name
+  cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir)
+  try:
+    FNULL = open(os.devnull, 'w')
+    tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string
+    tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file
+    proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in  dev/null
+    returncode = proc.wait()
+    tmp_stderr.close()
+    FNULL.close()
+    sys.stdout.write(cmd1 + "\n")
+  except Exception, e:
+    # clean up temp dir
+    if os.path.exists( tmp_index_dir ):
+      shutil.rmtree( tmp_index_dir )
+      stop_err( 'Error indexing reference sequence\n' + str( e ) )
+  # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment()
+  index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention
+  return index_full_path
+
+def make_working_dir():
+  working_dir = tempfile.mkdtemp()
+  return working_dir
+
+def Clean_TempDir(directory):
+  if os.path.exists( directory ):
+    shutil.rmtree( directory )
+  return
+
+def bowtie_alignment(command_line="None", working_dir = ""):
+  FNULL = open(os.devnull, 'w')
+  p = subprocess.Popen(args=command_line, cwd=working_dir, shell=True, stderr=FNULL, stdout=FNULL)
+  returncode = p.wait()
+  sys.stdout.write("%s\n" % command_line)
+  FNULL.close()
+  #p = subprocess.Popen(["wc", "-l", "%s/al.fasta"%working_dir], cwd=working_dir, stdout=subprocess.PIPE)
+  #aligned =  p.communicate()[0].split()[0]
+  aligned = 0
+  try: # hacked at gcc2014 in case of no alignment, no al.fasta file generated (?)
+    F = open ("%s/al.fasta" % working_dir, "r")
+    for line in F:
+      aligned += 1
+    F.close()
+  except: pass
+  sys.stdout.write("Aligned: %s\n" % aligned)
+  return aligned/2
+
+def CommandLiner (v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""):
+  return "bowtie -v %s -k 1 --best -p %s --al %s/al.fasta --un %s/unal.fasta --suppress 1,2,3,4,5,6,7,8 %s -f %s" % (v_mis, pslots, working_dir, working_dir, index, input)
+
+def __main__():
+  args = Parser()
+  ## first we make all indexes available. They can be already available or be squashed by bowtie-build
+  ## we keep them in a list that alternates indexPath and "toClear" or "DoNotDelete"
+  BowtieIndexList = []
+  for indexing_flags, bowtiePath in zip (args.indexing_flags, args.index):
+    if indexing_flags == "history":
+      BowtieIndexList.append ( bowtie_squash (bowtiePath) )
+      BowtieIndexList.append ( "toClear" )
+    else:
+      BowtieIndexList.append ( bowtiePath )
+      BowtieIndexList.append ( "DoNotDelete")
+  ###### temporary Indexes are generated. They must be deleted at the end (after removing file name in the temp path)
+  ResultDict = defaultdict(list)
+  for label, input in zip(args.label, args.input): ## the main cascade, iterating over samples and bowtie indexes
+    workingDir = make_working_dir()
+    cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[0], input=input, working_dir=workingDir)
+    ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) # first step of the cascade
+    if len(BowtieIndexList) > 2: # is there a second step to perform ?
+      os.rename("%s/al.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) ## end of first step. the aligned reads are the input of the next step
+      cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[2], input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
+      ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )## second step of the cascade
+    if len(BowtieIndexList) > 4:  ## remaining steps
+      for BowtieIndexPath in BowtieIndexList[4::2]:
+        try:
+          os.unlink("%s/al.fasta" % workingDir) # hacked at gcc 2014, to remove previous al.fasta file that may interfere with counting if new al.fasta is empty
+        except: pass
+        os.rename("%s/unal.fasta"%workingDir, "%s/toAlign.fasta"%workingDir)
+        cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexPath, input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
+        ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )
+    Fun = open("%s/unal.fasta"%workingDir, "r") ## to finish, compute the number of unmatched reads
+    n = 0
+    for line in Fun:
+      n += 1
+    ResultDict[label].append(n/2)
+    Fun.close()
+    Clean_TempDir (workingDir) # clean the sample working directory
+  ## cleaning
+  for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]):
+    if IndexFlag == "toClear":
+      Clean_TempDir ("/".join(IndexPath.split("/")[:-1]))
+  ## end of cleaning
+
+
+
+  F = open (args.output, "w")
+  print >> F, "alignment reference\t%s" % "\t".join(args.label)
+  for i, reference in enumerate(args.indexName):
+    F.write ("%s" % reference)
+    for sample in args.label:
+      F.write ("\t%s" % "{:,}".format(ResultDict[sample][i]) )
+    print >> F
+  F.write ("Remaining Unmatched")
+  for sample in args.label:
+    F.write ("\t%s" % "{:,}".format(ResultDict[sample][-1]) )
+  print >> F
+
+  F.close()
+
+if __name__=="__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieCascade.xml	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,142 @@
+<tool id="sRbowtie_cascade" name="Annotate smRNA datasets" version="1.0.0">
+  <description>Using iterative sRbowtie Alignments</description>
+  <requirements>
+        <requirement type="package" version="0.12.7">bowtie</requirement>
+  </requirements>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python"> sRbowtieCascade.py --output $output
+                                                    --num-threads \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie
+                                                    --mismatch $mismatches
+                                                    --input
+						    #for $i in $input:
+						      $i
+						    #end for
+                                                   --label
+                                                    #for $i in $input:
+                                                      "$i.name"
+                                                    #end for
+                                                   --index
+                                                    #if $refGenomeSource1.genomeSource == "history":
+                                                      $refGenomeSource1.ownFile
+                                                    #else:
+                                                      $refGenomeSource1.index.fields.path
+                                                    #end if
+                                                    #for $i in $AdditionalQueries:
+						      #if $i.refGenomeSource.genomeSource == "history":
+						        $i.refGenomeSource.ownFile
+                                                      #else:
+                                                        $i.refGenomeSource.index.fields.path
+                                                      #end if
+                                                    #end for
+                                                   --indexing-flags
+						    $refGenomeSource1.genomeSource
+                                                    #for $i in $AdditionalQueries:
+						      $i.refGenomeSource.genomeSource
+                                                    #end for
+                                                   --indexName
+                                                    #if $refGenomeSource1.genomeSource == "history":
+                                                      "$refGenomeSource1.ownFile.name"
+                                                    #else:
+                                                      "$refGenomeSource1.index.fields.name"
+                                                    #end if
+                                                    #for $i in $AdditionalQueries:
+                                                      #if $i.refGenomeSource.genomeSource == "history":
+                                                        "$i.refGenomeSource.ownFile.name"
+                                                      #else:
+                                                        "$i.refGenomeSource.index.fields.name"
+                                                      #end if
+                                                    #end for
+  </command>
+  <inputs>
+      <param name="input" type="data" format="fasta" label="Input fasta file: reads clipped from their adapter" help="Only with clipped, raw fasta files" multiple="true"/>
+    <param name="mismatches" type="select" label="Number of mismatches allowed" help="specify the number of mismatches allowed during alignments">
+        <option value="0">0</option>
+        <option value="1" selected="true">1</option>
+        <option value="2">2</option>
+        <option value="3">3</option>
+    </param>
+<!-- First bowtie index selection -->
+    <conditional name="refGenomeSource1">
+      <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact GED team">
+          <options from_data_table="bowtie_indexes"/>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" />
+      </when>
+    </conditional>
+<!-- End of first bowtie index selection -->
+<!-- other  bowtie index selections -->
+    <repeat name="AdditionalQueries" title="Additional Alignment Step">
+	<conditional name="refGenomeSource">
+      		<param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options">
+			<option value="indexed">Use a built-in index</option>
+			<option value="history">Use one from the history</option>
+		</param>
+		<when value="indexed">
+			<param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact GED team">
+				<options from_data_table="bowtie_indexes"/>
+			</param>
+		</when>
+		<when value="history">
+			<param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" />
+		</when>
+        </conditional>
+    </repeat>
+<!-- End of other bowtie index selections -->
+   </inputs>
+   <outputs>
+   <data format="tabular" name="output" label="Cascade Annotation Analysis"/>
+   </outputs>
+
+    <test>
+    </test>
+
+  <help>
+
+**Intro**
+
+Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient.
+A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.
+However, this Bowtie wrapper tool only takes FASTQ files as inputs.
+
+Here The sRbowtie wrapper specifically works with short reads FASTA inputs (-v bowtie mode, with -k 1)
+
+.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml
+
+
+------
+
+**What it does**
+
+.. class:: infomark
+
+This script uses the sRbowtie wrapper to iteratively match reads on a reference indexes.
+
+Reads are Matched on DNA references as fast as possible, without taking care of mapping issues
+
+*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*
+
+unaligned reads at step N are used as input for sRbowtie at step N+1
+
+-----
+
+**Input formats**
+
+.. class:: warningmark
+
+*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter*
+
+-----
+
+**OUTPUTS**
+
+**Annotation table**
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sRbowtie.fa	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,40 @@
+>1
+GTATTGTAAGTGGCAGAGTGGC
+>2
+TGGAATGTAAAGAAGTATGGAG
+>3
+GTGGGGAGTTTGGATGGGGCGGCA
+>4
+AATGGCACTGGAAGAATTCACGG
+>5
+GTACGGACAAGGGGAATC
+>6
+TTGGGTTCTGGGGGGAGTATGG
+>7
+GTGGGGAGTTTCGCTGGGGCGGCA
+>8
+TAAGGGTCGGGTAGTGAGGGC
+>9
+AGCTGGGACTGAGGACTG
+>10
+AGCTGGGACTGAGGACTGC
+>11
+AAGGGTCGGGTAGTGAGG
+>12
+GTCGGGTAGTGAGGGCCTT
+>13
+TGGTGGGGCTTGGAACAATTGGAGGGC
+>14
+TGACGGAAGGGCACCACC
+>15
+TGGAATGTAAAGAAGTATGGAG
+>16
+TTGGGTTCTGGGGGGAGT
+>17
+TCAGGTGGGGAGTTTGGCTGGGGCGGCACA
+>18
+TTGGGTATAGGGGCGAAAGA
+>19
+AGCGGGCGTGCTTGTGGAC
+>20
+GTCGAATTTGGGTATAGGGGC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sRbowtie.out	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,5 @@
+2	+	2L	20487495	TGGAATGTAAAGAAGTATGGAG
+4	-	2L	11953463	CCGTGAATTCTTCCAGTGCCATT
+15	+	2L	20487495	TGGAATGTAAAGAAGTATGGAG
+14	-	Uextra	7115665	GGTGGTGCCCTTCCGTCA
+18	+	Uextra	7726410	TTGGGTATAGGGGCGAAAGA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/bowtie_indices.loc.sample	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18	hg18	hg18	/depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon	 hg18	hg18 Canonical	/depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full	 hg18	hg18 Full	 /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19	hg19	hg19	 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,8 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of indexes in the Bowtie mapper format -->
+    <table name="bowtie_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bowtie_indices.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Nov 03 10:26:15 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="bowtie" version="0.12.7">
+      <repository changeset_revision="f54826948b0b" name="package_bowtie_0_12_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>