# HG changeset patch # User drosofff # Date 1415028375 18000 # Node ID ecb041b49cd75eb520fddf09b10fab5a3ede582f Imported from capsule None diff -r 000000000000 -r ecb041b49cd7 sRbowtieCascade.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieCascade.py Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,141 @@ +#!/usr/bin/env python +# small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation +# version 0.9 13-6-2014 +# Usage sRbowtie_cascade.py see Parser() for valid arguments +# Christophe Antoniewski + +import sys, os, subprocess, tempfile, shutil, argparse +from collections import defaultdict + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument('--output', action="store", type=str, help="output file") + the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads") + the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed") + the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether the index should be generated or not by bowtie-buid") + the_parser.add_argument('--index',nargs='+', help="paths to indexed or fasta references") + the_parser.add_argument('--indexName',nargs='+', help="Names of the indexes") + the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") + the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") + args = the_parser.parse_args() + return args + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def bowtie_squash(fasta): + tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name + os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name + cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir) + try: + FNULL = open(os.devnull, 'w') + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string + tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in dev/null + returncode = proc.wait() + tmp_stderr.close() + FNULL.close() + sys.stdout.write(cmd1 + "\n") + except Exception, e: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + stop_err( 'Error indexing reference sequence\n' + str( e ) ) + # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment() + index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention + return index_full_path + +def make_working_dir(): + working_dir = tempfile.mkdtemp() + return working_dir + +def Clean_TempDir(directory): + if os.path.exists( directory ): + shutil.rmtree( directory ) + return + +def bowtie_alignment(command_line="None", working_dir = ""): + FNULL = open(os.devnull, 'w') + p = subprocess.Popen(args=command_line, cwd=working_dir, shell=True, stderr=FNULL, stdout=FNULL) + returncode = p.wait() + sys.stdout.write("%s\n" % command_line) + FNULL.close() + #p = subprocess.Popen(["wc", "-l", "%s/al.fasta"%working_dir], cwd=working_dir, stdout=subprocess.PIPE) + #aligned = p.communicate()[0].split()[0] + aligned = 0 + try: # hacked at gcc2014 in case of no alignment, no al.fasta file generated (?) + F = open ("%s/al.fasta" % working_dir, "r") + for line in F: + aligned += 1 + F.close() + except: pass + sys.stdout.write("Aligned: %s\n" % aligned) + return aligned/2 + +def CommandLiner (v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""): + return "bowtie -v %s -k 1 --best -p %s --al %s/al.fasta --un %s/unal.fasta --suppress 1,2,3,4,5,6,7,8 %s -f %s" % (v_mis, pslots, working_dir, working_dir, index, input) + +def __main__(): + args = Parser() + ## first we make all indexes available. They can be already available or be squashed by bowtie-build + ## we keep them in a list that alternates indexPath and "toClear" or "DoNotDelete" + BowtieIndexList = [] + for indexing_flags, bowtiePath in zip (args.indexing_flags, args.index): + if indexing_flags == "history": + BowtieIndexList.append ( bowtie_squash (bowtiePath) ) + BowtieIndexList.append ( "toClear" ) + else: + BowtieIndexList.append ( bowtiePath ) + BowtieIndexList.append ( "DoNotDelete") + ###### temporary Indexes are generated. They must be deleted at the end (after removing file name in the temp path) + ResultDict = defaultdict(list) + for label, input in zip(args.label, args.input): ## the main cascade, iterating over samples and bowtie indexes + workingDir = make_working_dir() + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[0], input=input, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) # first step of the cascade + if len(BowtieIndexList) > 2: # is there a second step to perform ? + os.rename("%s/al.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) ## end of first step. the aligned reads are the input of the next step + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[2], input="%s/toAlign.fasta"%workingDir, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )## second step of the cascade + if len(BowtieIndexList) > 4: ## remaining steps + for BowtieIndexPath in BowtieIndexList[4::2]: + try: + os.unlink("%s/al.fasta" % workingDir) # hacked at gcc 2014, to remove previous al.fasta file that may interfere with counting if new al.fasta is empty + except: pass + os.rename("%s/unal.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexPath, input="%s/toAlign.fasta"%workingDir, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) + Fun = open("%s/unal.fasta"%workingDir, "r") ## to finish, compute the number of unmatched reads + n = 0 + for line in Fun: + n += 1 + ResultDict[label].append(n/2) + Fun.close() + Clean_TempDir (workingDir) # clean the sample working directory + ## cleaning + for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]): + if IndexFlag == "toClear": + Clean_TempDir ("/".join(IndexPath.split("/")[:-1])) + ## end of cleaning + + + + F = open (args.output, "w") + print >> F, "alignment reference\t%s" % "\t".join(args.label) + for i, reference in enumerate(args.indexName): + F.write ("%s" % reference) + for sample in args.label: + F.write ("\t%s" % "{:,}".format(ResultDict[sample][i]) ) + print >> F + F.write ("Remaining Unmatched") + for sample in args.label: + F.write ("\t%s" % "{:,}".format(ResultDict[sample][-1]) ) + print >> F + + F.close() + +if __name__=="__main__": __main__() diff -r 000000000000 -r ecb041b49cd7 sRbowtieCascade.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieCascade.xml Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,142 @@ + + Using iterative sRbowtie Alignments + + bowtie + + + sRbowtieCascade.py --output $output + --num-threads \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie + --mismatch $mismatches + --input + #for $i in $input: + $i + #end for + --label + #for $i in $input: + "$i.name" + #end for + --index + #if $refGenomeSource1.genomeSource == "history": + $refGenomeSource1.ownFile + #else: + $refGenomeSource1.index.fields.path + #end if + #for $i in $AdditionalQueries: + #if $i.refGenomeSource.genomeSource == "history": + $i.refGenomeSource.ownFile + #else: + $i.refGenomeSource.index.fields.path + #end if + #end for + --indexing-flags + $refGenomeSource1.genomeSource + #for $i in $AdditionalQueries: + $i.refGenomeSource.genomeSource + #end for + --indexName + #if $refGenomeSource1.genomeSource == "history": + "$refGenomeSource1.ownFile.name" + #else: + "$refGenomeSource1.index.fields.name" + #end if + #for $i in $AdditionalQueries: + #if $i.refGenomeSource.genomeSource == "history": + "$i.refGenomeSource.ownFile.name" + #else: + "$i.refGenomeSource.index.fields.name" + #end if + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Intro** + +Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient. +A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution. +However, this Bowtie wrapper tool only takes FASTQ files as inputs. + +Here The sRbowtie wrapper specifically works with short reads FASTA inputs (-v bowtie mode, with -k 1) + +.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml + + +------ + +**What it does** + +.. class:: infomark + +This script uses the sRbowtie wrapper to iteratively match reads on a reference indexes. + +Reads are Matched on DNA references as fast as possible, without taking care of mapping issues + +*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8* + +unaligned reads at step N are used as input for sRbowtie at step N+1 + +----- + +**Input formats** + +.. class:: warningmark + +*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter* + +----- + +**OUTPUTS** + +**Annotation table** + + + diff -r 000000000000 -r ecb041b49cd7 test-data/sRbowtie.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sRbowtie.fa Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,40 @@ +>1 +GTATTGTAAGTGGCAGAGTGGC +>2 +TGGAATGTAAAGAAGTATGGAG +>3 +GTGGGGAGTTTGGATGGGGCGGCA +>4 +AATGGCACTGGAAGAATTCACGG +>5 +GTACGGACAAGGGGAATC +>6 +TTGGGTTCTGGGGGGAGTATGG +>7 +GTGGGGAGTTTCGCTGGGGCGGCA +>8 +TAAGGGTCGGGTAGTGAGGGC +>9 +AGCTGGGACTGAGGACTG +>10 +AGCTGGGACTGAGGACTGC +>11 +AAGGGTCGGGTAGTGAGG +>12 +GTCGGGTAGTGAGGGCCTT +>13 +TGGTGGGGCTTGGAACAATTGGAGGGC +>14 +TGACGGAAGGGCACCACC +>15 +TGGAATGTAAAGAAGTATGGAG +>16 +TTGGGTTCTGGGGGGAGT +>17 +TCAGGTGGGGAGTTTGGCTGGGGCGGCACA +>18 +TTGGGTATAGGGGCGAAAGA +>19 +AGCGGGCGTGCTTGTGGAC +>20 +GTCGAATTTGGGTATAGGGGC diff -r 000000000000 -r ecb041b49cd7 test-data/sRbowtie.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sRbowtie.out Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,5 @@ +2 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG +4 - 2L 11953463 CCGTGAATTCTTCCAGTGCCATT +15 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG +14 - Uextra 7115665 GGTGGTGCCCTTCCGTCA +18 + Uextra 7726410 TTGGGTATAGGGGCGAAAGA diff -r 000000000000 -r ecb041b49cd7 tool-data/bowtie_indices.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/bowtie_indices.loc.sample Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +# +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +# diff -r 000000000000 -r ecb041b49cd7 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
diff -r 000000000000 -r ecb041b49cd7 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Nov 03 10:26:15 2014 -0500 @@ -0,0 +1,6 @@ + + + + + +