# HG changeset patch
# User drosofff
# Date 1403626603 14400
# Node ID de6a6afc5a79856be1bc72160d9e6a6a70233a12
Uploaded
diff -r 000000000000 -r de6a6afc5a79 FeaturesParser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FeaturesParser.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+# python parser module to analyse Features in sRbowtie alignments (guided by a GFF3 file)
+# version 0.9
+# Usage FeaturesParser.py <1:index source> <2:extraction directive> <3:output> <4:GFF3 guide file> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib>
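+# Example invocation (made-up paths): FeaturesParser.py genome.fa --do_not_extract_index counts.tab guide.gff3 s1.sam sam Sample1 s2.bam bam Sample2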
+
+import sys
+from smRtools import *
+from collections import *
+
+IndexSource = sys.argv[1]
+ExtractionDirective = sys.argv[2]
+if ExtractionDirective == "--do_not_extract_index":
+ genomeRefFormat = "fastaSource"
+elif ExtractionDirective == "--extract_index":
+ genomeRefFormat = "bowtieIndex"
+Output = sys.argv[3]
+GFF3_file = sys.argv[4]
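+# the remaining arguments are consumed three by three: [filePath, FileExt, FileLabel] per sample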
+Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)]
+MasterListOfGenomes = {}
+FeatureDict = defaultdict(dict)
+
+for [filePath, FileExt, FileLabel] in Triplets:
+ MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat)
+ FeatureDict[FileLabel] = MasterListOfGenomes[FileLabel].CountFeatures(GFF3=GFF3_file)
+
+# pick up the GFF3 features in their order of appearance in the guide file
+F = open(GFF3_file, "r")
+featureList = []
+for line in F:
+ if line[0] == "#": continue
+ feature = line.split()[2]
+ if feature not in featureList:
+ featureList.append(feature)
+F.close()
+
+header = ["#Feature"]
+for [filePath, FileExt, FileLabel] in Triplets:
+ header.append(FileLabel)
+
+F = open (sys.argv[3], "w")
+print >> F, "\t".join(header)
+for feature in featureList:
+ line=[feature]
+ for sample in header[1:]:
+ count = str (FeatureDict[sample][feature])
+# uncomment to get percentage in addition to counts
+# percent = float(FeatureDict[sample][feature]) / MasterListOfGenomes[sample].alignedReads
+# value = "%s | %0.2f" % (count, percent)
+# line.append(value)
+ line.append(count)
+ print >> F, "\t".join(line )
+line = ["Unfeatured"]
+for sample in header[1:]:
+ matched = 0
+ for feature in FeatureDict[sample]:
+ matched += FeatureDict[sample][feature]
+ unmatched = MasterListOfGenomes[sample].alignedReads - matched
+# uncomment to get percentage in addition to counts
+# percent = float (unmatched) / (matched + unmatched)
+# value = "%s | %0.2f" % (unmatched, percent)
+# line.append(value)
+ line.append("%s" % unmatched)
+print >> F, "\t".join(line)
+F.close()
diff -r 000000000000 -r de6a6afc5a79 FeaturesParser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FeaturesParser.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,61 @@
+
+ in sRbowtie alignment
+ bowtie-inspect
+
+
+ FeaturesParser.py
+ #if $refGenomeSource.genomeSource == "history":
+ $refGenomeSource.ownFile ## index source ## 1
+ --do_not_extract_index ## 2
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ $reference ## index source ## 1
+ --extract_index ## 2
+ #end if
+ $output ## 3
+ $gff3 ## 4
+ #for $i in $refGenomeSource.input_list
+ $i $i.ext "$i.name"
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Parses feature counts from one or several sRbowtie alignments (in tabular, SAM or BAM format).
+
+Both sense and antisense alignments are counted
+
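+The output is a tab-delimited count table: one line per GFF3 feature type, one column per sample, plus a final "Unfeatured" line for aligned reads that fall in no feature. A sketch with made-up numbers::
+
+  #Feature      Sample1   Sample2
+  exon          12345     23456
+  Unfeatured    1111      2222
+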
+The library labels are inferred from the input dataset names in the Galaxy history.
+
+**It is thus essential that input datasets are appropriately renamed**
+
+**It is preferable that you do not put any spaces in these input dataset names. You may edit these names in the history**
+
+
+
+
+
+
diff -r 000000000000 -r de6a6afc5a79 MDS_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MDS_wrapper.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# mirdeep* python wrapper
+# refactoring a python version of the MDS_wrapper.pl
+# Usage MDS_wrapper.py <1:input fasta> <2:genome> <3:GFF3 output> <4:data.result output> <5:data.cluster output>
+
+import sys, re, os, subprocess, shlex, tempfile, shutil
+
+def stop_err( msg ):
+ sys.stderr.write( '%s\n' % msg )
+ sys.exit()
+
+MDS_path = "/home/galaxy/bin/MDS_command_line_v32/MDS_command_line"
+
+input_full_path = sys.argv[1]
+MDS_genome = sys.argv[2]
+gff3_output = sys.argv[3]
+dataresult = sys.argv[4]
+datacluster = sys.argv[5]
+
+tmp_MDS_work_dir = tempfile.mkdtemp(dir = MDS_path) # make temp directory for MDS analysis
+os.symlink(input_full_path, tmp_MDS_work_dir+"/data.fa" ) # symlink between the fasta source file and the required "data.fa" input
+os.symlink(MDS_path+"/MDS_command_line.jar", tmp_MDS_work_dir+"/MDS_command_line.jar" ) # symlink to jar source in working directory
+os.symlink(MDS_path+"/genome", tmp_MDS_work_dir+"/genome")
+os.symlink(MDS_path+"/targetScan", tmp_MDS_work_dir+"/targetScan")
+os.symlink(MDS_path+"/targetScan_files", tmp_MDS_work_dir+"/targetScan_files")
+
+# execute MirDeep*
+command_line = "java -Xmx4g -jar " + MDS_path + "/MDS_command_line.jar -r 5 -g " + MDS_genome + " data.fa" # -Xmx12g
+print command_line
+#tmp = tempfile.NamedTemporaryFile( dir=tmp_MDS_work_dir ).name
+#tmp_stderr = open( tmp, 'wb' )
+
+try:
+ os.chdir(tmp_MDS_work_dir)
+ p = subprocess.Popen(args=command_line, cwd=tmp_MDS_work_dir, shell=True, stderr=sys.stderr)
+ returncode = p.wait()
+ shutil.copy2 ("data.result", dataresult)
+ shutil.copy2 ("data.cluster", datacluster)
+ dataFILE = open("data.result", "r")
+ datafile = dataFILE.readlines()
+ dataFILE.close()
+ GFF3OUT = open(gff3_output, "w")
+ print >> GFF3OUT,"##gff-version 3"
+ print >> GFF3OUT, "##Seqid Source Type Start End Score Strand Phase Attributes"
+ print >> GFF3OUT, "##"
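+ # data.result columns as consumed below: 0=miRNA id, 1=score, 2=chromosome, 3=strand, 4=start-end, 5=read count, 7/8/9=extra annotations (9=mature sequence)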
+ for line in datafile[1:]:
+ fields = line.split("\t")
+ Seqid, Source, Type, Start, End, Score, Strand, Phase = fields[2], "MirDeep*", "hairPin_loci", fields[4].split("-")[0], fields[4].split("-")[1], fields[1], fields[3], "."
+ ID = "ID=%s;%s_reads;%s;%s;mature_seq:%s" % (fields[0],fields[5],fields[7],fields[8],fields[9])
+ print >> GFF3OUT, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (Seqid, Source, Type, Start, End, Score, Strand, Phase, ID)
+ GFF3OUT.close()
+ if os.path.exists( tmp_MDS_work_dir ):
+ shutil.rmtree( tmp_MDS_work_dir )
+ else:
+ print "Error in cleaning tmp working directory"
+
+except Exception, e:
+ # clean up temp dir
+ if os.path.exists( tmp_MDS_work_dir ):
+ shutil.rmtree( tmp_MDS_work_dir )
+ stop_err( 'Error running MDS_command_line.jar\n' + str( e ) )
+
+
diff -r 000000000000 -r de6a6afc5a79 MirDeepStar.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MirDeepStar.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,32 @@
+
+
+MDS_wrapper.py $input $genome $output $output2 $output3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+MirDeep* wrapper
+
+
+
+
+
+
diff -r 000000000000 -r de6a6afc5a79 MirParser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MirParser.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+# python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3
+# version 0.0.9 (1-6-2014)
+# Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
+# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
+# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
+
+import sys, subprocess
+from smRtools import *
+
+IndexSource = sys.argv[1]
+ExtractionDirective = sys.argv[2]
+if ExtractionDirective == "--do_not_extract_index":
+ genomeRefFormat = "fastaSource"
+elif ExtractionDirective == "--extract_index":
+ genomeRefFormat = "bowtieIndex"
+OutputPre_mirs = sys.argv[3]
+OutputMature_Mirs = sys.argv[4]
+GFF3_file = sys.argv[5]
+lattice = sys.argv[6]
+Rcode = sys.argv[7]
+latticePDF = sys.argv[8]
+Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)]
+MasterListOfGenomes = {}
+
+for [filePath, FileExt, FileLabel] in Triplets:
+ print FileLabel
+ MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel)
+
+header = ["gene"]
+for [filePath, FileExt, FileLabel] in Triplets:
+ header.append(FileLabel)
+
+hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation
+
+## read GFF3 to subinstantiate
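+## standard GFF3 columns: seqid, source, type, start, end, score, strand, phase, attributes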
+gff3 = open (GFF3_file, "r")
+lattice_dataframe = []
+for line in gff3:
+ if line[0] == "#": continue
+ gff_fields = line[:-1].split("\t")
+ chrom = gff_fields[0]
+ gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
+ item_upstream_coordinate = int(gff_fields[3])
+ item_downstream_coordinate = int(gff_fields[4])
+ if gff_fields[6] == "+":
+ item_polarity = "forward"
+ else:
+ item_polarity = "reverse"
+ item_line = [gff_name]
+ for sample in header[1:]:
+ count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity)
+ item_line.append(str(count))
+ ## special handling for lattice output
+ if lattice != "dummy_dataframe_path":
+ if ("5p" not in gff_name) and ("3p" not in gff_name):
+ lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) )
+ ## end of lattice handling
+ hit_table.append("\t".join(item_line) )
+gff3.close()
+
+Fpremirs = open (OutputPre_mirs, "w")
+print >> Fpremirs, hit_table[0]
+finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)]
+print >> Fpremirs, "\n".join(finalPreList )
+Fpremirs.close()
+
+Fmaturemires = open (OutputMature_Mirs, "w")
+print >> Fmaturemires, hit_table[0]
+finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)]
+print >> Fmaturemires, "\n".join(finalMatureList )
+Fmaturemires.close()
+
+if lattice != "dummy_dataframe_path":
+ Flattice = open(lattice, "w")
+ print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity")
+ print >> Flattice, "\n".join(lattice_dataframe)
+ Flattice.close()
+ R_command="Rscript "+ Rcode
+ process = subprocess.Popen(R_command.split())
+ process.wait()
diff -r 000000000000 -r de6a6afc5a79 MirParser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MirParser.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,148 @@
+
+ from sRbowtie alignment
+ bowtie-inspect
+
+
+ MirParser.py
+ #if $refGenomeSource.genomeSource == "history":
+ $refGenomeSource.ownFile ## index source sys.arg[1]
+ --do_not_extract_index ## sys.argv[2]
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ $reference ## sys.argv[1]
+ --extract_index ## sys.argv[2]
+ #end if
+ $output1 ## for pre-mirs ## sys.argv[3]
+ $output2 ## for mature mirs ## sys.argv[4]
+ $GFF3 ## sys.argv[5]
+ #if $plotting.plottingOption == "yes":
+ $lattice_dataframe ## sys.argv[6]
+ $plotCode ## sys.argv[7]
+ $latticePDF ## sys.argv[8]
+ #else:
+ "dummy_dataframe_path" ## sys.argv[6]
+ "dummy_plotCode" ## sys.argv[7]
+ "dummy_latticePDF" ## sys.argv[8]
+ #end if
+ #for $i in $refGenomeSource.input_list
+ $i $i.ext "$i.name" ## sys.argv[9,10,11] modulo 3
+ #end for
+ #silent plottingoption = $plotting.plottingOption
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if $plotting.plottingOption == "yes":
+ graph_type = "${plotting.display}" ## "relative" or "absolute"
+ ## Setup R error handling to go to stderr
+ options( show.error.messages=F,
+ error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+ library(lattice)
+ coverage = read.delim("${lattice_dataframe}", header=T)
+ Numb_of_biosamples = length(levels(coverage\$sample))
+ if (graph_type=="relative") {
+ graph = xyplot(countsNorm~offsetNorm | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1,
+ scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps")
+ } else {
+ graph = xyplot(counts~offset | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1,
+ scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps")
+ }
+ ## pdf output
+ pdf(file="${latticePDF}", paper="special", height=11.69, width=8.2677)
+ plot(graph, newpage = T)
+ dev.off()
+ #end if
+
+
+
+
+
+
+
+ plotting['plottingOption'] == "yes"
+
+
+ plotting['plottingOption'] == "yes"
+
+
+
+
+**What it does**
+
+This tool uses a species-specific GFF3 file from mirBase_ to guide the parsing of an alignment file produced with the sRbowtie tool.
+
+.. _mirBase: ftp://mirbase.org/pub/mirbase/CURRENT/genomes/
+
+------
+
+.. class:: warningmark
+
+The guide GFF3 file must be in the following format:
+
+2L . miRNA_primary_transcript 243035 243141 . - . ID=MI0005821;Alias=MI0005821;Name=dme-mir-965
+
+2L . miRNA 243055 243076 . - . ID=MIMAT0005480;Alias=MIMAT0005480;Name=dme-miR-965-3p;Derives_from=MI0005821
+
+2L . miRNA 243096 243118 . - . ID=MIMAT0020861;Alias=MIMAT0020861;Name=dme-miR-965-5p;Derives_from=MI0005821
+
+2L . miRNA_primary_transcript 857542 857632 . + . ID=MI0005813;Alias=MI0005813;Name=dme-mir-375
+
+2L . miRNA 857596 857617 . + . ID=MIMAT0005472;Alias=MIMAT0005472;Name=dme-miR-375-3p;Derives_from=MI0005813
+
+2L . miRNA 857556 857579 . + . ID=MIMAT0020853;Alias=MIMAT0020853;Name=dme-miR-375-5p;Derives_from=MI0005813
+
+2L . miRNA_primary_transcript 1831685 1831799 . - . ID=MI0011290;Alias=MI0011290;Name=dme-mir-2280
+
+The Name attribute of mature miRNA lines (3rd column = miRNA) must contain either the -3p or the -5p string (Name=dme-miR-965-3p, for instance).
+
+------
+
+**Input formats**
+
+1. One or several alignment files generated with the sRbowtie tool and **renamed** according to the name of the biosample (avoid spaces in biosample labels)
+
+.. class:: warningmark
+
+Alignment datasets generated with sRbowtie must be renamed according to a biosample name
+
+2. A GFF3 file retrieved from mirBase_
+
+------
+
+**Outputs**
+
+Two count tables: one for reads aligned to pre-mirs, one for reads aligned to mature miRNAs.
+
+A PDF of pre-mir coverage plots. Red coverage indicates that the mir gene lies on the genomic forward strand, blue coverage that it lies on the reverse strand.
+
+
+
diff -r 000000000000 -r de6a6afc5a79 annotation_collector.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_collector.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# By drosofff@gmail.com
+# command: annotation_collector.py $output input1, label1, input2, label2, etc...
+
+import sys, os
+
+def countlineinfile(file):
+ F = open (file, "r")
+ count = 0
+ for line in F:
+ count += 1
+ F.close()
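+ # matched fasta files carry two lines per read (header + sequence), hence the division by 2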
+ return count/2
+results = []
+
+for file, label in zip (sys.argv[2:-1:2], sys.argv[3:-1:2]):
+ results.append ( (countlineinfile(file), label) )
+
+Fout = open (sys.argv[1], "w")
+
+print >> Fout, "# %s" % (sys.argv[-1])
+for filecount, label in results:
+ print >> Fout, "%s\t%s\t%.2f %%" % (label, filecount, filecount/float(results[0][0])*100 )
+print >> Fout
+Fout.close()
diff -r 000000000000 -r de6a6afc5a79 annotation_collector.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_collector.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,37 @@
+
+ from matched fasta files
+
+ annotation_collector.py
+ $out_file
+ $input1 $label1
+ #for $q in $queries
+ ${q.input2} ${q.label2}
+ #end for
+ $title
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+**WARNING:** Be careful not to aggregate datasets of different kinds (e.g., datasets must all be matched fasta files). This tool does not check whether the datasets being annotated are in the same format.
+
+-----
+
+**What it does**
+
+Generates an annotation table from matched fasta files.
+
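+Percentages are computed relative to the first dataset of the list. A sketch of the output with made-up numbers::
+
+  # My annotation summary
+  total reads     100000    100.00 %
+  miRNA matched   12000     12.00 %
+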
+
+
\ No newline at end of file
diff -r 000000000000 -r de6a6afc5a79 bowtie_indices.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bowtie_indices.loc.sample Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,37 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Bowtie indexed sequences data files. You will
+#need to create these data files and then create a bowtie_indices.loc
+#file similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bowtie_indices.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_base_path>
+#
+#So, for example, if you had hg18 indexed stored in
+#/depot/data2/galaxy/bowtie/hg18/,
+#then the bowtie_indices.loc entry would look like this:
+#
+#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
+#
+#and your /depot/data2/galaxy/bowtie/hg18/ directory
+#would contain hg18.*.ebwt files:
+#
+#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt
+#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt
+#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt
+#...etc...
+#
+#Your bowtie_indices.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files. For example:
+#
+#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
+#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
+#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
diff -r 000000000000 -r de6a6afc5a79 piRNAsignature.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/piRNAsignature.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+# script for computing overlap signatures from a bowtie output
+# Christophe Antoniewski
+# Usage piRNAsignature.py <1:input> <2:format of input> <3:minsize query> <4:maxsize query> <5:minsize target> <6:maxsize target>
+# <7:minscope> <8:maxscope> <9:output> <10:bowtie index> <11:procedure option> <12: graph (global or lattice)>
+# <13: R code>
+
+import sys, subprocess
+from smRtools import *
+from collections import defaultdict # test whether it is required
+
+# choose the reference format and the input alignment format; input formats
+# other than "tabular" or "sam" fall back to "bam"
+genomeRefFormat = "bowtieIndex" if sys.argv[11] == "--extract_index" else "fastaSource"
+fileFormat = sys.argv[2] if sys.argv[2] in ("tabular", "sam") else "bam"
+Genome = HandleSmRNAwindows (sys.argv[1], fileFormat, sys.argv[10], genomeRefFormat)
+# note: sam and bam could be merged into a single branch if pysam treats them identically
+
+objDic = Genome.instanceDict
+
+minquery = int(sys.argv[3])
+maxquery = int(sys.argv[4])
+mintarget = int(sys.argv[5])
+maxtarget = int(sys.argv[6])
+minscope = int(sys.argv[7])
+maxscope = int(sys.argv[8]) + 1
+general_frequency_table = dict ([(i,0) for i in range(minscope,maxscope)])
+general_percent_table = dict ([(i,0) for i in range(minscope,maxscope)])
+OUT = open (sys.argv[9], "w")
+
+if sys.argv[12] == "global":
+ ###### for normalized summing of local_percent_table(s)
+ readcount_dic = {}
+ Total_read_in_objDic = 0
+ for item in objDic:
+ readcount_dic[item] = objDic[item].readcount(minquery, maxquery)
+ Total_read_in_objDic += readcount_dic[item]
+ ######
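+ # in the loop below, each item's percent (hannon) signature is weighted by its share of total aligned reads before summing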
+ for x in (objDic):
+ local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
+ local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
+ try:
+ for overlap in local_frequency_table.keys():
+ general_frequency_table[overlap] = general_frequency_table.get(overlap, 0) + local_frequency_table[overlap]
+ except:
+ pass
+ try:
+ for overlap in local_percent_table.keys():
+ general_percent_table[overlap] = general_percent_table.get(overlap, 0) + (1./Total_read_in_objDic*readcount_dic[x]*local_percent_table[overlap])
+ except:
+ pass
+ print >> OUT, "overlap\tnum of pairs\tprobability"
+ for classe in sorted(general_frequency_table):
+ print >> OUT, "%i\t%i\t%f" % (classe, general_frequency_table[classe], general_percent_table[classe])
+
+else:
+ print >> OUT, "overlap\tnum of pairs\tprobability\titem"
+ for x in (objDic):
+ local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
+ local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
+ for classe in range(minscope,maxscope):
+ print >> OUT, "%i\t%i\t%f\t%s" % (classe, local_frequency_table[classe], local_percent_table[classe], x)
+
+OUT.close()
+
+## Run the R script that is defined in the xml using the Rscript binary provided with R.
+R_command="Rscript "+ sys.argv[13]
+process = subprocess.Popen(R_command.split())
+process.wait()
diff -r 000000000000 -r de6a6afc5a79 piRNAsignature.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/piRNAsignature.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,122 @@
+
+
+
+ piRNAsignature.py $refGenomeSource.input $refGenomeSource.input.ext $minquery $maxquery $mintarget $maxtarget $minscope $maxscope $output
+ #if $refGenomeSource.genomeSource == "history":
+ $refGenomeSource.ownFile
+ --do_not_extract_index
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ $reference
+ --extract_index
+ #end if
+ $graph_type
+ $sigplotter
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ graph_type = "${graph_type}"
+
+ globalgraph = function () {
+ ## Setup R error handling to go to stderr
+ options( show.error.messages=F,
+ error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+ signature = read.delim("${output}", header=TRUE)
+ ## Open output1 PDF file
+ pdf( "${output1}" )
+ signaturez=(signature[,2] -mean(signature[,2]))/sd(signature[,2])
+ plot(signaturez, type = "b", main="signature", cex.main=2, xlab="overlap (nt)", ylab="z-score", pch=19, cex=0.8, col="darkslateblue", lwd=3, cex.lab=1.5, cex.axis=1.4, xaxt="n")
+ axis(1, at=seq(from=1, to=length(signature[,1]), by=3) )
+ devname = dev.off()
+ ## Close the PDF file
+ ## Open output2 PDF file
+ pdf( "${output2}" )
+ YLIM=max(signature[,2])
+ plot(signature[,1:2], type = "h", xlab="overlap (nt)", ylim=c(0,YLIM), ylab="number of pairs", col="darkslateblue", lwd=7)
+ devname = dev.off()
+ ## Close the PDF file
+ ## Open output3 PDF file
+ pdf( "${output3}" )
+ plot(signature[,1], signature[,3]*100, type = "l", main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs",
+ cex.main=1, xlab="overlap (nt)", ylab="ping-pong signal [%]", ylim=c(0,50),
+ pch=19, col="darkslateblue", lwd =4, cex.lab=1.2, cex.axis=1, xaxt="n")
+ axis(1, at=seq(from=1, to=length(signature[,1]), by=3) )
+ devname = dev.off()
+ ## Close the PDF file
+ }
+
+ treillisgraph = function () {
+ ## Open output3 PDF file
+ pdf( "${output3}", paper="special", height=11.69, width=8.2677 )
+ signature = read.delim("${output}", header=TRUE)
+ options( show.error.messages=F,
+ error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+ library(lattice)
+ print (xyplot(signature[,3]*100~signature[,1]|signature[,4], type = "l", xlim=c(1,26), main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs",
+ par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), scales=list(cex=0.5),
+ cex.main=1, cex=.5, xlab="overlap (nt)", ylab="ping-pong signal [%]",
+ pch=19, col="darkslateblue", lwd =1.5, cex.lab=1.2, cex.axis=1.2,
+ layout=c(4,12), as.table=TRUE, newpage = T) )
+ devname = dev.off()
+ }
+
+ if (graph_type=="global") {
+ globalgraph()
+
+ }
+ if(graph_type=="lattice") {
+ treillisgraph()
+ }
+
+
+
+
+
+
+ (graph_type == "global")
+
+
+ (graph_type == "global")
+
+
+
+
+
+
+**What it does**
+
+This tool computes, from a bowtie output file, the number of read pairs in each overlap class (in nt), the z-score calculated from these numbers of pairs, and the ping-pong signal as described in Brennecke et al (2009) Science.
+The numerical options set the min and max sizes of both the query and the target small RNA classes.
+The three signals are plotted in separate pdf files: the number of pairs found, the z-score, and the ping-pong signal.
+
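+The tabular output feeding these plots has the following layout (made-up numbers)::
+
+  overlap   num of pairs   probability
+  1         120            0.0241
+  2         98             0.0197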
+
+
diff -r 000000000000 -r de6a6afc5a79 readmap.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readmap.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+# python parser module for readmaps and size distributions, guided by GFF3
+# version 0.9.1 (1-6-2014)
+# Usage readmap.py, see Parser() for valid arguments
+
+import sys, subprocess, argparse
+from smRtools import *
+from collections import OrderedDict, defaultdict
+import os
+
+def Parser():
+ the_parser = argparse.ArgumentParser()
+ the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe")
+ the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
+ the_parser.add_argument('--reference_fasta', action="store", type=str, help="path to the fasta reference")
+ the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
+ the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
+ the_parser.add_argument('--ext',nargs='+', help="input file type")
+ the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
+ the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
+ the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
+ the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
+ the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
+ the_parser.add_argument('--rcode', type=str, help="R script")
+ args = the_parser.parse_args()
+ return args
+
+args=Parser()
+if args.reference_fasta:
+ genomeRefFormat = "fastaSource"
+ genomeRefFile = args.reference_fasta
+if args.reference_bowtie_index:
+ genomeRefFormat = "bowtieIndex"
+ genomeRefFile = args.reference_bowtie_index
+readmap_file=args.output_readmap
+size_distribution_file=args.output_size_distribution
+minquery=args.minquery
+maxquery=args.maxquery
+Rcode = args.rcode
+filePath=args.input
+fileExt=args.ext
+fileLabel=args.label
+normalization_factor=args.normalization_factor
+
+MasterListOfGenomes = OrderedDict()
+
+def process_samples(filePathList):
+ for i, filePath in enumerate(filePathList):
+ norm=normalization_factor[i]
+ print fileLabel[i]
+ MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
+ biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
+ return MasterListOfGenomes
+
+def write_readplot_dataframe(readDict, readmap_file):
+ with open(readmap_file, 'w') as readmap:
+ print >>readmap, "gene\tcoord\tcount\tpolarity\tsample"
+ for sample in readDict.keys():
+ if args.gff:
+ dict=readDict[sample]
+ else:
+ dict=readDict[sample].instanceDict
+ for gene in dict.keys():
+ plottable = dict[gene].readplot()
+ for line in plottable:
+ print >>readmap, "%s\t%s" % (line, sample)
+
+def write_size_distribution_dataframe(readDict, size_distribution_file):
+ with open(size_distribution_file, 'w') as size_distrib:
+ print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
+ for sample in readDict.keys():
+ if args.gff:
+ dict=readDict[sample]
+ else:
+ dict=readDict[sample].instanceDict
+ for gene in dict.keys():
+ histogram = dict[gene].size_histogram()
+ for polarity in histogram.keys():
+ for item in histogram[polarity].iteritems():
+ print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
+
+def gff_item_subinstances(readDict, gff3):
+ GFFinstanceDict=OrderedDict()
+ with open(gff3) as gff:
+ for line in gff:
+ if line[0] == "#": continue
+ gff_fields = line[:-1].split("\t")
+ chrom = gff_fields[0]
+ gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
+ item_upstream_coordinate = int(gff_fields[3])
+ item_downstream_coordinate = int(gff_fields[4])
+ item_polarity = gff_fields[6]
+ for sample in readDict.keys():
+ if sample not in GFFinstanceDict:
+ GFFinstanceDict[sample]={}
+ subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
+ if item_polarity == '-':
+ subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
+ subinstance.gene=gff_name
+ GFFinstanceDict[sample][gff_name]=subinstance
+ return GFFinstanceDict
+
+MasterListOfGenomes=process_samples(filePath)
+
+if args.gff:
+ MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
+
+write_readplot_dataframe(MasterListOfGenomes, readmap_file)
+write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
+
+R_command="Rscript "+ Rcode
+process = subprocess.Popen(R_command.split())
+process.wait()
+
diff -r 000000000000 -r de6a6afc5a79 readmap.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readmap.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,225 @@
+
+ from sRbowtie alignment
+ bowtie-inspect
+
+
+ readmap.py
+ #if $refGenomeSource.genomeSource == "history":
+ --reference_fasta ## sys.argv[2]
+ $refGenomeSource.ownFile ## index source
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ --reference_bowtie_index
+ $reference
+ #end if
+ --rcode
+ $plotCode
+ --output_readmap
+ $readmap_dataframe
+ --output_size_distribution
+ $size_distribution_dataframe
+ --minquery
+ $minquery
+ --maxquery
+ $maxquery
+ --input
+ #for $i in $refGenomeSource.series
+ $i.input
+ #end for
+ --ext
+ #for $i in $refGenomeSource.series
+ $i.input.ext
+ #end for
+ --label
+ #for $i in $refGenomeSource.series
+ "$i.input.name"
+ #end for
+ --normalization_factor
+ #for $i in $refGenomeSource.series
+ $i.norm
+ #end for
+ #if $gff:
+ --gff
+ $gff
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Setup R error handling to go to stderr
+ options( show.error.messages=F,
+ error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+ library(RColorBrewer)
+ library(lattice)
+ library(latticeExtra)
+ library(grid)
+ library(gridExtra)
+ ##cheetahtemplate data frame implementation
+
+ rm=read.delim("${readmap_dataframe}", header=T, row.names=NULL)
+ pdf(file="${readmap_PDF}", paper="special", height=11.69, width=8.2677)
+ n_samples=length(unique(rm\$sample))
+
+ genes=unique(levels(rm\$gene))
+ per_gene_readmap=lapply(genes, function(x) subset(rm, gene==x))
+ n_genes=length(per_gene_readmap)
+
+
+ par.settings.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
+ par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
+ par.settings.combination.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
+ par.settings.combination.size=list(layout.heights=list(top.padding=-2, bottom.padding=-0.5), fontsize = list(text=96/${rows_per_page}, points=8))
+
+
+ plot_readmap=function(df, ...) {
+ combineLimits(xyplot(count~coord|factor(sample, levels=unique(sample))+reorder(gene, count, function(x) -sum(abs(x))),
+ data=df,
+ type='h',
+ scales= list(relation="free", x=list(rot=0, cex=0.75, axs="i", tck=0.5), y=list(tick.number=4, rot=90, cex=0.75)),
+ xlab=NULL, main=NULL, ylab=NULL,
+ as.table=T,
+ origin = 0,
+ horizontal=FALSE,
+ group=polarity,
+ col=c("red","blue"),
+ ...))
+ }
+
+ plot_size_distribution= function(df, ...) {
+ smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);}
+ bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0,
+ horizontal=FALSE,
+ group=polarity,
+ stack=TRUE,
+ col=c('red', 'blue'),
+ cex=0.75,
+ scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75),
+ prepanel=smR.prepanel,
+ xlab = NULL,
+ ylab = NULL,
+# par.settings=list(layout.heights=list(top.padding=-2, bottom.padding=-3), fontsize = list(text=8, points=8)),
+ main = NULL , as.table=TRUE, newpage = T, ...)
+ combineLimits(bc)
+ }
+
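+ ## paginate: ${rows_per_page} genes per page of the readmap PDF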
+ for (i in seq(1,n_genes,${rows_per_page})) {
+ start=i
+ end=i+${rows_per_page}-1
+ if (end>n_genes) {end=n_genes}
+ readmap_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.readmap))
+ args.list=c(readmap_plot.list, list(nrow=${rows_per_page}, ncol=1, main="readmaps", left="${ylabel}", sub="readmap coordinate"))
+ do.call(grid.arrange, args.list)
+ }
+
+ devname=dev.off()
+
+ size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL)
+ per_gene_size=lapply(genes, function(x) subset(size, gene==x))
+
+ pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677)
+
+ for (i in seq(1,n_genes,${rows_per_page})) {
+ start=i
+ end=i+${rows_per_page}-1
+ if (end>n_genes) {end=n_genes}
+ plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, par.settings=par.settings.size))
+ args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="size distribution", left="${ylabel}", sub="readsize in nucleotides"))
+ do.call(grid.arrange, args.list)
+ }
+
+ devname=dev.off()
+
+ pdf(file="${combi_PDF}", paper="special", height=11.69, width=8.2677)
+
+ for (i in seq(1,n_genes,${rows_per_page}/2)) {
+ start=i
+ end=i+${rows_per_page}/2-1
+ if (end>n_genes) {end=n_genes}
+ read_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.combination.readmap))
+ size_plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, strip=FALSE, par.settings=par.settings.combination.size))
+ plot.list=rbind(read_plot.list, size_plot.list )
+ args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="${title}", left="${ylabel}", sub="${xlabel}"))
+ do.call(grid.arrange, args.list)
+ }
+
+ devname=dev.off()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces a "Readmap",
+where by default for each "chromosome" the position of the read is recorded on the x-axis, and the y-axis indicates
+the number of reads per position. Reads that map in sense are on the top, reads that map antisense are on the bottom.
+
+
+.. class:: warningmark
+
+'''TIP''' The input data can be produced using the sRbowtie tool.
+
+----
+
+'''Example'''
+
+For a SAM input like the following::
+
+ 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0
+
+ 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0
+
+ 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0
+
+the tool produces a plot like this:
+
+----
+
+.. image:: static/images/readmap.png
+ :height: 800
+ :width: 500
+
+
+
diff -r 000000000000 -r de6a6afc5a79 sRbowtie.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtie.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# small RNA oriented bowtie wrapper
+# version 1.01 29-5-2014
+# Usage sRbowtie.py <1 input_fasta_file> <2 alignment method> <3 -v mismatches> <4 out_type> <5 buildIndexIfHistory> <6 fasta/bowtie index> <7 bowtie output> <8 ali_fasta> <9 unali_fasta> <10 --num-threads \${GALAXY_SLOTS:-4}>
+# current rev: for bowtie --norc, moved from --suppress 2,6,7,8 to --suppress 6,7,8. Future parsers must be updated to take this standardisation into account
+# To Do:
+# implement an arg parser
+# Christophe Antoniewski
+
+import sys, os, subprocess, tempfile, shutil
+
+def stop_err( msg ):
+ sys.stderr.write( '%s\n' % msg )
+ sys.exit()
+
+def bowtieCommandLiner (alignment_method, v_mis, out_type, aligned, unaligned, input, index, output, pslots="12"):
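+ # e.g. (made-up paths) bowtieCommandLiner("unique", "1", "sam", "None", "None", "reads.fa", "/path/index", "out.sam")
+ # returns: bowtie -v 1 -m 1 -p 12 --suppress 6,7,8 -S /path/index -f reads.fa > out.sam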
+ if alignment_method=="RNA":
+ x = "-v %s -M 1 --best --strata -p %s --norc --suppress 6,7,8" % (v_mis, pslots)
+ elif alignment_method=="unique":
+ x = "-v %s -m 1 -p %s --suppress 6,7,8" % (v_mis, pslots)
+ elif alignment_method=="multiple":
+ x = "-v %s -M 1 --best --strata -p %s --suppress 6,7,8" % (v_mis, pslots)
+ elif alignment_method=="k_option":
+ x = "-v %s -k 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots)
+ elif alignment_method=="n_option":
+ x = "-n %s -M 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots)
+ elif alignment_method=="a_option":
+ x = "-v %s -a --best -p %s --suppress 6,7,8" % (v_mis, pslots)
+ if aligned == "None" and unaligned == "None": fasta_command = ""
+ elif aligned != "None" and unaligned == "None": fasta_command= " --al %s" % aligned
+ elif aligned == "None" and unaligned != "None": fasta_command = " --un %s" % unaligned
+ else: fasta_command = " --al %s --un %s" % (aligned, unaligned)
+ x = x + fasta_command
+ if out_type == "tabular":
+ return "bowtie %s %s -f %s > %s" % (x, index, input, output)
+ elif out_type=="sam":
+ return "bowtie %s -S %s -f %s > %s" % (x, index, input, output)
+ elif out_type=="bam":
+ return "bowtie %s -S %s -f %s |samtools view -bS - > %s" % (x, index, input, output)
+
+def bowtie_squash(fasta):
+ tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes
+ ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
+ ref_file_name = ref_file.name
+ ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name
+ os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name
+ cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir)
+ try:
+ FNULL = open(os.devnull, 'w')
+ tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string
+ tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file
+ proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in dev/null
+ returncode = proc.wait()
+ tmp_stderr.close()
+ FNULL.close()
+ sys.stdout.write(cmd1 + "\n")
+ except Exception, e:
+ # clean up temp dir
+ if os.path.exists( tmp_index_dir ):
+ shutil.rmtree( tmp_index_dir )
+ stop_err( 'Error indexing reference sequence\n' + str( e ) )
+ # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment()
+ index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention
+ return tmp_index_dir, index_full_path
+
+def bowtie_alignment(command_line, flyPreIndexed=''):
+ # make temp directory just for stderr
+ tmp_index_dir = tempfile.mkdtemp()
+ tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
+ tmp_stderr = open( tmp, 'wb' )
+ # conditional statement for sorted bam generation viewable in Trackster
+ if "samtools" in command_line:
+ target_file = command_line.split()[-1] # recover the final output file name
+ path_to_unsortedBam = os.path.join(tmp_index_dir, "unsorted.bam")
+ path_to_sortedBam = os.path.join(tmp_index_dir, "unsorted.bam.sorted")
+ first_command_line = " ".join(command_line.split()[:-3]) + " -o " + path_to_unsortedBam + " - "
+ # example: bowtie -v 0 -M 1 --best --strata -p 12 --suppress 6,7,8 -S /home/galaxy/galaxy-dist/bowtie/Dmel/dmel-all-chromosome-r5.49 -f /home/galaxy/galaxy-dist/database/files/003/dataset_3460.dat |samtools view -bS -o /tmp/tmp_PgMT0/unsorted.bam -
+ second_command_line = "samtools sort %s %s" % (path_to_unsortedBam, path_to_sortedBam) # generates an "unsorted.bam.sorted.bam file", NOT an "unsorted.bam.sorted" file
+ p = subprocess.Popen(args=first_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno()) # fileno() method return the file descriptor number of tmp_stderr
+ returncode = p.wait()
+ sys.stdout.write("%s\n" % first_command_line + str(returncode))
+ p = subprocess.Popen(args=second_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno())
+ returncode = p.wait()
+ sys.stdout.write("\n%s\n" % second_command_line + str(returncode))
+ if os.path.isfile(path_to_sortedBam + ".bam"):
+ shutil.copy2(path_to_sortedBam + ".bam", target_file)
+ else:
+ p = subprocess.Popen(args=command_line, shell=True, stderr=tmp_stderr.fileno())
+ returncode = p.wait()
+ sys.stdout.write(command_line + "\n")
+ tmp_stderr.close()
+ ## cleaning if the index was created in the fly
+ if os.path.exists( flyPreIndexed ):
+ shutil.rmtree( flyPreIndexed )
+ # cleaning tmp files and directories
+ if os.path.exists( tmp_index_dir ):
+ shutil.rmtree( tmp_index_dir )
+ return
+
+def __main__():
+ F = open (sys.argv[7], "w")
+ if sys.argv[5] == "history":
+ tmp_dir, index_path = bowtie_squash(sys.argv[6])
+ else:
+ tmp_dir, index_path = "dummy/dummy", sys.argv[6]
+ command_line = bowtieCommandLiner(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[8], sys.argv[9], sys.argv[1], index_path, sys.argv[7], sys.argv[10])
+ bowtie_alignment(command_line, flyPreIndexed=tmp_dir)
+ F.close()
+if __name__=="__main__": __main__()
diff -r 000000000000 -r de6a6afc5a79 sRbowtie.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtie.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,210 @@
+
+ for FASTA small reads
+
+ bowtie
+
+
+ sRbowtie.py $input
+ $method
+ $v_mismatches
+ $output_type
+ $refGenomeSource.genomeSource
+ ## the very source of the index (indexed or fasta file)
+ #if $refGenomeSource.genomeSource == "history":
+ $refGenomeSource.ownFile
+ #else:
+ $refGenomeSource.index.fields.path
+ #end if
+ $output
+ $aligned
+ $unaligned
+ \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ additional_fasta == "al" or additional_fasta == "al_and_unal"
+
+
+ additional_fasta == "unal" or additional_fasta == "al_and_unal"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient. It is developed by Ben Langmead and Cole Trapnell. Please cite: Langmead B, Trapnell C, Pop M, Salzberg SL. Ultrafast and memory-efficient alignment of short DNA sequences to the human genome. Genome Biology 10:R25.
+
+.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml
+
+A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.
+
+However, this Bowtie wrapper tool only takes FASTQ files as inputs.
+
+The sRbowtie wrapper specifically works with short-read FASTA inputs (-v bowtie mode)
+
+------
+
+**OPTIONS**
+
+.. class:: infomark
+
+This script uses Bowtie to match reads on a reference index.
+
+Depending on the type of matching, different bowtie options are used:
+
+**Match on sense strand RNA reference index, multiple mappers randomly matched at a single position**
+
+Match on RNA reference, SENSE strand, randomly attributing multiple mapper to target with least mismatches, the polarity column is suppressed in the bowtie tabular report:
+
+*-v [0,1,2,3] -M 1 --best --strata -p 12 --norc --suppress 2,6,7,8*
+
+**Match unique mappers on DNA reference index**
+
+Match ONLY unique mappers on DNA reference index
+
+*-v [0,1,2,3] -m 1 -p 12 --suppress 6,7,8*
+
+Note that using this option with -v values other than 0 is questionable.
+
+**Match on DNA, multiple mappers randomly matched at a single position**
+
+Match multiple mappers, randomly attributing multiple mapper to target with least mismatches, number of mismatch allowed specified by -v option:
+
+*-v [0,1,2,3] -M 1 --best --strata -p 12 --suppress 6,7,8*
+
+**Match on DNA as fast as possible, without taking care of mapping issues (for raw annotation of reads)**
+
+Match with highest speed, not guaranteeing best hit for speed gain:
+
+*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*
+
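+For instance, with the "unique" option, one mismatch allowed and SAM output, the wrapper assembles a command of the form (paths are illustrative)::
+
+  bowtie -v 1 -m 1 -p 12 --suppress 6,7,8 -S /path/to/index -f reads.fa > output.sam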
+
+-----
+
+**Input formats**
+
+.. class:: warningmark
+
+*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter*
+
+-----
+
+**OUTPUTS**
+
+If you choose tabular as the output format, you will obtain the matched reads in standard bowtie output format, having the following columns::
+
+ Column Description
+ -------- --------------------------------------------------------
+  1 FastaID   fasta identifier
+  2 polarity  + or - depending on whether the match was reported on the forward or reverse strand
+  3 target    name of the matched target
+  4 Offset    0-based coordinate of the read on the matched target
+  5 Seq       sequence of the matched read
+
+If you choose SAM, you will get the output in unordered SAM format.
+
+.. class:: warningmark
+
+If you choose BAM, the output will be in sorted BAM format.
+To be viewable in Trackster, several conditions must be fulfilled:
+
+.. class:: infomark
+
+Reads must have been matched to a genome whose chromosome names are compatible with Trackster genome indexes
+
+.. class:: infomark
+
+the database/Build (dbkey) which is indicated for the dataset (Pencil - Database/Build field) must match a Trackster genome index.
+
+Please contact the Galaxy instance administrator if your genome is not referenced
+
+**Matched and unmatched fasta reads can be retrieved, for further analyses**
+
+
+
diff -r 000000000000 -r de6a6afc5a79 sRbowtieCascade.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieCascade.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation
+# version 0.9 13-6-2014
+# Usage sRbowtie_cascade.py see Parser() for valid arguments
+# Christophe Antoniewski
+
+import sys, os, subprocess, tempfile, shutil, argparse
+from collections import defaultdict
+
+def Parser():
+ the_parser = argparse.ArgumentParser()
+ the_parser.add_argument('--output', action="store", type=str, help="output file")
+ the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads")
+ the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed")
+ the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether the index should be generated or not by bowtie-build")
+ the_parser.add_argument('--index',nargs='+', help="paths to indexed or fasta references")
+ the_parser.add_argument('--indexName',nargs='+', help="Names of the indexes")
+ the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
+ the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
+ args = the_parser.parse_args()
+ return args
+
+def stop_err( msg ):
+ sys.stderr.write( '%s\n' % msg )
+ sys.exit()
+
+def bowtie_squash(fasta):
+ tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes
+ ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
+ ref_file_name = ref_file.name
+ ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name
+ os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name
+ cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir)
+ try:
+ FNULL = open(os.devnull, 'w')
+ tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string
+ tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file
+ proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in dev/null
+ returncode = proc.wait()
+ tmp_stderr.close()
+ FNULL.close()
+ sys.stdout.write(cmd1 + "\n")
+ except Exception, e:
+ # clean up temp dir
+ if os.path.exists( tmp_index_dir ):
+ shutil.rmtree( tmp_index_dir )
+ stop_err( 'Error indexing reference sequence\n' + str( e ) )
+ # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment()
+ index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention
+ return index_full_path
+
+def make_working_dir():
+ working_dir = tempfile.mkdtemp()
+ return working_dir
+
+def Clean_TempDir(directory):
+ if os.path.exists( directory ):
+ shutil.rmtree( directory )
+ return
+
+def bowtie_alignment(command_line="None", working_dir = ""):
+ FNULL = open(os.devnull, 'w')
+ p = subprocess.Popen(args=command_line, cwd=working_dir, shell=True, stderr=FNULL, stdout=FNULL)
+ returncode = p.wait()
+ sys.stdout.write("%s\n" % command_line)
+ FNULL.close()
+ #p = subprocess.Popen(["wc", "-l", "%s/al.fasta"%working_dir], cwd=working_dir, stdout=subprocess.PIPE)
+ #aligned = p.communicate()[0].split()[0]
+ aligned = 0
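+ # al.fasta holds two lines per read (header + sequence); hence the aligned/2 returned below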
+ F = open ("%s/al.fasta" % working_dir, "r")
+ for line in F:
+ aligned += 1
+ F.close()
+ sys.stdout.write("Aligned: %s\n" % aligned)
+ return aligned/2
+
+def CommandLiner (v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""):
+ return "bowtie -v %s -k 1 --best -p %s --al %s/al.fasta --un %s/unal.fasta --suppress 1,2,3,4,5,6,7,8 %s -f %s" % (v_mis, pslots, working_dir, working_dir, index, input)
+
+def __main__():
+ args = Parser()
+ ## first we make all indexes available. They can be already available or be squashed by bowtie-build
+ ## we keep them in a list that alternates indexPath and "toClear" or "DoNotDelete"
+ BowtieIndexList = []
+ for indexing_flags, bowtiePath in zip (args.indexing_flags, args.index):
+ if indexing_flags == "history":
+ BowtieIndexList.append ( bowtie_squash (bowtiePath) )
+ BowtieIndexList.append ( "toClear" )
+ else:
+ BowtieIndexList.append ( bowtiePath )
+ BowtieIndexList.append ( "DoNotDelete")
+ ###### temporary Indexes are generated. They must be deleted at the end (after removing file name in the temp path)
+ ResultDict = defaultdict(list)
+ for label, input in zip(args.label, args.input): ## the main cascade, iterating over samples and bowtie indexes
+ workingDir = make_working_dir()
+ cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[0], input=input, working_dir=workingDir)
+ ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) # first step of the cascade
+ if len(BowtieIndexList) > 2: # is there a second step to perform ?
+ os.rename("%s/al.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) ## end of first step. the aligned reads are the input of the next step
+ cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[2], input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
+ ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )## second step of the cascade
+ if len(BowtieIndexList) > 4: ## remaining steps
+ for BowtieIndexPath in BowtieIndexList[4::2]:
+ os.rename("%s/unal.fasta"%workingDir, "%s/toAlign.fasta"%workingDir)
+ cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexPath, input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
+ ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )
+ Fun = open("%s/unal.fasta"%workingDir, "r") ## to finish, compute the number of unmatched reads
+ n = 0
+ for line in Fun:
+ n += 1
+ ResultDict[label].append(n/2)
+ Fun.close()
+ Clean_TempDir (workingDir) # clean the sample working directory
+ ## cleaning
+ for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]):
+ if IndexFlag == "toClear":
+ Clean_TempDir ("/".join(IndexPath.split("/")[:-1]))
+ ## end of cleaning
+
+
+
+ F = open (args.output, "w")
+ print >> F, "alignment reference\t%s" % "\t".join(args.label)
+ for i, reference in enumerate(args.indexName):
+ F.write ("%s" % reference)
+ for sample in args.label:
+ F.write ("\t%s" % "{:,}".format(ResultDict[sample][i]) )
+ print >> F
+ F.write ("Remaining Unmatched")
+ for sample in args.label:
+ F.write ("\t%s" % "{:,}".format(ResultDict[sample][-1]) )
+ print >> F
+
+ F.close()
+
+if __name__=="__main__": __main__()
diff -r 000000000000 -r de6a6afc5a79 sRbowtieCascade.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieCascade.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,142 @@
+
+ Using iterative sRbowtie Alignments
+
+ bowtie
+
+
+ sRbowtieCascade.py --output $output
+ --num-threads \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie
+ --mismatch $mismatches
+ --input
+ #for $i in $input:
+ $i
+ #end for
+ --label
+ #for $i in $input:
+ "$i.name"
+ #end for
+ --index
+ #if $refGenomeSource1.genomeSource == "history":
+ $refGenomeSource1.ownFile
+ #else:
+ $refGenomeSource1.index.fields.path
+ #end if
+ #for $i in $AdditionalQueries:
+ #if $i.refGenomeSource.genomeSource == "history":
+ $i.refGenomeSource.ownFile
+ #else:
+ $i.refGenomeSource.index.fields.path
+ #end if
+ #end for
+ --indexing-flags
+ $refGenomeSource1.genomeSource
+ #for $i in $AdditionalQueries:
+ $i.refGenomeSource.genomeSource
+ #end for
+ --indexName
+ #if $refGenomeSource1.genomeSource == "history":
+ "$refGenomeSource1.ownFile.name"
+ #else:
+ "$refGenomeSource1.index.fields.name"
+ #end if
+ #for $i in $AdditionalQueries:
+ #if $i.refGenomeSource.genomeSource == "history":
+ "$i.refGenomeSource.ownFile.name"
+ #else:
+ "$i.refGenomeSource.index.fields.name"
+ #end if
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Intro**
+
+Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient.
+A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.
+However, this Bowtie wrapper tool only takes FASTQ files as inputs.
+
+Here the sRbowtie wrapper specifically works with short-read FASTA inputs (-v bowtie mode, with -k 1)
+
+.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml
+
+
+------
+
+**What it does**
+
+.. class:: infomark
+
+This script uses the sRbowtie wrapper to iteratively match reads against a series of reference indexes.
+
+Reads are matched on DNA references as fast as possible, without regard to multiple-mapping issues:
+
+*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*
+
+Unaligned reads at step N are used as input for sRbowtie at step N+1.
+
+-----
+
+**Input formats**
+
+.. class:: warningmark
+
+*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter*
+
+-----
+
+**OUTPUTS**
+
+**Annotation table**
+
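+A sketch with made-up numbers::
+
+  alignment reference   Sample1    Sample2
+  genome                1,234,567  2,345,678
+  miRNAs                12,345     23,456
+  Remaining Unmatched   11,111     22,222
+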
+
+
diff -r 000000000000 -r de6a6afc5a79 sRbowtieParser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieParser.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+# python parser module to analyse sRbowtie alignments
+# version 0.9
+# Usage sRbowtieParser.py <1:index source> <2:extraction directive> <3:output> <4:polarity> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib>
+
+import sys
+from smRtools import *
+
+IndexSource = sys.argv[1]
+ExtractionDirective = sys.argv[2]
+if ExtractionDirective == "--do_not_extract_index":
+ genomeRefFormat = "fastaSource"
+elif ExtractionDirective == "--extract_index":
+ genomeRefFormat = "bowtieIndex"
+Output = sys.argv[3]
+Polarity = sys.argv[4] # may be "both", "forward" or "reverse"
+Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)]
+MasterListOfGenomes = {}
+
+for [filePath, FileExt, FileLabel] in Triplets:
+ MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat)
+
+header = ["gene"]
+for [filePath, FileExt, FileLabel] in Triplets:
+ header.append(FileLabel)
+
+F = open (sys.argv[3], "w")
+print >> F, "\t".join(header)
+for item in sorted (MasterListOfGenomes[header[1]].instanceDict.keys() ):
+ line=[item]
+ for sample in header[1:]:
+ count = str (MasterListOfGenomes[sample].instanceDict[item].readcount(polarity=Polarity))
+ line.append(count)
+ print >> F, "\t".join(line )
+F.close()
diff -r 000000000000 -r de6a6afc5a79 sRbowtieParser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sRbowtieParser.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,67 @@
+
+
+ bowtie-inspect
+
+
+ sRbowtieParser.py
+ #if $refGenomeSource.genomeSource == "history":
+ $refGenomeSource.ownFile ## index source
+ --do_not_extract_index
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ $reference ## index source
+ --extract_index
+ #end if
+ $output
+ $polarity
+ #for $i in $refGenomeSource.input_list
+ $i $i.ext "$i.name"
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Parses read counts from one or several sRbowtie alignments (in tabular, SAM or BAM format).
+
+Here, a bowtie match against an index built from a set of items is parsed and expressed as a hit list of the corresponding items.
+
+Sense, antisense, or both sense and antisense alignments can be counted.
+
+The library labels are inferred from the input dataset names in the Galaxy history.
+
+**It is thus essential that input datasets are appropriately renamed**
+
+**It is preferable that you do not put any spaces in these input dataset names. You may edit these names in the history**
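+
+For illustration, with two libraries the output is a tab-separated table like the following (hypothetical values)::
+
+  gene      sample_1  sample_2
+  item_A    120       98
+  item_B    3         0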
+
+
+
+
+
+
diff -r 000000000000 -r de6a6afc5a79 size_histogram.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/size_histogram.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,126 @@
+#!/usr/bin/python
+# python parser module for size distributions, guided by GFF3
+# version 0.9.1 (1-6-2014)
+# Usage (argparse options): size_histogram.py --input <file [file ...]> --ext <ext [ext ...]> --label <label [label ...]>
+# --normalization_factor <float [float ...]> --output_size_distribution <dataframe> --minquery <int> --maxquery <int>
+# --rcode <R script> [--gff <GFF3 file>] [--global_size] [--collapse] (--reference_fasta <fasta> | --reference_bowtie_index <index>)
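+# Example invocation (hypothetical paths):
+# size_histogram.py --reference_bowtie_index /data/indexes/dme --input aln_1.bam aln_2.bam --ext bam bam \
+#   --label sample_1 sample_2 --normalization_factor 1.0 1.0 --output_size_distribution size.tab \
+#   --minquery 18 --maxquery 30 --rcode plot.R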
+
+import sys, subprocess, argparse
+from smRtools import *
+from collections import OrderedDict, defaultdict
+import os
+
+def Parser():
+ the_parser = argparse.ArgumentParser()
+ the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
+ the_parser.add_argument('--reference_fasta', action="store", type=str, help="reference fasta file")
+ the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
+ the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
+ the_parser.add_argument('--ext',nargs='+', help="input file type")
+ the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
+ the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
+ the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
+ the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
+ the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
+ the_parser.add_argument('--rcode', type=str, help="R script")
+ the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calculated for the sum of all items")
+ the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed")
+ args = the_parser.parse_args()
+ return args
+
+args=Parser()
+if args.reference_fasta:
+ genomeRefFormat = "fastaSource"
+ genomeRefFile = args.reference_fasta
+if args.reference_bowtie_index:
+ genomeRefFormat = "bowtieIndex"
+ genomeRefFile = args.reference_bowtie_index
+size_distribution_file=args.output_size_distribution
+minquery=args.minquery
+maxquery=args.maxquery
+Rcode = args.rcode
+filePath=args.input
+fileExt=args.ext
+fileLabel=args.label
+normalization_factor=args.normalization_factor
+global_size=args.global_size
+collapse=args.collapse
+
+if collapse:
+ pol=["both"]
+else:
+ pol=["F", "R"]
+
+MasterListOfGenomes = OrderedDict()
+
+def process_samples(filePaths):
+ for i, path in enumerate(filePaths): # renamed loop variable to avoid shadowing the argument list
+  norm=normalization_factor[i]
+  print fileLabel[i]
+  MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=path, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
+                        biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
+ return MasterListOfGenomes
+
+def write_size_distribution_dataframe(readDict, size_distribution_file):
+ with open(size_distribution_file, 'w') as size_distrib:
+ print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
+ for sample in readDict.keys():
+ if args.gff:
+    gene_dict=readDict[sample] # renamed to avoid shadowing the dict builtin
+   else:
+    gene_dict=readDict[sample].instanceDict
+   for gene in gene_dict.keys():
+    histogram = gene_dict[gene].size_histogram()
+ for polarity in histogram.keys():
+ for item in histogram[polarity].iteritems():
+ print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
+
+def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]):
+ with open(size_distribution_file, 'w') as size_distrib:
+ print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
+ for sample in readDict.keys():
+ histogram = readDict[sample].size_histogram()
+ gene="sample"
+ for polarity in pol:
+ for item in histogram[polarity].iteritems():
+ if polarity=="R":
+ print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], -item[1], sample)
+ else:
+ print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
+
+def gff_item_subinstances(readDict, gff3):
+ GFFinstanceDict=OrderedDict()
+ with open(gff3) as gff:
+ for line in gff:
+ if line[0] == "#": continue
+ gff_fields = line[:-1].split("\t")
+ chrom = gff_fields[0]
+ gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
+ item_upstream_coordinate = int(gff_fields[3])
+ item_downstream_coordinate = int(gff_fields[4])
+ item_polarity = gff_fields[6]
+ for sample in readDict.keys():
+ if not GFFinstanceDict.has_key(sample):
+ GFFinstanceDict[sample]={}
+ subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
+ if item_polarity == '-':
+ subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
+# subinstance.readDict.setdefault(key, [])
+ subinstance.gene=gff_name
+ GFFinstanceDict[sample][gff_name]=subinstance
+ return GFFinstanceDict
+
+MasterListOfGenomes=process_samples(filePath)
+
+if args.gff:
+ MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
+
+if global_size:
+ write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol)
+else:
+ write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
+
+R_command="Rscript "+ Rcode
+process = subprocess.Popen(R_command.split())
+process.wait()
+
diff -r 000000000000 -r de6a6afc5a79 size_histogram.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/size_histogram.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,200 @@
+
+ from sRbowtie alignment
+ bowtie-inspect
+
+
+ size_histogram.py
+ #if $refGenomeSource.genomeSource == "history":
+ --reference_fasta ## sys.argv[2]
+ $refGenomeSource.ownFile ## index source
+ #else:
+ #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
+ --reference_bowtie_index
+ $reference
+ #end if
+ --rcode
+ $plotCode
+ --output_size_distribution
+ $size_distribution_dataframe
+ --minquery
+ $minquery
+ --maxquery
+ $maxquery
+ --input
+ #for $i in $refGenomeSource.series
+ $i.input
+ #end for
+ --ext
+ #for $i in $refGenomeSource.series
+ $i.input.ext
+ #end for
+ --label
+ #for $i in $refGenomeSource.series
+ "$i.input.name"
+ #end for
+ --normalization_factor
+ #for $i in $refGenomeSource.series
+ $i.norm
+ #end for
+ #if $gff:
+ --gff
+ $gff
+ #end if
+ #if $global.value == 'yes':
+ --global_size
+ #end if
+ #if $collapsestrands.value == 'yes':
+ --collapse
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Setup R error handling to go to stderr
+ options( show.error.messages=F,
+ error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+ library(RColorBrewer)
+ library(lattice)
+ library(latticeExtra)
+ library(grid)
+ library(gridExtra)
+ ## cheetah template data frame implementation
+
+ size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL)
+
+ n_samples=length(unique(size\$sample))
+ genes=unique(levels(size\$gene))
+ n_genes=length(genes)
+
+ par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3, strip = .75), fontsize = list(text=96/${rows_per_page}, points=8))
+ smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);}
+
+ plot_size_distribution= function(df, ...) {
+ bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0,
+ horizontal=FALSE,
+ group=polarity,
+ stack=TRUE,
+ col=c('red', 'blue'),
+ strip = strip.custom(par.strip.text = list(cex = 0.5)),
+ cex=0.75,
+ scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75),
+ xlab = "readsize in nucleotides",
+ ylab = "${ylabel}",
+ main="${title}" ,
+ as.table=TRUE, newpage = T, ...)
+ combineLimits(update(useOuterStrips(bc), layout=c(n_samples,${rows_per_page})), margin.x=F, margin.y=1)
+ }
+
+ per_gene_size=lapply(genes, function(x) subset(size, gene==x))
+
+ global = "no"
+ #if $global.value == 'yes':
+ global = "yes"
+ #end if
+
+ if (global=="no") {
+ options(warn=-1)
+ pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677*n_samples/4)
+ plot_size_distribution(size, par.settings=par.settings.size, prepanel=smR.prepanel)
+ } else {
+ pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677)
+ bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample)), data = size, origin = 0,
+ horizontal=FALSE,
+ group=polarity,
+ stack=TRUE,
+ col=c('red', 'blue'),
+ cex=0.75,
+ par.settings=list(fontsize = list(text=8, points=8)),
+ scales=list(y=list(tick.number=4, rot=90, relation="same"), cex=0.75),
+ xlab = "readsize in nucleotides",
+ ylab = "${ylabel}",
+ main="${title}" , as.table=TRUE, newpage = T,
+ aspect=0.5)
+ #layout=c(n_samples, ${rows_per_page}))
+ bc
+ }
+ devname=dev.off()
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces histograms of read sizes.
+By default, a histogram of read sizes is drawn for each "chromosome" of the reference.
+Reads that map in sense are plotted on top (red); reads that map antisense are plotted on the bottom (blue).
+
+
+.. class:: warningmark
+
+'''TIP''' The input data can be produced using the sRbowtie tool.
+
+----
+
+'''Example'''
+
+For a SAM input such as the following::
+
+ 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0
+
+ 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0
+
+ 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0
+
+the tool produces a plot like the following:
+
+----
+
+.. image:: static/images/size_histogram.png
+ :height: 800
+ :width: 500
+
+
+
diff -r 000000000000 -r de6a6afc5a79 smRtools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/smRtools.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,704 @@
+#!/usr/bin/python
+# version 1 7-5-2012 unification of the SmRNAwindow class
+
+import sys, subprocess, random
+from collections import defaultdict
+from numpy import mean, median, std
+from scipy import stats
+
+def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
+ '''This function will return a dictionary containing fasta identifiers as keys and the
+ sequence as values. Index must be the path to a fasta file.'''
+ p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
+ outputlines = p.stdout.readlines()
+ p.wait()
+ item_dic = {}
+ for line in outputlines:
+ if (line[0] == ">"):
+ try:
+ item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
+ except: pass
+ current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers !
+ item_dic[current_item] = ""
+ stringlist=[]
+ else:
+ stringlist.append(line.rstrip() )
+ item_dic[current_item] = "".join(stringlist) # for the last item
+ return item_dic
+
+def get_fasta_headers (index):
+ p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
+ outputlines = p.stdout.readlines()
+ p.wait()
+ item_dic = {}
+ for line in outputlines:
+ header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers !
+ item_dic[header] = 1
+ return item_dic
+
+
+def get_file_sample (file, numberoflines):
+ '''return a random sample of numberoflines lines drawn from file'''
+ F=open(file)
+ fullfile = F.read().splitlines()
+ F.close()
+ if len(fullfile) < numberoflines:
+ return "sample size exceeds file size"
+ return random.sample(fullfile, numberoflines)
+
+def get_fasta_from_history (file):
+ F = open (file, "r")
+ item_dic = {}
+ for line in F:
+ if (line[0] == ">"):
+ try:
+ item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
+ except: pass
+ current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers !
+ item_dic[current_item] = ""
+ stringlist=[]
+ else:
+ stringlist.append(line[:-1])
+ item_dic[current_item] = "".join(stringlist) # for the last item
+ return item_dic
+
+def antipara (sequence):
+ antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
+ revseq = sequence[::-1]
+ return "".join([antidict[i] for i in revseq])
+
+def RNAtranslate (sequence):
+ return "".join([i if i in "AGCN" else "U" for i in sequence])
+def DNAtranslate (sequence):
+ return "".join([i if i in "AGCN" else "T" for i in sequence])
+
+def RNAfold (sequence_list):
+ thestring= "\n".join(sequence_list)
+ p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ output=p.communicate(thestring)[0]
+ p.wait()
+ output=output.split("\n")
+ if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
+ buffer=[]
+ for line in output:
+ if line[0] in ["N","A","T","U","G","C"]:
+ buffer.append(DNAtranslate(line))
+ if line[0] in ["(",".",")"]:
+ fields=line.split("(")
+ energy= fields[-1]
+ energy = energy[:-1] # remove the ) parenthesis
+ energy=float(energy)
+ buffer.append(str(energy))
+ return dict(zip(buffer[::2], buffer[1::2]))
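+# Example (requires the RNAfold binary on the PATH; the energy value is illustrative):
+# RNAfold(["GTATTGTAAGTGGCAGAGTGGC"]) -> {"GTATTGTAAGTGGCAGAGTGGC": "-1.2"}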
+
+def extractsubinstance (start, end, instance):
+ ''' Testing whether this can be a function external to the class, to save memory'''
+ subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
+ subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
+ upcoordinate = [i for i in range(start,end+1) if instance.readDict.has_key(i) ]
+ downcoordinate = [-i for i in range(start,end+1) if instance.readDict.has_key(-i) ]
+ for i in upcoordinate:
+ subinstance.readDict[i]=instance.readDict[i]
+ for i in downcoordinate:
+ subinstance.readDict[i]=instance.readDict[i]
+ return subinstance
+
+class HandleSmRNAwindows:
+ def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
+ self.biosample = biosample
+ self.alignmentFile = alignmentFile
+ self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
+ self.genomeRefFile = genomeRefFile
+ self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
+ self.alignedReads = 0
+ self.instanceDict = {}
+ self.size_inf=size_inf
+ self.size_sup=size_sup
+ self.norm=norm
+ if genomeRefFormat == "bowtieIndex":
+ self.itemDict = get_fasta (genomeRefFile)
+ elif genomeRefFormat == "fastaSource":
+ self.itemDict = get_fasta_from_history (genomeRefFile)
+ for item in self.itemDict:
+ self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create as many instances as there is items
+ self.readfile()
+
+ def readfile (self) :
+ if self.alignmentFileFormat == "tabular":
+ F = open (self.alignmentFile, "r")
+ for line in F:
+ fields = line.split()
+ polarity = fields[1]
+ gene = fields[2]
+ offset = int(fields[3])
+ size = len (fields[4])
+ if self.size_inf:
+ if (size>=self.size_inf and size<= self.size_sup):
+ self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
+ self.alignedReads += 1
+ else:
+ self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
+ self.alignedReads += 1
+ F.close()
+ return self.instanceDict
+# elif self.alignmentFileFormat == "sam":
+# F = open (self.alignmentFile, "r")
+# dict = {"0":"+", "16":"-"}
+# for line in F:
+# if line[0]=='@':
+# continue
+# fields = line.split()
+# if fields[2] == "*": continue
+# polarity = dict[fields[1]]
+# gene = fields[2]
+# offset = int(fields[3])
+# size = len (fields[9])
+# if self.size_inf:
+# if (size>=self.size_inf and size<= self.size_sup):
+# self.instanceDict[gene].addread (polarity, offset, size)
+# self.alignedReads += 1
+# else:
+# self.instanceDict[gene].addread (polarity, offset, size)
+# self.alignedReads += 1
+# F.close()
+ elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
+ import pysam
+ samfile = pysam.Samfile(self.alignmentFile)
+ for read in samfile:
+ if read.tid == -1:
+ continue # filter out unaligned reads
+ if read.is_reverse:
+ polarity="-"
+ else:
+ polarity="+"
+ gene = samfile.getrname(read.tid)
+ offset = read.pos
+ size = read.qlen
+ if self.size_inf:
+ if (size>=self.size_inf and size<= self.size_sup):
+ self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
+ self.alignedReads += 1
+ else:
+ self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
+ self.alignedReads += 1
+ return self.instanceDict
+
+ def size_histogram (self):
+ size_dict={}
+ size_dict['F']= defaultdict (int)
+ size_dict['R']= defaultdict (int)
+ size_dict['both'] = defaultdict (int)
+    for item in self.instanceDict:
+      buffer_dict = self.instanceDict[item].size_histogram() # compute once per item
+      for size in buffer_dict['F']:
+        size_dict['F'][size] += buffer_dict['F'][size]
+      for size in buffer_dict['R']:
+        size_dict['R'][size] -= buffer_dict['R'][size] # per-window R counts are negative; subtracting makes them positive here
+ allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
+ for size in allSizeKeys:
+ size_dict['both'][size] = size_dict['F'][size] + size_dict['R'][size]
+ return size_dict
+
+ def CountFeatures (self, GFF3="path/to/file"):
+ featureDict = defaultdict(int)
+ F = open (GFF3, "r")
+ for line in F:
+ if line[0] == "#": continue
+ fields = line[:-1].split()
+ chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
+ featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
+ F.close()
+ return featureDict
+
+class SmRNAwindow:
+
+ def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
+ self.biosample = biosample
+ self.sequence = sequence
+ self.gene = gene
+ self.windowoffset = windowoffset
+ self.size = len(sequence)
+ self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...}
+ self.matchedreadsUp = 0
+ self.matchedreadsDown = 0
+ self.norm=norm
+
+ def addread (self, polarity, offset, size):
+    '''ATTENTION: the conversion from 0-based to 1-based offsets was removed here, as it is now done during read parsing.'''
+ if polarity == "+":
+ self.readDict[offset].append(size)
+ self.matchedreadsUp += 1
+ else:
+ self.readDict[-(offset + size -1)].append(size)
+ self.matchedreadsDown += 1
+ return
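+    # readDict convention: a 21-nt forward read at bowtie offset 100 is stored as
+    # self.readDict[100] == [21, ...]; the same read on the reverse strand is keyed by
+    # the negated coordinate of its 5' end, i.e. self.readDict[-(100+21-1)] == [21, ...]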
+
+  def barycenter (self, upstream_coord=None, downstream_coord=None):
+    '''refactored 24-12-2013 to save memory and introduce offset filtering; see the readcount method for further discussion.
+    In this version, the dictionary structure is replaced by a list of tuples to save memory'''
+    upstream_coord = upstream_coord or self.windowoffset
+    downstream_coord = downstream_coord or self.windowoffset+self.size-1
+    window_size = downstream_coord - upstream_coord +1
+    def weightAverage (tupleList):
+      weightSum = 0
+      PonderWeightSum = 0
+      for couple in tupleList:
+        PonderWeightSum += couple[0] * couple[1]
+        weightSum += couple[1]
+      if weightSum > 0:
+        return PonderWeightSum / float(weightSum)
+      else:
+        return 0
+    forwardTupleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # forward reads within the offset window
+    reverseTupleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # reverse reads within the offset window
+    Fbarycenter = (weightAverage (forwardTupleList) - upstream_coord) / window_size
+    Rbarycenter = (weightAverage (reverseTupleList) - upstream_coord) / window_size
+    return Fbarycenter, Rbarycenter
+
+ def correlation_mapper (self, reference, window_size):
+ '''to map correlation with a sliding window 26-2-2013'''
+ if window_size > self.size:
+ return []
+ F=open(reference, "r")
+ reference_forward = []
+ reference_reverse = []
+ for line in F:
+ fields=line.split()
+ reference_forward.append(int(float(fields[1])))
+ reference_reverse.append(int(float(fields[2])))
+ F.close()
+ local_object_forward=[]
+ local_object_reverse=[]
+ ## Dict to list for the local object
+ for i in range(1, self.size+1):
+ local_object_forward.append(len(self.readDict[i]))
+ local_object_reverse.append(len(self.readDict[-i]))
+ ## start compiling results by slides
+ results=[]
+ for coordinate in range(self.size - window_size):
+ local_forward=local_object_forward[coordinate:coordinate + window_size]
+ local_reverse=local_object_reverse[coordinate:coordinate + window_size]
+ if sum(local_forward) == 0 or sum(local_reverse) == 0:
+ continue
+ try:
+ reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
+ reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
+ if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
+ results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
+ except:
+ pass
+ return results
+
+ def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
+    '''refactored 24-12-2013 to save memory and introduce offset filtering.
+    Note that the default parameters cannot be defined relative to the instance, as they are evaluated before instantiation;
+    the trick is to pass None and then test.
+    The polarity parameter can take "both", "forward" or "reverse" as value'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
+ return self.matchedreadsUp + self.matchedreadsDown
+ if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
+ return self.matchedreadsUp
+ if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
+ return self.matchedreadsDown
+ n=0
+ if polarity == "both":
+ for offset in xrange(upstream_coord, downstream_coord+1):
+ if self.readDict.has_key(offset):
+ for read in self.readDict[offset]:
+ if (read>=size_inf and read<= size_sup):
+ n += 1
+ if method != "conservative":
+              del self.readDict[offset] ## Careful! This precludes re-use of the self.readDict dictionary
+ if self.readDict.has_key(-offset):
+ for read in self.readDict[-offset]:
+ if (read>=size_inf and read<= size_sup):
+ n += 1
+ if method != "conservative":
+ del self.readDict[-offset]
+ return n
+ elif polarity == "forward":
+ for offset in xrange(upstream_coord, downstream_coord+1):
+ if self.readDict.has_key(offset):
+ for read in self.readDict[offset]:
+ if (read>=size_inf and read<= size_sup):
+ n += 1
+ return n
+ elif polarity == "reverse":
+ for offset in xrange(upstream_coord, downstream_coord+1):
+ if self.readDict.has_key(-offset):
+ for read in self.readDict[-offset]:
+ if (read>=size_inf and read<= size_sup):
+ n += 1
+ return n
+
+ def readsizes (self):
+ '''return a dictionary of number of reads by size (the keys)'''
+ dicsize = {}
+ for offset in self.readDict:
+ for size in self.readDict[offset]:
+ dicsize[size] = dicsize.get(size, 0) + 1
+    for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
+      dicsize[size] = dicsize.get(size, 0) # to fill missing sizes with null values
+ return dicsize
+
+ def size_histogram(self):
+ norm=self.norm
+ hist_dict={}
+ hist_dict['F']={}
+ hist_dict['R']={}
+ for offset in self.readDict:
+ for size in self.readDict[offset]:
+ if offset < 0:
+ hist_dict['R'][size] = hist_dict['R'].get(size, 0) - 1*norm
+ else:
+ hist_dict['F'][size] = hist_dict['F'].get(size, 0) + 1*norm
+ return hist_dict
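+    # Example return value (norm=1.0): {'F': {21: 34, 22: 12}, 'R': {21: -8}} --
+    # reverse-strand counts are stored as negative values so they plot below the axis.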
+
+ def statsizes (self, upstream_coord=None, downstream_coord=None):
+ ''' migration to memory saving by specifying possible subcoordinates
+ see the readcount method for further discussion'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ L = []
+ for offset in self.readDict:
+ if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
+ for size in self.readDict[offset]:
+ L.append(size)
+ meansize = mean(L)
+ stdv = std(L)
+ mediansize = median(L)
+ return meansize, mediansize, stdv
+
+ def foldEnergy (self, upstream_coord=None, downstream_coord=None):
+ ''' migration to memory saving by specifying possible subcoordinates
+ see the readcount method for further discussion'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
+ return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])
+
+ def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
+    ''' migration to memory saving by specifying possible subcoordinates;
+    see the readcount method for further discussion. size_scope must be an iterable'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
+ convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
+ for offset in self.readDict:
+ if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
+ for size in self.readDict[offset]:
+ if size in size_scope:
+ startbase = self.sequence[abs(offset)-self.windowoffset]
+ if offset < 0:
+ startbase = convertDic[startbase]
+ freqDic[startbase] += 1
+ base_sum = float ( sum( freqDic.values()) )
+ if base_sum == 0:
+ return "."
+ else:
+ return freqDic["T"] / base_sum * 100
+
+ def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
+    ''' migration to memory saving by specifying possible subcoordinates;
+    see the readcount method for further discussion. size_scope must be an iterable.
+    This method is similar to the Ufreq method but takes strandedness into account'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
+ convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
+ for offset in self.readDict:
+ if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
+ for size in self.readDict[offset]:
+ if size in size_scope:
+ startbase = self.sequence[abs(offset)-self.windowoffset]
+ if offset < 0:
+ startbase = convertDic[startbase]
+ freqDic[startbase+"rev"] += 1
+ else:
+ freqDic[startbase+"for"] += 1
+ forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
+ reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
+ if forward_sum == 0 and reverse_sum == 0:
+ return ". | ."
+ elif reverse_sum == 0:
+ return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
+ elif forward_sum == 0:
+ return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
+ else:
+ return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)
+
+
+ def readplot (self):
+ norm=self.norm
+ readmap = {}
+ for offset in self.readDict.keys():
+ readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm )
+ mylist = []
+ for offset in sorted(readmap):
+ if readmap[offset][1] != 0:
+ mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
+ if readmap[offset][0] != 0:
+ mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
+ return mylist
+
+ def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
+    '''Used by the MirParser tool'''
+ upstream_coord = upstream_coord or 1
+ downstream_coord = downstream_coord or self.size
+ windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
+ forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)])
+ totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
+ totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
+ if totalforward > totalreverse:
+ majorcoverage = "forward"
+ for offset in self.readDict.keys():
+ if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
+ for read in self.readDict[offset]:
+ for i in xrange(read):
+ try:
+ forORrev_coverage[offset-upstream_coord+1+i] += 1
+ except KeyError:
+ continue # a sense read may span over the downstream limit
+ else:
+ majorcoverage = "reverse"
+ for offset in self.readDict.keys():
+ if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
+ for read in self.readDict[offset]:
+ for i in xrange(read):
+ try:
+ forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
+ except KeyError:
+ continue # an antisense read may span over the upstream limit
+ output_list = []
+ maximum = max (forORrev_coverage.values()) or 1
+ for n in sorted (forORrev_coverage):
+ output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
+ return "\n".join(output_list)
+
+
+ def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
+ ''' migration to memory saving by specifying possible subcoordinates
+ see the readcount method for further discussion
+    scope must be a python iterable; scope defines the *relative* offset range to be computed'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ query_range = range (minquery, maxquery+1)
+ target_range = range (mintarget, maxtarget+1)
+ Query_table = {}
+ Target_table = {}
+ frequency_table = dict ([(i, 0) for i in scope])
+ for offset in self.readDict:
+ if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
+ for size in self.readDict[offset]:
+ if size in query_range:
+ Query_table[offset] = Query_table.get(offset, 0) + 1
+ if size in target_range:
+ Target_table[offset] = Target_table.get(offset, 0) + 1
+ for offset in Query_table:
+ for i in scope:
+ frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
+ if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
+ frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
+ if zscore == "yes":
+ z_mean = mean(frequency_table.values() )
+ z_std = std(frequency_table.values() )
+ if z_std == 0:
+ frequency_table = dict([(i,0) for i in frequency_table] )
+ else:
+ frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
+ return frequency_table
+
+ def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
+ ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion
+ note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed'''
+ upstream_coord = upstream_coord or self.windowoffset
+ downstream_coord = downstream_coord or self.windowoffset+self.size-1
+ query_range = range (minquery, maxquery+1)
+ target_range = range (mintarget, maxtarget+1)
+ Query_table = {}
+ Target_table = {}
+ Total_Query_Numb = 0
+ general_frequency_table = dict ([(i,0) for i in scope])
+ ## filtering the appropriate reads for the study
+ for offset in self.readDict:
+ if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
+ for size in self.readDict[offset]:
+ if size in query_range:
+ Query_table[offset] = Query_table.get(offset, 0) + 1
+ Total_Query_Numb += 1
+ if size in target_range:
+ Target_table[offset] = Target_table.get(offset, 0) + 1
+ for offset in Query_table:
+ frequency_table = dict ([(i,0) for i in scope])
+ number_of_targets = 0
+ for i in scope:
+ frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
+ number_of_targets += Target_table.get(-offset -i +1, 0)
+ for i in scope:
+ try:
+ general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
+ except ZeroDivisionError :
+ continue
+ return general_frequency_table
+
+ def phasing (self, size_range, scope):
+    ''' to calculate an autocorrelation-like signal - scope must be a python iterable'''
+ read_table = {}
+ total_read_number = 0
+ general_frequency_table = dict ([(i, 0) for i in scope])
+ ## read input filtering
+ for offset in self.readDict:
+ for size in self.readDict[offset]:
+ if size in size_range:
+ read_table[offset] = read_table.get(offset, 0) + 1
+ total_read_number += 1
+ ## per offset read phasing computing
+ for offset in read_table:
+ frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
+ number_of_targets = 0
+ for i in scope:
+ if offset > 0:
+ frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
+ number_of_targets += read_table.get(offset + i, 0)
+ else:
+ frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
+ number_of_targets += read_table.get(offset - i, 0)
+ ## inclusion of local frequency table in the general frequency table (all offsets average)
+ for i in scope:
+ try:
+ general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
+ except ZeroDivisionError :
+ continue
+ return general_frequency_table
+
+
+
+ def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
+ '''Must do: from numpy import mean, std, to use this method; scope must be a python iterable and defines the relative offsets to compute'''
+ frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
+ z_table = {}
+ frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
+ if std(frequency_list):
+ meanlist = mean(frequency_list)
+ stdlist = std(frequency_list)
+ z_list = [(i-meanlist)/stdlist for i in frequency_list]
+ return dict (zip (sorted(frequency_table), z_list) )
+ else:
+ return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )
+
+ def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
+ frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
+ total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) )
+ if total == 0:
+ return dict( [(i,0) for i in scope])
+ return dict( [(i, frequency_table[i]/total*100) for i in scope])
+
+ def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
+ queryhash = defaultdict(list)
+ targethash = defaultdict(list)
+ query_range = range (int(minquery), int(maxquery)+1)
+ target_range = range (int(mintarget), int(maxtarget)+1)
+ paired_sequences = []
+ for offset in self.readDict: # selection of data
+ for size in self.readDict[offset]:
+ if size in query_range:
+ queryhash[offset].append(size)
+ if size in target_range:
+ targethash[offset].append(size)
+    for offset in queryhash:
+      matched_offset = -offset - overlap + 1 # identical arithmetic for both polarities
+ if targethash[matched_offset]:
+ paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
+ if offset >= 0:
+ for i in range (paired):
+ paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
+ paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
+ if offset < 0:
+ for i in range (paired):
+ paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
+ paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
+ return paired_sequences
+
+ def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
+ queryhash = defaultdict(list)
+ targethash = defaultdict(list)
+ query_range = range (int(minquery), int(maxquery)+1)
+ target_range = range (int(mintarget), int(maxtarget)+1)
+ paired_sequences = []
+
+ for offset in self.readDict: # selection of data
+ for size in self.readDict[offset]:
+ if size in query_range:
+ queryhash[offset].append(size)
+ if size in target_range:
+ targethash[offset].append(size)
+
+ for offset in queryhash:
+ matched_offset = -offset - overlap + 1
+ if targethash[matched_offset]:
+ if offset >= 0:
+ for i in queryhash[offset]:
+ paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
+ for i in targethash[matched_offset]:
+ paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
+ if offset < 0:
+ for i in queryhash[offset]:
+ paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
+ for i in targethash[matched_offset]:
+ paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
+ return paired_sequences
+
+ def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
+    ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie uses 1-based coordinates whereas python strings use 0-based coordinates)'''
+ queryhash = defaultdict(list)
+ targethash = defaultdict(list)
+ query_range = range (int(minquery), int(maxquery)+1)
+ target_range = range (int(mintarget), int(maxtarget)+1)
+ bowtie_output = []
+
+ for offset in self.readDict: # selection of data
+ for size in self.readDict[offset]:
+ if size in query_range:
+ queryhash[offset].append(size)
+ if size in target_range:
+ targethash[offset].append(size)
+ counter = 0
+ for offset in queryhash:
+ matched_offset = -offset - overlap + 1
+ if targethash[matched_offset]:
+ if offset >= 0:
+ for i in queryhash[offset]:
+ counter += 1
+ bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # attention a la base 1-0 de l'offset
+ if offset < 0:
+ for i in queryhash[offset]:
+ counter += 1
+ bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # attention a la base 1-0 de l'offset
+ return bowtie_output
+
+
+def __main__(bowtie_index_path, bowtie_output_path):
+ sequenceDic = get_fasta (bowtie_index_path)
+ objDic = {}
+ F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
+ for line in F:
+ fields = line.split()
+ polarity = fields[1]
+ gene = fields[2]
+ offset = int(fields[3])
+ size = len (fields[4])
+ try:
+ objDic[gene].addread (polarity, offset, size)
+ except KeyError:
+ objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
+ objDic[gene].addread (polarity, offset, size)
+ F.close()
+ for gene in objDic:
+ print gene, objDic[gene].pairer(19,19,23,19,23)
+
+if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
diff -r 000000000000 -r de6a6afc5a79 static/.DS_Store
Binary file static/.DS_Store has changed
diff -r 000000000000 -r de6a6afc5a79 static/._.DS_Store
Binary file static/._.DS_Store has changed
diff -r 000000000000 -r de6a6afc5a79 static/images/readmap.png
Binary file static/images/readmap.png has changed
diff -r 000000000000 -r de6a6afc5a79 static/images/size_histogram.png
Binary file static/images/size_histogram.png has changed
diff -r 000000000000 -r de6a6afc5a79 test-data/sRbowtie.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sRbowtie.fa Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,40 @@
+>1
+GTATTGTAAGTGGCAGAGTGGC
+>2
+TGGAATGTAAAGAAGTATGGAG
+>3
+GTGGGGAGTTTGGATGGGGCGGCA
+>4
+AATGGCACTGGAAGAATTCACGG
+>5
+GTACGGACAAGGGGAATC
+>6
+TTGGGTTCTGGGGGGAGTATGG
+>7
+GTGGGGAGTTTCGCTGGGGCGGCA
+>8
+TAAGGGTCGGGTAGTGAGGGC
+>9
+AGCTGGGACTGAGGACTG
+>10
+AGCTGGGACTGAGGACTGC
+>11
+AAGGGTCGGGTAGTGAGG
+>12
+GTCGGGTAGTGAGGGCCTT
+>13
+TGGTGGGGCTTGGAACAATTGGAGGGC
+>14
+TGACGGAAGGGCACCACC
+>15
+TGGAATGTAAAGAAGTATGGAG
+>16
+TTGGGTTCTGGGGGGAGT
+>17
+TCAGGTGGGGAGTTTGGCTGGGGCGGCACA
+>18
+TTGGGTATAGGGGCGAAAGA
+>19
+AGCGGGCGTGCTTGTGGAC
+>20
+GTCGAATTTGGGTATAGGGGC
diff -r 000000000000 -r de6a6afc5a79 test-data/sRbowtie.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sRbowtie.out Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,5 @@
+2 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG
+4 - 2L 11953463 CCGTGAATTCTTCCAGTGCCATT
+15 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG
+14 - Uextra 7115665 GGTGGTGCCCTTCCGTCA
+18 + Uextra 7726410 TTGGGTATAGGGGCGAAAGA
diff -r 000000000000 -r de6a6afc5a79 test-data/yac.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac.fastq Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,40 @@
+@SRR290479.1 HWI-EAS285:2:1:66:28/1
+TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
++
+B@BBCBCCBCBCCC8A<@##################
+@SRR290479.2 HWI-EAS285:2:1:67:348/1
+AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG
++
+BAA@7?A@@A@@B<'25?6>59:;7#########
+@SRR290479.3 HWI-EAS285:2:1:68:826/1
+TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG
++
+BB@BBCCBCCBBB:%%83/>B7@44#;;324'117?
+@SRR290479.4 HWI-EAS285:2:1:68:65/1
+ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT
++
+BBB@@ABAAB?9B42&9;##################
+@SRR290479.5 HWI-EAS285:2:1:69:594/1
+AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG
++
+AB?5;3>/=?>=;416481#################
+@SRR290479.6 HWI-EAS285:2:1:70:700/1
+TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT
++
+BCB=:ACCBB=>BB8<-###################
+@SRR290479.7 HWI-EAS285:2:1:70:1679/1
+TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC
++
+BCBBCCBCCCBCCA?AB>:B@><>############
+@SRR290479.8 HWI-EAS285:2:1:71:1400/1
+AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT
++
+7@BC>>@55CCBCA3CBA14B.A16#*;9359B###
+@SRR290479.9 HWI-EAS285:2:1:71:795/1
+TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG
++
+BBBBBCBBCB;>AA',9=18?1:7:#<;57######
+@SRR290479.10 HWI-EAS285:2:1:71:596/1
+TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
++
+B@B>7>9A@<46B@79972#################
diff -r 000000000000 -r de6a6afc5a79 test-data/yac.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac.out Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,12 @@
+>1
+TGTAAACATCCCCGACTGGCAGC
+>2
+AAAGTGCTACTACTTTTGAGTCT
+>3
+ACTGGACTTGGAGTCCGAAGGC
+>4
+AAGTGCCGCCAGGTTTTGAGTGG
+>5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
+>6
+TAGCTTATCAGACTGATGTTGAC
diff -r 000000000000 -r de6a6afc5a79 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,67 @@
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ value, dbkey, formats, name, path
+
+
+
+
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ name, value, dbkey, species
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ value, dbkey, name, path
+
+
+
+
+ value, dbkey, name, path
+
+
+
diff -r 000000000000 -r de6a6afc5a79 yac.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/yac.py Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# yac = yet another clipper
+# v 1.0.0
+# Usage yac.py $input $output $adapter_to_clip $min $max $Nmode
+# Christophe Antoniewski
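+# Example invocation (file names hypothetical; the adapter shown is a commonly used Illumina adapter):
+# yac.py input.fastq clipped.fa ATCTCGTATGCCGTCTTCTGCTTG 18 36 reject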
+
+import sys
+
+class Clip:
+ def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
+ self.inputfile = inputfile
+ self.outputfile = outputfile
+ self.adapter = adapter
+ self.minsize = int(minsize)
+ self.maxsize = int(maxsize)
+  def motives (sequence):
+   '''return a list of motifs for a perfect (6 nt) or imperfect (7 nt with one mismatch) adapter search'''
+   sequencevariants = [sequence[0:6]] # initializes the list with the 6-mer perfect match
+   dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"} # the adapter is assumed to contain only A, T, G or C
+   for pos, base in enumerate(sequence[:6]):
+    for subst in dicsubst[base]:
+     sequencevariants.append(sequence[:pos] + subst + sequence[pos+1:7])
+   return sequencevariants
+  self.adaptmotifs= motives(self.adapter)
+
+ def scanadapt(self, adaptmotives=[], sequence=""):
+ '''scans sequence for adapter motives'''
+ if sequence.rfind(adaptmotives[0]) != -1:
+ return sequence[:sequence.rfind(adaptmotives[0])]
+ for motif in adaptmotives[1:]:
+ if sequence.rfind(motif) != -1:
+ return sequence[:sequence.rfind(motif)]
+ return sequence
+
+ def clip_with_N (self):
+ '''clips adapter sequences from inputfile.
+ Reads containing N are retained.'''
+ iterator = 0
+ id = 0
+ F = open (self.inputfile, "r")
+ O = open (self.outputfile, "w")
+ for line in F:
+ iterator += 1
+ if iterator % 4 == 2:
+ trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
+ if self.minsize <= len(trim) <= self.maxsize:
+ id += 1
+ print >> O, ">%i\n%s" % (id, trim)
+ F.close()
+ O.close()
+ def clip_without_N (self):
+ '''clips adapter sequences from inputfile.
+ Reads containing N are rejected.'''
+ iterator = 0
+ id = 0
+ F = open (self.inputfile, "r")
+ O = open (self.outputfile, "w")
+ for line in F:
+ iterator += 1
+ if iterator % 4 == 2:
+ trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
+ if "N" in trim: continue
+ if self.minsize <= len(trim) <= self.maxsize:
+ id += 1
+ print >> O, ">%i\n%s" % (id, trim)
+ F.close()
+ O.close()
+
+def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
+ instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize)
+ if Nmode == "accept":
+ instanceClip.clip_with_N()
+ else:
+ instanceClip.clip_without_N()
+
+if __name__ == "__main__" :
+ input = sys.argv[1]
+ output = sys.argv[2]
+ adapter = sys.argv[3]
+ minsize = sys.argv[4]
+ maxsize = sys.argv[5]
+ Nmode = sys.argv[6]
+ __main__(input, output, adapter, minsize, maxsize, Nmode)
diff -r 000000000000 -r de6a6afc5a79 yac.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/yac.xml Tue Jun 24 12:16:43 2014 -0400
@@ -0,0 +1,58 @@
+
+
+ yac.py $input $output $clip_source.clip_sequence $min $max $Nmode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool clips adapter sequences from a FASTQ file and returns a FASTA file of clipped reads with renumbered FASTA headers.
+
+Clipped sequences with Ns can be discarded.
+
+The min size and max size parameters filter clipped reads by length.
+
+Note that unclipped reads that satisfy the min and max size conditions are kept.
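+
+For example, with the adapter ATCTCGTATGCCGTCTTCTGCTTG (presumably the one used for the test data), the read::
+
+ TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
+
+is clipped, one mismatch (here the N) being tolerated in the adapter match, to::
+
+ TGTAAACATCCCCGACTGGCAGC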
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+