Mercurial > repos > drosofff > mississipi_gcc2
changeset 0:de6a6afc5a79 draft default tip
Uploaded
author | drosofff |
---|---|
date | Tue, 24 Jun 2014 12:16:43 -0400 |
parents | |
children | |
files | FeaturesParser.py FeaturesParser.xml MDS_wrapper.py MirDeepStar.xml MirParser.py MirParser.xml annotation_collector.py annotation_collector.xml bowtie_indices.loc.sample piRNAsignature.py piRNAsignature.xml readmap.py readmap.xml sRbowtie.py sRbowtie.xml sRbowtieCascade.py sRbowtieCascade.xml sRbowtieParser.py sRbowtieParser.xml size_histogram.py size_histogram.xml smRtools.py static/.DS_Store static/._.DS_Store static/images/readmap.png static/images/size_histogram.png test-data/sRbowtie.fa test-data/sRbowtie.out test-data/yac.fastq test-data/yac.out tool_data_table_conf.xml.sample yac.py yac.xml |
diffstat | 33 files changed, 3123 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FeaturesParser.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,64 @@ +#!/usr/bin/python +# python parser module to analyse Features in sRbowtie alignments (guided by a GFF3 file) +# version 0.9 +# Usage FeaturesParser.py <1:index source> <2:extraction directive> <3:output> <4:GFF3 guide file> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib> + +import sys +from smRtools import * +from collections import * + +IndexSource = sys.argv[1] +ExtractionDirective = sys.argv[2] +if ExtractionDirective == "--do_not_extract_index": + genomeRefFormat = "fastaSource" +elif ExtractionDirective == "--extract_index": + genomeRefFormat = "bowtieIndex" +Output = sys.argv[3] +GFF3_file = sys.argv[4] +Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)] +MasterListOfGenomes = {} +FeatureDict = defaultdict(dict) + +for [filePath, FileExt, FileLabel] in Triplets: + MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat) + FeatureDict[FileLabel] = MasterListOfGenomes[FileLabel].CountFeatures(GFF3=GFF3_file) + +# add some code to pick up the GFF3 features in their order of appearence. +F = open(GFF3_file, "r") +featureList = [] +for line in F: + if line[0] == "#": continue + feature = line.split()[2] + if feature not in featureList: + featureList.append(feature) +F.close() + +header = ["#Feature"] +for [filePath, FileExt, FileLabel] in Triplets: + header.append(FileLabel) + +F = open (sys.argv[3], "w") +print >> F, "\t".join(header) +for feature in featureList: + line=[feature] + for sample in header[1:]: + count = str (FeatureDict[sample][feature]) +# uncomment to get percentage in addition to counts +# percent = float(FeatureDict[sample][feature]) / MasterListOfGenomes[sample].alignedReads +# value = "%s | %0.2f" % (count, percent) +# line.append(value) + line.append(count) + print >> F, "\t".join(line ) +line = ["Unfeatured"] +for sample in header[1:]: + matched = 0 + for feature in FeatureDict[sample]: + matched += FeatureDict[sample][feature] + unmatched = MasterListOfGenomes[sample].alignedReads - matched +# uncomment to get percentage in addition to counts +# percent = float (unmatched) / (matched + unmatched) +# value = "%s | %0.2f" % (unmatched, percent) +# line.append(value) + line.append("%s" % unmatched) +print >> F, "\t".join(line) +F.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/FeaturesParser.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,61 @@ +<tool id="FeaturesParser" name="Parse GFF3 features" version="0.9.1"> + <description>in sRbowtie alignment</description> + <requirements><requirement type='package'>bowtie-inspect</requirement></requirements> + <parallelism method="basic"></parallelism> +<command interpreter="python"> + FeaturesParser.py + #if $refGenomeSource.genomeSource == "history": + $refGenomeSource.ownFile ## index source ## 1 + --do_not_extract_index ## 2 + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + $reference ## index source ## 1 + --extract_index ## 2 + #end if + $output ## 3 + $gff3 ## 4 + #for $i in $refGenomeSource.input_list + $i $i.ext "$i.name" + #end for + +</command> + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select the fasta reference" /> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"/> + </when> + </conditional> <!-- refGenomeSource --> + <param name="gff3" type="data" format="gff3" label="A GFF3 with features' coordinates"/> + </inputs> + <outputs> + <data format="tabular" name="output" label="Feature Counts Lists"/> + </outputs> + <help> + +**What it does** + +Parses Features Counts from one or several sRBowtie alignments (in tabular, Sam or Bam format). + +Both sense and antisense alignments are counted + +The library labels are infered from the input dataset names in the galaxy history. + +**It is thus essential that input datasets are appropriately renamed** + +**it is preferable that you do not put any space in this input dataset names. You may edit these names in the history** + + + + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MDS_wrapper.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# mirdeep* python wrapper +# refactoring a python version of the MDS_wrapper.pl +# Usage MDS_wrapper.py <fasta input> <MDS genome> <gff3_output> <data.result> <data.cluster> + +import sys, re, os, subprocess, shlex, tempfile, shutil + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +MDS_path = "/home/galaxy/bin/MDS_command_line_v32/MDS_command_line" + +input_full_path = sys.argv[1] +MDS_genome = sys.argv[2] +gff3_output = sys.argv[3] +dataresult = sys.argv[4] +datacluster = sys.argv[5] + +tmp_MDS_work_dir = tempfile.mkdtemp(dir = MDS_path) # make temp directory for MDS analysis +os.symlink(input_full_path, tmp_MDS_work_dir+"/data.fa" ) # symlink between the fasta source file and the required "data.fa" input +os.symlink(MDS_path+"/MDS_command_line.jar", tmp_MDS_work_dir+"/MDS_command_line.jar" ) # symlink to jar source in working directory +os.symlink(MDS_path+"/genome", tmp_MDS_work_dir+"/genome") +os.symlink(MDS_path+"/targetScan", tmp_MDS_work_dir+"/targetScan") +os.symlink(MDS_path+"/targetScan_files", tmp_MDS_work_dir+"/targetScan_files") + +# execute MirDeep* +command_line = "java -Xmx4g -jar " + MDS_path + "/MDS_command_line.jar -r 5 -g " + MDS_genome + " data.fa" # -Xmx12g +print command_line +#tmp = tempfile.NamedTemporaryFile( dir=tmp_MDS_work_dir ).name +#tmp_stderr = open( tmp, 'wb' ) + +try: + os.chdir(tmp_MDS_work_dir) + p = subprocess.Popen(args=command_line, cwd=tmp_MDS_work_dir, shell=True, stderr=sys.stderr) + returncode = p.wait() + shutil.copy2 ("data.result", dataresult) + shutil.copy2 ("data.cluster", datacluster) + dataFILE = open("data.result", "r") + datafile = dataFILE.readlines() + dataFILE.close() + GFF3OUT = open(gff3_output, "w") + print >> GFF3OUT,"##gff-version 3" + print >> GFF3OUT, "##Seqid Source Type Start End Score Strand Phase Attributes" + print >> GFF3OUT, "##" + for line in datafile[1:]: + fields = line.split("\t") + Seqid, Source, Type, Start, End, Score, Strand, Phase = fields[2], "MirDeep*", "hairPin_loci", fields[4].split("-")[0], fields[4].split("-")[1], fields[1], fields[3], "." + ID = "ID=%s;%s_reads;%s;%s;mature_seq:%s" % (fields[0],fields[5],fields[7],fields[8],fields[9]) + print >> GFF3OUT, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (Seqid, Source, Type, Start, End, Score, Strand, Phase, ID) + GFF3OUT.close() + if os.path.exists( tmp_MDS_work_dir ): + shutil.rmtree( tmp_MDS_work_dir ) + else: + print "Error in cleaning tmp working directory" + +except Exception, e: + # clean up temp dir + if os.path.exists( tmp_MDS_work_dir ): + shutil.rmtree( tmp_MDS_work_dir ) + stop_err( 'Error running MDS_command_line.jar\n' + str( e ) ) + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MirDeepStar.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,32 @@ +<tool id="MirDeepStar" name="MirDeepStar"> +<description></description> +<command interpreter="python">MDS_wrapper.py $input $genome $output $output2 $output3</command> + +<inputs> + <param name="input" type="data" format="fasta" label="input fasta file"/> + <param name="genome" type="select" label="MirDeep* reference genome"> + <option value="AgamP4">SoftMasked Anopheles Gambiae P4</option> + <option value="AgamF">SoftMasked Anopheles Gambiae P3</option> + <option value="hg19">Human (hg19)</option> + <option value="mm9">Mouse (mm9)</option> + <option value="Dmel">Dmel (5.49)</option> + </param> +</inputs> + +<outputs> + <data name="output3" format="tabular" label="mirdeep* clusters"/> + <data name="output2" format="tabular" label="mirdeep* raw output"/> + <data name="output" format="gff3" label="mirdeep* GFF output"/> +</outputs> + +<help> + +**What it does** + +MirDeep* wrapper + + +</help> + +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MirParser.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,81 @@ +#!/usr/bin/python +# python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3 +# version 0.0.9 (1-6-2014) +# Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> +# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> +# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> + +import sys, subprocess +from smRtools import * + +IndexSource = sys.argv[1] +ExtractionDirective = sys.argv[2] +if ExtractionDirective == "--do_not_extract_index": + genomeRefFormat = "fastaSource" +elif ExtractionDirective == "--extract_index": + genomeRefFormat = "bowtieIndex" +OutputPre_mirs = sys.argv[3] +OutputMature_Mirs = sys.argv[4] +GFF3_file = sys.argv[5] +lattice = sys.argv[6] +Rcode = sys.argv[7] +latticePDF = sys.argv[8] +Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)] +MasterListOfGenomes = {} + +for [filePath, FileExt, FileLabel] in Triplets: + print FileLabel + MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel) + +header = ["gene"] +for [filePath, FileExt, FileLabel] in Triplets: + header.append(FileLabel) + +hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation + +## read GFF3 to subinstantiate +gff3 = open (GFF3_file, "r") +lattice_dataframe = [] +for line in gff3: + if line[0] == "#": continue + gff_fields = line[:-1].split("\t") + chrom = gff_fields[0] + gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name + item_upstream_coordinate = int(gff_fields[3]) + item_downstream_coordinate = int(gff_fields[4]) + if gff_fields[6] == "+": + item_polarity = "forward" + else: + item_polarity = "reverse" + item_line = [gff_name] + for sample in header[1:]: + count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity) + item_line.append(str(count)) + ## subtreatement for lattice + if lattice != "dummy_dataframe_path": + if ("5p" not in gff_name) and ("3p" not in gff_name): + lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) ) + ## end of subtreatement for lattice + hit_table.append("\t".join(item_line) ) +gff3.close() + +Fpremirs = open (OutputPre_mirs, "w") +print >> Fpremirs, hit_table[0] +finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)] +print >> Fpremirs, "\n".join(finalPreList ) +Fpremirs.close() + +Fmaturemires = open (OutputMature_Mirs, "w") +print >> Fmaturemires, hit_table[0] +finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)] +print >> Fmaturemires, "\n".join(finalMatureList ) +Fmaturemires.close() + +if lattice != "dummy_dataframe_path": + Flattice = open(lattice, "w") + print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity") + print >> Flattice, "\n".join(lattice_dataframe) + Flattice.close() + R_command="Rscript "+ Rcode + process = subprocess.Popen(R_command.split()) + process.wait()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MirParser.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,148 @@ +<tool id="MirParser" name="Parse miRNAs" version="0.9.1"> + <description>from sRbowtie aligment</description> + <requirements><requirement type='package'>bowtie-inspect</requirement></requirements> + <parallelism method="basic"></parallelism> +<command interpreter="python"> + MirParser.py + #if $refGenomeSource.genomeSource == "history": + $refGenomeSource.ownFile ## index source sys.arg[1] + --do_not_extract_index ## sys.argv[2] + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + $reference ## sys.argv[1] + --extract_index ## sys.argv[2] + #end if + $output1 ## for pre-mirs ## sys.argv[3] + $output2 ## for mature mirs ## sys.argv[4] + $GFF3 ## sys.argv[5] + #if $plotting.plottingOption == "yes": + $lattice_dataframe ## sys.argv[6] + $plotCode ## sys.argv[7] + $latticePDF ## sys.argv[8] + #else: + "dummy_dataframe_path" ## sys.argv[6] + "dummy_plotCode" ## sys.argv[7] + "dummy_latticePDF" ## sys.argv[8] + #end if + #for $i in $refGenomeSource.input_list + $i $i.ext "$i.name" ## sys.argv[9,10,11] modulo 3 + #end for + #silent plottingoption = $plotting.plottingOption +</command> + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + </when> + <when value="history"> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"/> + <param name="ownFile" type="data" format="fasta" label="Select the fasta reference" /> + </when> + </conditional> <!-- refGenomeSource --> + <param name="GFF3" type="data" label="miRbase GFF3 guide" /> + <conditional name="plotting"> + <param name="plottingOption" type="select" label="Additional mir coverage graphs"> + <option value="no" selected="True">No</option> + <option value="yes">YES</option> + </param> + <when value="yes"> + <param name="display" type="select" label="Display Coverage with absolute number of reads or relatively to the total number of read matching the gene or mir"> + <option value="relative" selected="True">Relative Coverage</option> + <option value="absolute">Absolute Coverage</option> + </param> + </when> + </conditional> + </inputs> + <configfiles> + <configfile name="plotCode"> + #if $plotting.plottingOption == "yes": + graph_type = "${plotting.display}" ## "relative" or "absolute" + ## Setup R error handling to go to stderr + options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + library(lattice) + coverage = read.delim("${lattice_dataframe}", header=T) + Numb_of_biosamples = length(levels(coverage\$sample)) + if (graph_type=="relative") { + graph = xyplot(countsNorm~offsetNorm | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1, + scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps") + } else { + graph = xyplot(counts~offset | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1, + scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps") + } + ## pdf output + pdf(file="${latticePDF}", paper="special", height=11.69, width=8.2677) + plot(graph, newpage = T) + dev.off() + #end if + </configfile> + </configfiles> + + <outputs> + <data format="tabular" name="output1" label="Premirs Count Lists"/> + <data format="tabular" name="output2" label="Mature Mirs Count Lists"/> + <data format="tabular" name="lattice_dataframe" label="Lattice Dataframe"> + <filter>plotting['plottingOption'] == "yes"</filter> + </data> + <data format="pdf" name="latticePDF" label="Mir coverage"> + <filter>plotting['plottingOption'] == "yes"</filter> + </data> + </outputs> + <help> + +**What it does** + +This tool uses a specie-specific GFF3 file from mirBase_ to guide the parsing of an alignment file produced with the sRbowtie tool. + +.. _mirBase: ftp://mirbase.org/pub/mirbase/CURRENT/genomes/ + +------ + +.. class:: warningmark + +the Guide GFF3 file must be in the following format: + +2L . miRNA_primary_transcript 243035 243141 . - . ID=MI0005821;Alias=MI0005821;Name=dme-mir-965 + +2L . miRNA 243055 243076 . - . ID=MIMAT0005480;Alias=MIMAT0005480;Name=dme-miR-965-3p;Derives_from=MI0005821 + +2L . miRNA 243096 243118 . - . ID=MIMAT0020861;Alias=MIMAT0020861;Name=dme-miR-965-5p;Derives_from=MI0005821 + +2L . miRNA_primary_transcript 857542 857632 . + . ID=MI0005813;Alias=MI0005813;Name=dme-mir-375 + +2L . miRNA 857596 857617 . + . ID=MIMAT0005472;Alias=MIMAT0005472;Name=dme-miR-375-3p;Derives_from=MI0005813 + +2L . miRNA 857556 857579 . + . ID=MIMAT0020853;Alias=MIMAT0020853;Name=dme-miR-375-5p;Derives_from=MI0005813 + +2L . miRNA_primary_transcript 1831685 1831799 . - . ID=MI0011290;Alias=MI0011290;Name=dme-mir-2280 + +With name for mature miRNA (3rd column = miRNA) containing either the -3p or -5p string in the attribute Name (Name=dme-miR-965-3p, for instance) + +------ + +**Input formats** + +1. One or sereral alignment files generated with sRbowtie tool and **renamed** according to the name of the biosample (avoid spaces in biosample labels) + +.. class:: warningmark + +Alignment datasets generated with sRbowtie must be renamed according to a biosample name + +2. A GFF3 file retrieved from mirBase_ + +------ + +**Outputs** + +Two count list files for counts of reads aligned to pre-mir or mature miRNA + +A pdf of pre-mir coverages. Red coverages indicate that the mir gene is in the genomic up strand, blue coverages indicate that the mir gene is in the genomic down strand. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/annotation_collector.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,25 @@ +#!/usr/bin/env python +#By, drosofff@gmail.com +# command: annotation_collector.py $output input1, label1, input2, label2, etc... + +import sys, os + +def countlineinfile(file): + F = open (file, "r") + count = 0 + for line in F: + count += 1 + F.close() + return count/2 +results = [] + +for file, label in zip (sys.argv[2:-1:2], sys.argv[3:-1:2]): + results.append ( (countlineinfile(file), label) ) + +Fout = open (sys.argv[1], "w") + +print >> Fout, "# %s" % (sys.argv[-1]) +for filecount, label in results: + print >> Fout, "%s\t%s\t%.2f %%" % (label, filecount, filecount/float(results[0][0])*100 ) +print >> Fout +Fout.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/annotation_collector.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,37 @@ +<tool id="annotation_collector" name="Collect Annotation counts" version="0.9.0"> + <description> from matched fasta files</description> + <command interpreter="python"> + annotation_collector.py + $out_file + $input1 $label1 + #for $q in $queries + ${q.input2} ${q.label2} + #end for + $title + </command> + <inputs> + <param name="title" type="text" label="Annotation Title"/> + <param name="input1" type="data" label="fasta file to annotate"/> + <param name="label1" type="text" label="category label"/> + <repeat name="queries" title="Additional fasta file and label"> + <param name="input2" type="data" label="Select" /> + <param name="label2" type="text" label="Label" /> + </repeat> + </inputs> + <outputs> + <data name="out_file" format="tabular" metadata_source="input1" label="Annotations"/> + </outputs> + <help> + +.. class:: warningmark + +**WARNING:** Be careful not to agregate datasets of different kinds (e.g., datasets must be matched fasta files). This tool does not check if the datasets being annotated are in the same format. + +----- + +**What it does** + +Generates Annotation Table from fasta files + + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bowtie_indices.loc.sample Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/piRNAsignature.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,78 @@ +#!/usr/bin/python +# script for computing overlap signatures from a bowtie output +# Christophe Antoniewski <drosofff@gmail.com> +# Usage piRNAsignature.py <1:input> <2:format of input> <3:minsize query> <4:maxsize query> <5:minsize target> <6:maxsize target> +# <7:minscope> <8:maxscope> <9:output> <10:bowtie index> <11:procedure option> <12: graph (global or lattice)> +# <13: R code> + +import sys, subprocess +from smRtools import * +from collections import defaultdict # test whether it is required + +if sys.argv[11] == "--extract_index": + if sys.argv[2] == "tabular": + Genome = HandleSmRNAwindows (sys.argv[1],"tabular",sys.argv[10],"bowtieIndex") + elif sys.argv[2] == "sam": + Genome = HandleSmRNAwindows (sys.argv[1],"sam",sys.argv[10],"bowtieIndex") + else: + Genome = HandleSmRNAwindows (sys.argv[1],"bam",sys.argv[10],"bowtieIndex") +else: + if sys.argv[2] == "tabular": + Genome = HandleSmRNAwindows (sys.argv[1],"tabular",sys.argv[10],"fastaSource") + elif sys.argv[2] == "sam": + Genome = HandleSmRNAwindows (sys.argv[1],"sam",sys.argv[10],"fastaSource") + else: + Genome = HandleSmRNAwindows (sys.argv[1],"bam",sys.argv[10],"fastaSource") +# this decisional tree may be simplified if sam and bam inputs are treated the same way by pysam + +# replace objDic by Genome.instanceDict or... objDic = Genome.instanceDict +objDic = Genome.instanceDict + +minquery = int(sys.argv[3]) +maxquery = int(sys.argv[4]) +mintarget = int(sys.argv[5]) +maxtarget = int(sys.argv[6]) +minscope = int(sys.argv[7]) +maxscope = int(sys.argv[8]) + 1 +general_frequency_table = dict ([(i,0) for i in range(minscope,maxscope)]) +general_percent_table = dict ([(i,0) for i in range(minscope,maxscope)]) +OUT = open (sys.argv[9], "w") + +if sys.argv[12] == "global": + ###### for normalized summing of local_percent_table(s) + readcount_dic = {} + Total_read_in_objDic = 0 + for item in objDic: + readcount_dic[item] = objDic[item].readcount(minquery, maxquery) + Total_read_in_objDic += readcount_dic[item] + ###### + for x in (objDic): + local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) ) + local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) ) + try: + for overlap in local_frequency_table.keys(): + general_frequency_table[overlap] = general_frequency_table.get(overlap, 0) + local_frequency_table[overlap] + except: + pass + try: + for overlap in local_percent_table.keys(): + general_percent_table[overlap] = general_percent_table.get(overlap, 0) + (1./Total_read_in_objDic*readcount_dic[x]*local_percent_table[overlap]) + except: + pass + print >> OUT, "overlap\tnum of pairs\tprobability" + for classe in sorted(general_frequency_table): + print >> OUT, "%i\t%i\t%f" % (classe, general_frequency_table[classe], general_percent_table[classe]) + +else: + print >> OUT, "overlap\tnum of pairs\tprobability\titem" + for x in (objDic): + local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) ) + local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) ) + for classe in range(minscope,maxscope): + print >> OUT, "%i\t%i\t%f\t%s" % (classe, local_frequency_table[classe], local_percent_table[classe], x) + +OUT.close() + +## Run the R script that is defined in the xml using the Rscript binary provided with R. +R_command="Rscript "+ sys.argv[13] +process = subprocess.Popen(R_command.split())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/piRNAsignature.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,122 @@ +<tool id="piRNAsignature" name="piRNA Signatures" version="1.0.1"> + <description></description> + <command interpreter="python"> + piRNAsignature.py $refGenomeSource.input $refGenomeSource.input.ext $minquery $maxquery $mintarget $maxtarget $minscope $maxscope $output + #if $refGenomeSource.genomeSource == "history": + $refGenomeSource.ownFile + --do_not_extract_index + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + $reference + --extract_index + #end if + $graph_type + $sigplotter + </command> + + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="input" type="data" format="tabular,sam,bam" label="Compute signature from this bowtie standard output"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select the fasta reference" /> + <param name="input" type="data" format="tabular,sam,bam" label="Compute signature from this bowtie standard output"/> + </when> + </conditional> <!-- refGenomeSource --> + <param name="minquery" type="integer" size="3" value="23" label="Min size of query small RNAs" help="'23' = 23 nucleotides"/> + <param name="maxquery" type="integer" size="3" value="29" label="Max size of query small RNAs" help="'29' = 29 nucleotides"/> + <param name="mintarget" type="integer" size="3" value="23" label="Min size of target small RNAs" help="'23' = 23 nucleotides"/> + <param name="maxtarget" type="integer" size="3" value="29" label="Max size of target small RNAs" help="'29' = 29 nucleotides"/> + <param name="minscope" type="integer" size="3" value="1" label="Minimal relative overlap analyzed" help="'1' = 1 nucleotide overlap"/> + <param name="maxscope" type="integer" size="3" value="26" label="Maximal relative overlap analyzed" help="'1' = 1 nucleotide overlap"/> + <param name="graph_type" type="select" label="Graph type" help="Signature can be computed globally or by item present in the alignment file"> + <option value="global" selected="True">Global</option> + <option value="lattice">Lattice</option> + </param> + </inputs> + + <configfiles> + <configfile name="sigplotter"> + graph_type = "${graph_type}" + + globalgraph = function () { + ## Setup R error handling to go to stderr + options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + signature = read.delim("${output}", header=TRUE) + ## Open output1 PDF file + pdf( "${output1}" ) + signaturez=(signature[,2] -mean(signature[,2]))/sd(signature[,2]) + plot(signaturez, type = "b", main="signature", cex.main=2, xlab="overlap (nt)", ylab="z-score", pch=19, cex=0.8, col="darkslateblue", lwd=3, cex.lab=1.5, cex.axis=1.4, xaxt="n") + axis(1, at=seq(from=1, to=length(signature[,1]), by=3) ) + devname = dev.off() + ## Close the PDF file + ## Open output2 PDF file + pdf( "${output2}" ) + YLIM=max(signature[,2]) + plot(signature[,1:2], type = "h", xlab="overlap (nt)", ylim=c(0,YLIM), ylab="number of pairs", col="darkslateblue", lwd=7) + devname = dev.off() + ## Close the PDF file + ## Open output3 PDF file + pdf( "${output3}" ) + plot(signature[,1], signature[,3]*100, type = "l", main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs", + cex.main=1, xlab="overlap (nt)", ylab="ping-pong signal [%]", ylim=c(0,50), + pch=19, col="darkslateblue", lwd =4, cex.lab=1.2, cex.axis=1, xaxt="n") + axis(1, at=seq(from=1, to=length(signature[,1]), by=3) ) + devname = dev.off() + ## Close the PDF file + } + + treillisgraph = function () { + ## Open output3 PDF file + pdf( "${output3}", paper="special", height=11.69, width=8.2677 ) + signature = read.delim("${output}", header=TRUE) + options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + library(lattice) + print (xyplot(signature[,3]*100~signature[,1]|signature[,4], type = "l", xlim=c(1,26), main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs", + par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), scales=list(cex=0.5), + cex.main=1, cex=.5, xlab="overlap (nt)", ylab="ping-pong signal [%]", + pch=19, col="darkslateblue", lwd =1.5, cex.lab=1.2, cex.axis=1.2, + layout=c(4,12), as.table=TRUE, newpage = T) ) + devnname = dev.off() + } + + if (graph_type=="global") { + globalgraph() + + } + if(graph_type=="lattice") { + treillisgraph() + } + </configfile> + </configfiles> + + <outputs> + <data name="output" format="tabular" label = "signature data frame"/> + <data name="output2" format="pdf" label="number of pairs signature"> + <filter>(graph_type == "global")</filter> + </data> + <data name="output1" format="pdf" label="z-score signature"> + <filter>(graph_type == "global")</filter> + </data> + <data name="output3" format="pdf" label="ping-pong signal (hannon algo.)"/> + </outputs> + + <help> + +**What it does** + +This tool computes the number of pairs by overlap classes (in nt) from a bowtie output file, the z-score calculated from these numbers of pairs, and the ping-pong signal as described in Brennecke et al (2009) Science. +The numerical options set the min and max size of both the query small rna class and the target small rna class +Three type of signals are plotted in separate pdf files, the number of pairs founds, the z-score calculated from these numbers of pairs, and the ping-pong signal as described in Brennecke et al (2009) Science. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readmap.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,116 @@ +#!/usr/bin/python +# python parser module for for readmaps and size distributions, guided by GFF3 +# version 0.9.1 (1-6-2014) +# Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> +# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> +# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> + +import sys, subprocess, argparse +from smRtools import * +from collections import OrderedDict, defaultdict +import os + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe") + the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") + the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") + the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") + the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") + the_parser.add_argument('--ext',nargs='+', help="input file type") + the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") + the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") + the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") + the_parser.add_argument('--minquery', type=int, help="Minimum readsize") + the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") + the_parser.add_argument('--rcode', type=str, help="R script") + args = the_parser.parse_args() + return args + +args=Parser() +if args.reference_fasta: + genomeRefFormat = "fastaSource" + genomeRefFile = args.reference_fasta +if args.reference_bowtie_index: + genomeRefFormat = "bowtieIndex" + genomeRefFile = args.reference_bowtie_index +readmap_file=args.output_readmap +size_distribution_file=args.output_size_distribution +minquery=args.minquery +maxquery=args.maxquery +Rcode = args.rcode +filePath=args.input +fileExt=args.ext +fileLabel=args.label +normalization_factor=args.normalization_factor + +MasterListOfGenomes = OrderedDict() + +def process_samples(filePath): + for i, filePath in enumerate(filePath): + norm=normalization_factor[i] + print fileLabel[i] + MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ + biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) + return MasterListOfGenomes + +def write_readplot_dataframe(readDict, readmap_file): + with open(readmap_file, 'w') as readmap: + print >>readmap, "gene\tcoord\tcount\tpolarity\tsample" + for sample in readDict.keys(): + if args.gff: + dict=readDict[sample] + else: + dict=readDict[sample].instanceDict + for gene in dict.keys(): + plottable = dict[gene].readplot() + for line in plottable: + print >>readmap, "%s\t%s" % (line, sample) + +def write_size_distribution_dataframe(readDict, size_distribution_file): + with open(size_distribution_file, 'w') as size_distrib: + print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" + for sample in readDict.keys(): + if args.gff: + dict=readDict[sample] + else: + dict=readDict[sample].instanceDict + for gene in dict.keys(): + histogram = dict[gene].size_histogram() + for polarity in histogram.keys(): + for item in histogram[polarity].iteritems(): + print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample) + +def gff_item_subinstances(readDict, gff3): + GFFinstanceDict=OrderedDict() + with open(gff3) as gff: + for line in gff: + if line[0] == "#": continue + gff_fields = line[:-1].split("\t") + chrom = gff_fields[0] + gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name + item_upstream_coordinate = int(gff_fields[3]) + item_downstream_coordinate = int(gff_fields[4]) + item_polarity = gff_fields[6] + for sample in readDict.keys(): + if not GFFinstanceDict.has_key(sample): + GFFinstanceDict[sample]={} + subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) + if item_polarity == '-': + subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} + subinstance.gene=gff_name + GFFinstanceDict[sample][gff_name]=subinstance + return GFFinstanceDict + +MasterListOfGenomes=process_samples(filePath) + +if args.gff: + MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) + +write_readplot_dataframe(MasterListOfGenomes, readmap_file) +write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) + +R_command="Rscript "+ Rcode +process = subprocess.Popen(R_command.split()) +process.wait() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readmap.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,225 @@ +<tool id="Readmap" name="Generate readmap and histograms from alignment files" version="0.9.2"> + <description>from sRbowtie aligment</description> + <requirements><requirement type='package'>bowtie-inspect</requirement></requirements> + <parallelism method="basic"></parallelism> +<command interpreter="python"> + readmap.py + #if $refGenomeSource.genomeSource == "history": + --reference_fasta ## sys.argv[2] + $refGenomeSource.ownFile ## index source + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + --reference_bowtie_index + $reference + #end if + --rcode + $plotCode + --output_readmap + $readmap_dataframe + --output_size_distribution + $size_distribution_dataframe + --minquery + $minquery + --maxquery + $maxquery + --input + #for $i in $refGenomeSource.series + $i.input + #end for + --ext + #for $i in $refGenomeSource.series + $i.input.ext + #end for + --label + #for $i in $refGenomeSource.series + "$i.input.name" + #end for + --normalization_factor + #for $i in $refGenomeSource.series + $i.norm + #end for + #if $gff: + --gff + $gff + #end if + +</command> + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <repeat name="series" title="Add alignment files"> + <param name="input" type="data" label="Select multiple alignments to parse"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + <param name="norm" type="float" value="1" label="Indicate a normalization factor to compare multiple aligments"/> + </repeat> + </when> + <when value="history"> + <repeat name="series" title="Add alignment files"> + <param name="input" type="data" label="Select multiple alignments to parse"/> + <param name="norm" type="integer" value="1" label="Indicate a normalization factor to compare multiple aligments"/> + </repeat> + </when> + </conditional> + <param name="gff" type="data" optional="true" label="Optional: select a GFF to investigate regions of interest" help="GFF must match genome build"/> + <!-- <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="GFF database and alignment file databse do not match!"/> --> + <param name="minquery" type="integer" size="3" value="18" label="Min size of query small RNAs" help="'18' = 18 nucleotides"/> + <param name="maxquery" type="integer" size="3" value="28" label="Max size of query small RNAs" help="'28' = 28 nucleotides"/> + <param name="title" type="text" size="15" value= "Readmaps and size distributions" label="Main Titles"/> + <param name="xlabel" type="text" size="15" value="Coordinates/read size" label="x axis label"/> + <param name="ylabel" type="text" size="15" value="Number of reads" label="y axis label"/> + <param name="rows_per_page" type="text" size="9" value="8" label="How many items to display per page?"> + <validator type="in_range" min="6" max="20" message="Select between 6 and 20 rows, as the readability will suffer otherwise."/> + </param> + </inputs> + <configfiles> + <configfile name="plotCode"> + ## Setup R error handling to go to stderr + options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + library(RColorBrewer) + library(lattice) + library(latticeExtra) + library(grid) + library(gridExtra) + ##cheetahtemplate data frame implementation + + rm=read.delim("${readmap_dataframe}", header=T, row.names=NULL) + pdf(file="${readmap_PDF}", paper="special", height=11.69, width=8.2677) + n_samples=length(unique(rm\$sample)) + + genes=unique(levels(rm\$gene)) + per_gene_readmap=lapply(genes, function(x) subset(rm, gene==x)) + n_genes=length(per_gene_readmap) + + + par.settings.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8)) + par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8)) + par.settings.combination.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8)) + par.settings.combination.size=list(layout.heights=list(top.padding=-2, bottom.padding=-0.5), fontsize = list(text=96/${rows_per_page}, points=8)) + + + plot_readmap=function(df, ...) { + combineLimits(xyplot(count~coord|factor(sample, levels=unique(sample))+reorder(gene, count, function(x) -sum(abs(x))), + data=df, + type='h', + scales= list(relation="free", x=list(rot=0, cex=0.75, axs="i", tck=0.5), y=list(tick.number=4, rot=90, cex=0.75)), + xlab=NULL, main=NULL, ylab=NULL, + as.table=T, + origin = 0, + horizontal=FALSE, + group=polarity, + col=c("red","blue"), + ...)) + } + + plot_size_distribution= function(df, ...) { + smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);} + bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0, + horizontal=FALSE, + group=polarity, + stack=TRUE, + col=c('red', 'blue'), + cex=0.75, + scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75), + prepanel=smR.prepanel, + xlab = NULL, + ylab = NULL, +# par.settings=list(layout.heights=list(top.padding=-2, bottom.padding=-3), fontsize = list(text=8, points=8)), + main = NULL , as.table=TRUE, newpage = T, ...) + combineLimits(bc) + } + + for (i in seq(1,n_genes,${rows_per_page})) { + start=i + end=i+${rows_per_page}-1 + if (end>n_genes) {end=n_genes} + readmap_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.readmap)) + args.list=c(readmap_plot.list, list(nrow=${rows_per_page}, ncol=1, main="readmaps", left="${ylabel}", sub="readmap coordinate")) + do.call(grid.arrange, args.list) + } + + devname=dev.off() + + size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL) + per_gene_size=lapply(genes, function(x) subset(size, gene==x)) + + pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677) + + for (i in seq(1,n_genes,${rows_per_page})) { + start=i + end=i+${rows_per_page}-1 + if (end>n_genes) {end=n_genes} + plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, par.settings=par.settings.size)) + args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="size distribution", left="${ylabel}", sub="readsize in nucleotides")) + do.call(grid.arrange, args.list) + } + + devname=dev.off() + + pdf(file="${combi_PDF}", paper="special", height=11.69, width=8.2677) + + for (i in seq(1,n_genes,${rows_per_page}/2)) { + start=i + end=i+${rows_per_page}/2-1 + if (end>n_genes) {end=n_genes} + read_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.combination.readmap)) + size_plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, strip=FALSE, par.settings=par.settings.combination.size)) + plot.list=rbind(read_plot.list, size_plot.list ) + args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="${title}", left="${ylabel}", sub="${xlabel}")) + do.call(grid.arrange, args.list) + } + + devname=dev.off() + + + </configfile> + </configfiles> + + <outputs> + <data format="tabular" name="readmap_dataframe" label="Readmap dataframe"/> + <data format="tabular" name="size_distribution_dataframe" label="Size distribution dataframe"/> + <data format="pdf" name="readmap_PDF" label="Readmaps"/> + <data format="pdf" name="size_PDF" label="Size distribution"/> + <data format="pdf" name="combi_PDF" label="Size distribution and Readmaps"/> + </outputs> +<help> + +**What it does** + +Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces a "Readmap", +where by default for each "chromosome" the position of the read is recorded on the x-axis, and the y-axis indicates +the number of reads per position. Reads that map in sense are on the top, reads that map antisense are on the bottom. + + +.. class:: warningmark + +'''TIP''' The input data can be produced using the sRbowtie tool. + +---- + +'''Example''' + +Query sequence:: +For a SAM file as the following: + + 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0 + + 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 + + 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0 + +produce a plot like this: + +---- + +.. image:: static/images/readmap.png + :height: 800 + :width: 500 + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtie.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# small RNA oriented bowtie wrapper +# version 1.01 29-5-2014 +# Usage sRbowtie.py <1 input_fasta_file> <2 alignment method> <3 -v mismatches> <4 out_type> <5 buildIndexIfHistory> <6 fasta/bowtie index> <7 bowtie output> <8 ali_fasta> <9 unali_fasta> <10 --num-threads \${GALAXY_SLOTS:-4}> +# current rev: for bowtie __norc, move from --supress 2,6,7,8 to --supress 6,7,8. Future Parser must be updated to take into account this standardisation +# To Do: +# implement an arg parser +# Christophe Antoniewski <drosofff@gmail.com> + +import sys, os, subprocess, tempfile, shutil + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def bowtieCommandLiner (alignment_method, v_mis, out_type, aligned, unaligned, input, index, output, pslots="12"): + if alignment_method=="RNA": + x = "-v %s -M 1 --best --strata -p %s --norc --suppress 6,7,8" % (v_mis, pslots) + elif alignment_method=="unique": + x = "-v %s -m 1 -p %s --suppress 6,7,8" % (v_mis, pslots) + elif alignment_method=="multiple": + x = "-v %s -M 1 --best --strata -p %s --suppress 6,7,8" % (v_mis, pslots) + elif alignment_method=="k_option": + x = "-v %s -k 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots) + elif alignment_method=="n_option": + x = "-n %s -M 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots) + elif alignment_method=="a_option": + x = "-v %s -a --best -p %s --suppress 6,7,8" % (v_mis, pslots) + if aligned == "None" and unaligned == "None": fasta_command = "" + elif aligned != "None" and unaligned == "None": fasta_command= " --al %s" % aligned + elif aligned == "None" and unaligned != "None": fasta_command = " --un %s" % unaligned + else: fasta_command = " --al %s --un %s" % (aligned, unaligned) + x = x + fasta_command + if out_type == "tabular": + return "bowtie %s %s -f %s > %s" % (x, index, input, output) + elif out_type=="sam": + return "bowtie %s -S %s -f %s > %s" % (x, index, input, output) + elif out_type=="bam": + return "bowtie %s -S %s -f %s |samtools view -bS - > %s" % (x, index, input, output) + +def bowtie_squash(fasta): + tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name + os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name + cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir) + try: + FNULL = open(os.devnull, 'w') + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string + tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in dev/null + returncode = proc.wait() + tmp_stderr.close() + FNULL.close() + sys.stdout.write(cmd1 + "\n") + except Exception, e: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + stop_err( 'Error indexing reference sequence\n' + str( e ) ) + # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment() + index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention + return tmp_index_dir, index_full_path + +def bowtie_alignment(command_line, flyPreIndexed=''): + # make temp directory just for stderr + tmp_index_dir = tempfile.mkdtemp() + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name + tmp_stderr = open( tmp, 'wb' ) + # conditional statement for sorted bam generation viewable in Trackster + if "samtools" in command_line: + target_file = command_line.split()[-1] # recover the final output file name + path_to_unsortedBam = os.path.join(tmp_index_dir, "unsorted.bam") + path_to_sortedBam = os.path.join(tmp_index_dir, "unsorted.bam.sorted") + first_command_line = " ".join(command_line.split()[:-3]) + " -o " + path_to_unsortedBam + " - " + # example: bowtie -v 0 -M 1 --best --strata -p 12 --suppress 6,7,8 -S /home/galaxy/galaxy-dist/bowtie/Dmel/dmel-all-chromosome-r5.49 -f /home/galaxy/galaxy-dist/database/files/003/dataset_3460.dat |samtools view -bS -o /tmp/tmp_PgMT0/unsorted.bam - + second_command_line = "samtools sort %s %s" % (path_to_unsortedBam, path_to_sortedBam) # generates an "unsorted.bam.sorted.bam file", NOT an "unsorted.bam.sorted" file + p = subprocess.Popen(args=first_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno()) # fileno() method return the file descriptor number of tmp_stderr + returncode = p.wait() + sys.stdout.write("%s\n" % first_command_line + str(returncode)) + p = subprocess.Popen(args=second_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno()) + returncode = p.wait() + sys.stdout.write("\n%s\n" % second_command_line + str(returncode)) + if os.path.isfile(path_to_sortedBam + ".bam"): + shutil.copy2(path_to_sortedBam + ".bam", target_file) + else: + p = subprocess.Popen(args=command_line, shell=True, stderr=tmp_stderr.fileno()) + returncode = p.wait() + sys.stdout.write(command_line + "\n") + tmp_stderr.close() + ## cleaning if the index was created in the fly + if os.path.exists( flyPreIndexed ): + shutil.rmtree( flyPreIndexed ) + # cleaning tmp files and directories + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + return + +def __main__(): + F = open (sys.argv[7], "w") + if sys.argv[5] == "history": + tmp_dir, index_path = bowtie_squash(sys.argv[6]) + else: + tmp_dir, index_path = "dummy/dymmy", sys.argv[6] + command_line = bowtieCommandLiner(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[8], sys.argv[9], sys.argv[1], index_path, sys.argv[7], sys.argv[10]) + bowtie_alignment(command_line, flyPreIndexed=tmp_dir) + F.close() +if __name__=="__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtie.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,210 @@ +<tool id="bowtieForSmallRNA" name="sRbowtie" version="1.0.1"> + <description>for FASTA small reads</description> + <requirements> + <requirement type='package'>bowtie</requirement> + </requirements> + <parallelism method="basic"></parallelism> + <command interpreter="python"> sRbowtie.py $input + $method + $v_mismatches + $output_type + $refGenomeSource.genomeSource + ## the very source of the index (indexed or fasta file) + #if $refGenomeSource.genomeSource == "history": + $refGenomeSource.ownFile + #else: + $refGenomeSource.index.fields.path + #end if + $output + $aligned + $unaligned + \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Input fasta file: reads clipped from their adapter" help="Only with clipped, raw fasta files"/> +<!-- which method will be used --> + <param name="method" type="select" label="What kind of matching do you want to do?" help="bowtie parameters adjusted to the type of matching. RNA option match to only one strand"> + <option value="RNA">Match on sense strand RNA reference index, multiple mappers randomly matched at a single position</option> + <option value="unique">Match unique mappers on DNA reference index</option> + <option value="multiple" selected="true">Match on DNA, multiple mappers randomly matched at a single position</option> + <option value="k_option">Match on DNA as fast as possible, without taking care of mapping issues (for raw annotation of reads)</option> + <option value="n_option">Match on DNA - RNAseq mode (-n bowtie option)</option> + <option value="a_option">Match and report all valid alignments</option> + </param> +<!-- END of which method will be used --> + <param name="v_mismatches" type="select" label="Number of mismatches allowed" help="specify the -v bowtie option"> + <option value="0">0</option> + <option value="1" selected="true">1</option> + <option value="2">2</option> + <option value="3">3</option> + </param> +<!-- bowtie index selection --> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact GED team"> + <options from_data_table="bowtie_indexes"> + <!-- <filter type="sort_by" column="2" /> + <validator type="no_options" message="No indexes are available" /> --> + </options> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" /> + </when> + </conditional> +<!-- END bowtie index selection --> + <param name="output_type" type="select" label="Select output format" help="Note that the BAM will be viewable in trackster only if you choose a full genome referenced for Trackster usage. see the doc below"> + <option value="tabular" select="true">tabular</option> + <option value="sam">sam</option> + <option value="bam">bam</option> + </param> + <param name="additional_fasta" type="select" label="additional fasta output" help="to get aligned and unaligned reads in fasta format"> + <option value="No" select="true">No</option> + <option value="al">aligned</option> + <option value="unal">unaligned</option> + <option value="al_and_unal">both aligned and unaligned</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="output" label="Bowtie Output"> + <change_format> + <when input="output_type" value="sam" format="sam" /> + <when input="output_type" value="bam" format="bam" /> + </change_format> +<!-- Set metadata based on reference genome --> + <actions> + <conditional name="refGenomeSource.genomeSource"> + <when value="indexed"> + <action type="metadata" name="dbkey"> + <option type="from_data_table" name="bowtie_indexes" column="1" offset="0"> + <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/> + <filter type="param_value" ref="refGenomeSource.index" column="0"/> + </option> + </action> + </when> + <when value="history"> + <action type="metadata" name="dbkey"> + <option type="from_param" name="refGenomeSource.ownFile" param_attribute="dbkey" /> + </action> + </when> + </conditional> + </actions> + </data> + <data format="fasta" name="aligned" label="Matched reads"> + <filter>additional_fasta == "al" or additional_fasta == "al_and_unal"</filter> + </data> + <data format="fasta" name="unaligned" label ="Unmatched reads"> + <filter>additional_fasta == "unal" or additional_fasta == "al_and_unal"</filter> + </data> + </outputs> + + <test> + <param name="genomeSource" value="indexed" /> + <param name="index" value="/home/galaxy/galaxy-dist/bowtie/Dmel/dmel-all-chromosome-r5.49" /> + <param name="method" value="multiple" /> + <param name="input" ftype="fasta" value="sRbowtie.fa" /> + <param name="v_mismatches" value="1" /> + <param name="output_type" value="tabular" /> + <output name="output" ftype="tabular" value="sRbowtie.out" /> + <output name="aligned" value="None" /> + <output name="unaligned" value="None" /> + </test> + + <help> + +**What it does** + +Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient. It is developed by Ben Langmead and Cole Trapnell. Please cite: Langmead B, Trapnell C, Pop M, Salzberg SL. Ultrafast and memory-efficient alignment of short DNA sequences to the human genome. Genome Biology 10:R25. + +.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml + +A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution. + +However, this Bowtie wrapper tool only takes FASTQ files as inputs. + +The sRbowtie wrapper specifically works with short reads FASTA inputs (-v bowtie mode) + +------ + +**OPTIONS** + +.. class:: infomark + +This script uses Bowtie to match reads on a reference index. + +Depending on the type of matching, different bowtie options are used: + +**Match on sense strand RNA reference index, multiple mappers randomly matched at a single position** + +Match on RNA reference, SENSE strand, randomly attributing multiple mapper to target with least mismatches, the polarity column is suppressed in the bowtie tabular report: + +*-v [0,1,2,3] -M 1 --best --strata -p 12 --norc --suppress 2,6,7,8* + +**Match unique mappers on DNA reference index** + +Match ONLY unique mappers on DNA reference index + +*-v [0,1,2,3] -m 1 -p 12 --suppress 6,7,8* + +Note that using this option with -v values other than 0 is questionnable... + +**Match on DNA, multiple mappers randomly matched at a single position** + +Match multiple mappers, randomly attributing multiple mapper to target with least mismatches, number of mismatch allowed specified by -v option: + +*-v [0,1,2,3] -M 1 --best --strata -p 12 --suppress 6,7,8* + +**Match on DNA as fast as possible, without taking care of mapping issues (for raw annotation of reads)** + +Match with highest speed, not guaranteeing best hit for speed gain: + +*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8* + + +----- + +**Input formats** + +.. class:: warningmark + +*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter* + +----- + +**OUTPUTS** + +If you choose tabular as the output format, you will obtain the matched reads in standard bowtie output format, having the following columns:: + + Column Description + -------- -------------------------------------------------------- + 1 FastaID fasta identifier + 2 polarity + or - depending whether the match was reported on the forward or reverse strand + 3 target name of the matched target + 4 Offset O-based coordinate of the miR on the miRBase pre-miR sequence + 5 Seq sequence of the matched Read + +If you choose SAM, you will get the output in unordered SAM format. + +.. class:: warningmark + +if you choose BAM, the output will be in sorted BAM format. +To be viewable in Trackster, several condition must be fulfilled: + +.. class:: infomark + +Reads must have been matched to a genome whose chromosome names are compatible with Trackster genome indexes + +.. class:: infomark + +the database/Build (dbkey) which is indicated for the dataset (Pencil - Database/Build field) must match a Trackster genome index. + +Please contact the Galaxy instance administrator if your genome is not referenced + +**Matched and unmatched fasta reads can be retrieved, for further analyses** + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieCascade.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation +# version 0.9 13-6-2014 +# Usage sRbowtie_cascade.py see Parser() for valid arguments +# Christophe Antoniewski <drosofff@gmail.com> + +import sys, os, subprocess, tempfile, shutil, argparse +from collections import defaultdict + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument('--output', action="store", type=str, help="output file") + the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads") + the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed") + the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether the index should be generated or not by bowtie-buid") + the_parser.add_argument('--index',nargs='+', help="paths to indexed or fasta references") + the_parser.add_argument('--indexName',nargs='+', help="Names of the indexes") + the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") + the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") + args = the_parser.parse_args() + return args + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def bowtie_squash(fasta): + tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes + ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) + ref_file_name = ref_file.name + ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name + os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name + cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir) + try: + FNULL = open(os.devnull, 'w') + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string + tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file + proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected in dev/null + returncode = proc.wait() + tmp_stderr.close() + FNULL.close() + sys.stdout.write(cmd1 + "\n") + except Exception, e: + # clean up temp dir + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + stop_err( 'Error indexing reference sequence\n' + str( e ) ) + # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment() + index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie fashion path without extention + return index_full_path + +def make_working_dir(): + working_dir = tempfile.mkdtemp() + return working_dir + +def Clean_TempDir(directory): + if os.path.exists( directory ): + shutil.rmtree( directory ) + return + +def bowtie_alignment(command_line="None", working_dir = ""): + FNULL = open(os.devnull, 'w') + p = subprocess.Popen(args=command_line, cwd=working_dir, shell=True, stderr=FNULL, stdout=FNULL) + returncode = p.wait() + sys.stdout.write("%s\n" % command_line) + FNULL.close() + #p = subprocess.Popen(["wc", "-l", "%s/al.fasta"%working_dir], cwd=working_dir, stdout=subprocess.PIPE) + #aligned = p.communicate()[0].split()[0] + aligned = 0 + F = open ("%s/al.fasta" % working_dir, "r") + for line in F: + aligned += 1 + F.close() + sys.stdout.write("Aligned: %s\n" % aligned) + return aligned/2 + +def CommandLiner (v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""): + return "bowtie -v %s -k 1 --best -p %s --al %s/al.fasta --un %s/unal.fasta --suppress 1,2,3,4,5,6,7,8 %s -f %s" % (v_mis, pslots, working_dir, working_dir, index, input) + +def __main__(): + args = Parser() + ## first we make all indexes available. They can be already available or be squashed by bowtie-build + ## we keep them in a list that alternates indexPath and "toClear" or "DoNotDelete" + BowtieIndexList = [] + for indexing_flags, bowtiePath in zip (args.indexing_flags, args.index): + if indexing_flags == "history": + BowtieIndexList.append ( bowtie_squash (bowtiePath) ) + BowtieIndexList.append ( "toClear" ) + else: + BowtieIndexList.append ( bowtiePath ) + BowtieIndexList.append ( "DoNotDelete") + ###### temporary Indexes are generated. They must be deleted at the end (after removing file name in the temp path) + ResultDict = defaultdict(list) + for label, input in zip(args.label, args.input): ## the main cascade, iterating over samples and bowtie indexes + workingDir = make_working_dir() + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[0], input=input, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) # first step of the cascade + if len(BowtieIndexList) > 2: # is there a second step to perform ? + os.rename("%s/al.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) ## end of first step. the aligned reads are the input of the next step + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[2], input="%s/toAlign.fasta"%workingDir, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )## second step of the cascade + if len(BowtieIndexList) > 4: ## remaining steps + for BowtieIndexPath in BowtieIndexList[4::2]: + os.rename("%s/unal.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) + cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexPath, input="%s/toAlign.fasta"%workingDir, working_dir=workingDir) + ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) + Fun = open("%s/unal.fasta"%workingDir, "r") ## to finish, compute the number of unmatched reads + n = 0 + for line in Fun: + n += 1 + ResultDict[label].append(n/2) + Fun.close() + Clean_TempDir (workingDir) # clean the sample working directory + ## cleaning + for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]): + if IndexFlag == "toClear": + Clean_TempDir ("/".join(IndexPath.split("/")[:-1])) + ## end of cleaning + + + + F = open (args.output, "w") + print >> F, "alignment reference\t%s" % "\t".join(args.label) + for i, reference in enumerate(args.indexName): + F.write ("%s" % reference) + for sample in args.label: + F.write ("\t%s" % "{:,}".format(ResultDict[sample][i]) ) + print >> F + F.write ("Remaining Unmatched") + for sample in args.label: + F.write ("\t%s" % "{:,}".format(ResultDict[sample][-1]) ) + print >> F + + F.close() + +if __name__=="__main__": __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieCascade.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,142 @@ +<tool id="sRbowtie_cascade" name="Annotate smRNA datasets" version="0.9.0"> + <description>Using iterative sRbowtie Alignments</description> + <requirements> + <requirement type='package'>bowtie</requirement> + </requirements> + <parallelism method="basic"></parallelism> + <command interpreter="python"> sRbowtieCascade.py --output $output + --num-threads \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie + --mismatch $mismatches + --input + #for $i in $input: + $i + #end for + --label + #for $i in $input: + "$i.name" + #end for + --index + #if $refGenomeSource1.genomeSource == "history": + $refGenomeSource1.ownFile + #else: + $refGenomeSource1.index.fields.path + #end if + #for $i in $AdditionalQueries: + #if $i.refGenomeSource.genomeSource == "history": + $i.refGenomeSource.ownFile + #else: + $i.refGenomeSource.index.fields.path + #end if + #end for + --indexing-flags + $refGenomeSource1.genomeSource + #for $i in $AdditionalQueries: + $i.refGenomeSource.genomeSource + #end for + --indexName + #if $refGenomeSource1.genomeSource == "history": + "$refGenomeSource1.ownFile.name" + #else: + "$refGenomeSource1.index.fields.name" + #end if + #for $i in $AdditionalQueries: + #if $i.refGenomeSource.genomeSource == "history": + "$i.refGenomeSource.ownFile.name" + #else: + "$i.refGenomeSource.index.fields.name" + #end if + #end for + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Input fasta file: reads clipped from their adapter" help="Only with clipped, raw fasta files" multiple="true"/> + <param name="mismatches" type="select" label="Number of mismatches allowed" help="specify the number of mismatches allowed during alignments"> + <option value="0">0</option> + <option value="1" selected="true">1</option> + <option value="2">2</option> + <option value="3">3</option> + </param> +<!-- First bowtie index selection --> + <conditional name="refGenomeSource1"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact GED team"> + <options from_data_table="bowtie_indexes"/> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" /> + </when> + </conditional> +<!-- End of first bowtie index selection --> +<!-- other bowtie index selections --> + <repeat name="AdditionalQueries" title="Additional Alignment Step"> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a DNA reference index" help="if your genome of interest is not listed - contact GED team"> + <options from_data_table="bowtie_indexes"/> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select a fasta file, to serve as index reference" /> + </when> + </conditional> + </repeat> +<!-- End of other bowtie index selections --> + </inputs> + <outputs> + <data format="tabular" name="output" label="Cascade Annotation Analysis"/> + </outputs> + + <test> + </test> + + <help> + +**Intro** + +Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient. +A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution. +However, this Bowtie wrapper tool only takes FASTQ files as inputs. + +Here The sRbowtie wrapper specifically works with short reads FASTA inputs (-v bowtie mode, with -k 1) + +.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml + + +------ + +**What it does** + +.. class:: infomark + +This script uses the sRbowtie wrapper to iteratively match reads on a reference indexes. + +Reads are Matched on DNA references as fast as possible, without taking care of mapping issues + +*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8* + +unaligned reads at step N are used input for sRbowtie at step N+1 + +----- + +**Input formats** + +.. class:: warningmark + +*The only accepted format for the script is a raw fasta list of reads, clipped from their adapter* + +----- + +**OUTPUTS** + +**Annotation table** + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieParser.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,35 @@ +#!/usr/bin/python +# python parser module to analyse sRbowtie alignments +# version 0.9 +# Usage sRbowtieParser.py <1:index source> <2:extraction directive> <3:outputL> <4:polarity> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib> + +import sys +from smRtools import * + +IndexSource = sys.argv[1] +ExtractionDirective = sys.argv[2] +if ExtractionDirective == "--do_not_extract_index": + genomeRefFormat = "fastaSource" +elif ExtractionDirective == "--extract_index": + genomeRefFormat = "bowtieIndex" +Output = sys.argv[3] +Polarity = sys.argv[4] # maybe "both", "forward", "reverse" +Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)] +MasterListOfGenomes = {} + +for [filePath, FileExt, FileLabel] in Triplets: + MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat) + +header = ["gene"] +for [filePath, FileExt, FileLabel] in Triplets: + header.append(FileLabel) + +F = open (sys.argv[3], "w") +print >> F, "\t".join(header) +for item in sorted (MasterListOfGenomes[header[1]].instanceDict.keys() ): + line=[item] + for sample in header[1:]: + count = str (MasterListOfGenomes[sample].instanceDict[item].readcount(polarity=Polarity)) + line.append(count) + print >> F, "\t".join(line ) +F.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sRbowtieParser.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,67 @@ +<tool id="sRbowtieParser" name="Parse items in sRbowtie alignment" version="0.9.1"> + <description></description> + <requirements><requirement type='package'>bowtie-inspect</requirement></requirements> + <parallelism method="basic"></parallelism> +<command interpreter="python"> + sRbowtieParser.py + #if $refGenomeSource.genomeSource == "history": + $refGenomeSource.ownFile ## index source + --do_not_extract_index + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + $reference ## index source + --extract_index + #end if + $output + $polarity + #for $i in $refGenomeSource.input_list + $i $i.ext "$i.name" + #end for + +</command> + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + </when> + <when value="history"> + <param name="ownFile" type="data" format="fasta" label="Select the fasta reference" /> + <param name="input_list" type="data" label="Select multiple alignments to parse" multiple="true"/> + </when> + </conditional> <!-- refGenomeSource --> + <param name="polarity" type="select" label="how to count sense and antisense reads"> + <option value="both">count both sense and antisense reads</option> + <option value="forward">count only sense reads</option> + <option value="reverse">count only antisense reads</option> + </param> + </inputs> + <outputs> + <data format="tabular" name="output" label="Read Count Lists"/> + </outputs> + <help> + +**What it does** + +Parses read counts from one or several sRBowtie alignments (in tabular, Sam or Bam format). + +Here a bowtie match done against an index composed of a set of items is parsed and expressed as a hit list of the corresponding items + +Sense, antisense or both sense and antisense alignments can be counted + +The library labels are infered from the input dataset names in the galaxy history. + +**It is thus essential that input datasets are appropriately renamed** + +**it is preferable that you do not put any space in this input dataset names. You may edit these names in the history** + + + + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/size_histogram.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,126 @@ +#!/usr/bin/python +# python parser module for size distributions, guided by GFF3 +# version 0.9.1 (1-6-2014) +# Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> +# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> +# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> + +import sys, subprocess, argparse +from smRtools import * +from collections import OrderedDict, defaultdict +import os + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") + the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") + the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") + the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") + the_parser.add_argument('--ext',nargs='+', help="input file type") + the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") + the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") + the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") + the_parser.add_argument('--minquery', type=int, help="Minimum readsize") + the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") + the_parser.add_argument('--rcode', type=str, help="R script") + the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calcilated for the sum of all items") + the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed") + args = the_parser.parse_args() + return args + +args=Parser() +if args.reference_fasta: + genomeRefFormat = "fastaSource" + genomeRefFile = args.reference_fasta +if args.reference_bowtie_index: + genomeRefFormat = "bowtieIndex" + genomeRefFile = args.reference_bowtie_index +size_distribution_file=args.output_size_distribution +minquery=args.minquery +maxquery=args.maxquery +Rcode = args.rcode +filePath=args.input +fileExt=args.ext +fileLabel=args.label +normalization_factor=args.normalization_factor +global_size=args.global_size +collapse=args.collapse + +if collapse: + pol=["both"] +else: + pol=["F", "R"] + +MasterListOfGenomes = OrderedDict() + +def process_samples(filePath): + for i, filePath in enumerate(filePath): + norm=normalization_factor[i] + print fileLabel[i] + MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ + biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) + return MasterListOfGenomes + +def write_size_distribution_dataframe(readDict, size_distribution_file): + with open(size_distribution_file, 'w') as size_distrib: + print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" + for sample in readDict.keys(): + if args.gff: + dict=readDict[sample] + else: + dict=readDict[sample].instanceDict + for gene in dict.keys(): + histogram = dict[gene].size_histogram() + for polarity in histogram.keys(): + for item in histogram[polarity].iteritems(): + print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample) + +def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]): + with open(size_distribution_file, 'w') as size_distrib: + print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" + for sample in readDict.keys(): + histogram = readDict[sample].size_histogram() + gene="sample" + for polarity in pol: + for item in histogram[polarity].iteritems(): + if polarity=="R": + print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], -item[1], sample) + else: + print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample) + +def gff_item_subinstances(readDict, gff3): + GFFinstanceDict=OrderedDict() + with open(gff3) as gff: + for line in gff: + if line[0] == "#": continue + gff_fields = line[:-1].split("\t") + chrom = gff_fields[0] + gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name + item_upstream_coordinate = int(gff_fields[3]) + item_downstream_coordinate = int(gff_fields[4]) + item_polarity = gff_fields[6] + for sample in readDict.keys(): + if not GFFinstanceDict.has_key(sample): + GFFinstanceDict[sample]={} + subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) + if item_polarity == '-': + subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} +# subinstance.readDict.setdefault(key, []) + subinstance.gene=gff_name + GFFinstanceDict[sample][gff_name]=subinstance + return GFFinstanceDict + +MasterListOfGenomes=process_samples(filePath) + +if args.gff: + MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) + +if global_size: + write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol) +else: + write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) + +R_command="Rscript "+ Rcode +process = subprocess.Popen(R_command.split()) +process.wait() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/size_histogram.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,200 @@ +<tool id="Size_histogram" name="Generate size histograms from alignment files" version="0.9.0"> + <description>from sRbowtie aligment</description> + <requirements><requirement type='package'>bowtie-inspect</requirement></requirements> + <parallelism method="basic"></parallelism> +<command interpreter="python"> + size_histogram.py + #if $refGenomeSource.genomeSource == "history": + --reference_fasta ## sys.argv[2] + $refGenomeSource.ownFile ## index source + #else: + #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1] + --reference_bowtie_index + $reference + #end if + --rcode + $plotCode + --output_size_distribution + $size_distribution_dataframe + --minquery + $minquery + --maxquery + $maxquery + --input + #for $i in $refGenomeSource.series + $i.input + #end for + --ext + #for $i in $refGenomeSource.series + $i.input.ext + #end for + --label + #for $i in $refGenomeSource.series + "$i.input.name" + #end for + --normalization_factor + #for $i in $refGenomeSource.series + $i.norm + #end for + #if $gff: + --gff + $gff + #end if + #if $global.value == 'yes': + --global_size + #end if + #if $collapsestrands.value == 'yes': + --collapse + #end if + +</command> + <inputs> + <conditional name="refGenomeSource"> + <param name="genomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <repeat name="series" title="Add alignment files"> + <param name="input" type="data" label="Select multiple alignments to parse"> + <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="database not set for this bowtie output. Select the database(=genome used for matching) manually, or select a reference fasta from your history."/> + </param> + <param name="norm" type="float" value="1" label="Indicate a normalization factor to compare multiple aligments"/> + </repeat> + </when> + <when value="history"> + <repeat name="series" title="Add alignment files"> + <param name="input" type="data" label="Select multiple alignments to parse"/> + <param name="norm" type="integer" value="1" label="Indicate a normalization factor to compare multiple aligments"/> + </repeat> + </when> + </conditional> + <param name="gff" type="data" optional="true" label="Optional: select a GFF to investigate regions of interest" help="GFF must match genome build"/> + <!-- <validator type="dataset_metadata_in_data_table" table_name="bowtie_indexes" metadata_name="dbkey" metadata_column="0" message="GFF database and alignment file databse do not match!"/> --> + <param name="global" type="select" label="Generate size distribution for each item, or generate a global alignment"> + <option value="no">for each item</option> + <option value="yes">global</option> + </param> + <param name="collapsestrands" type="select" label="Whether + and - reads should be collapsed or not"> + <option value="no">Do not collapse</option> + <option value="yes">Collapse + and - reads</option> + </param> + <param name="minquery" type="integer" size="3" value="18" label="Min size of reads to plot" help="'15' = 15 nucleotides"/> + <param name="maxquery" type="integer" size="3" value="28" label="Max size of reads to plot" help="'30' = 30 nucleotides"/> + <param name="title" type="text" size="15" value="Size distribution" label="Main Titles"/> + <param name="xlabel" type="text" size="15" value="Size in nucleotides" label="x axis label"/> + <param name="ylabel" type="text" size="15" value="Number of reads" label="y axis label"/> + <param name="rows_per_page" type="text" size="9" value="8" label="How many items to display per page?"> + <validator type="in_range" min="6" max="20" message="Select between 6 and 20 rows, as the readability will suffer otherwise."/> + </param> + </inputs> + <configfiles> + <configfile name="plotCode"> + ## Setup R error handling to go to stderr + options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + library(RColorBrewer) + library(lattice) + library(latticeExtra) + library(grid) + library(gridExtra) + ##cheetahtemplate data frame implementation + + size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL) + + n_samples=length(unique(size\$sample)) + genes=unique(levels(size\$gene)) + n_genes=length(genes) + + par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3, strip = .75), fontsize = list(text=96/${rows_per_page}, points=8)) + smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);} + + plot_size_distribution= function(df, ...) { + bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0, + horizontal=FALSE, + group=polarity, + stack=TRUE, + col=c('red', 'blue'), + strip = strip.custom(par.strip.text = list(cex = 0.5)), + cex=0.75, + scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75), + xlab = "readsize in nucleotides", + ylab = "${ylabel}", + main="${title}" , + as.table=TRUE, newpage = T, ...) + combineLimits(update(useOuterStrips(bc), layout=c(n_samples,${rows_per_page})), margin.x=F, margin.y=1) + } + + per_gene_size=lapply(genes, function(x) subset(size, gene==x)) + + global = "no" + #if $global.value == 'yes': + global = "yes" + #end if + + if (global=="no") { + options(warn=-1) + pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677*n_samples/4) + plot_size_distribution(size, par.settings=par.settings.size, prepanel=smR.prepanel) + } else { + pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677) + bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample)), data = size, origin = 0, + horizontal=FALSE, + group=polarity, + stack=TRUE, + col=c('red', 'blue'), + cex=0.75, + par.settings=list(fontsize = list(text=8, points=8)), + scales=list(y=list(tick.number=4, rot=90, relation="same"), cex=0.75), + xlab = "readsize in nucleotides", + ylab = "${ylabel}", + main="${title}" , as.table=TRUE, newpage = T, + aspect=0.5) + #layout=c(n_samples, ${rows_per_page})) + bc + } + devname=dev.off() + + </configfile> + </configfiles> + + <outputs> + <data format="tabular" name="size_distribution_dataframe" label="Size distributionn dataframe"/> + <data format="pdf" name="size_PDF" label="Size distribution"/> + </outputs> +<help> + +**What it does** + +Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces a histogram of read sizes, +where by default for each "chromosome" a histogram of read sizes is drawn. +Reads that map in sense are on the top (red), reads that map antisense are on the bottom (blue). + + +.. class:: warningmark + +'''TIP''' The input data can be produced using the sRbowtie tool. + +---- + +'''Example''' + +Query sequence:: +For a SAM file as the following: + + 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0 + + 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 + + 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0 + +produce a plot like this: + +---- + +.. image:: static/images/size_histogram.png + :height: 800 + :width: 500 + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smRtools.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,704 @@ +#!/usr/bin/python +# version 1 7-5-2012 unification of the SmRNAwindow class + +import sys, subprocess +from collections import defaultdict +from numpy import mean, median, std +from scipy import stats + +def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"): + '''This function will return a dictionary containing fasta identifiers as keys and the + sequence as values. Index must be the path to a fasta file.''' + p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines + outputlines = p.stdout.readlines() + p.wait() + item_dic = {} + for line in outputlines: + if (line[0] == ">"): + try: + item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item + except: pass + current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers ! + item_dic[current_item] = "" + stringlist=[] + else: + stringlist.append(line.rstrip() ) + item_dic[current_item] = "".join(stringlist) # for the last item + return item_dic + +def get_fasta_headers (index): + p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines + outputlines = p.stdout.readlines() + p.wait() + item_dic = {} + for line in outputlines: + header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers ! + item_dic[header] = 1 + return item_dic + + +def get_file_sample (file, numberoflines): + '''import random to use this function''' + F=open(file) + fullfile = F.read().splitlines() + F.close() + if len(fullfile) < numberoflines: + return "sample size exceeds file size" + return random.sample(fullfile, numberoflines) + +def get_fasta_from_history (file): + F = open (file, "r") + item_dic = {} + for line in F: + if (line[0] == ">"): + try: + item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item + except: pass + current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers ! + item_dic[current_item] = "" + stringlist=[] + else: + stringlist.append(line[:-1]) + item_dic[current_item] = "".join(stringlist) # for the last item + return item_dic + +def antipara (sequence): + antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"} + revseq = sequence[::-1] + return "".join([antidict[i] for i in revseq]) + +def RNAtranslate (sequence): + return "".join([i if i in "AGCN" else "U" for i in sequence]) +def DNAtranslate (sequence): + return "".join([i if i in "AGCN" else "T" for i in sequence]) + +def RNAfold (sequence_list): + thestring= "\n".join(sequence_list) + p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + output=p.communicate(thestring)[0] + p.wait() + output=output.split("\n") + if not output[-1]: output = output[:-1] # nasty patch to remove last empty line + buffer=[] + for line in output: + if line[0] in ["N","A","T","U","G","C"]: + buffer.append(DNAtranslate(line)) + if line[0] in ["(",".",")"]: + fields=line.split("(") + energy= fields[-1] + energy = energy[:-1] # remove the ) parenthesis + energy=float(energy) + buffer.append(str(energy)) + return dict(zip(buffer[::2], buffer[1::2])) + +def extractsubinstance (start, end, instance): + ''' Testing whether this can be an function external to the class to save memory''' + subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start) + subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1) + upcoordinate = [i for i in range(start,end+1) if instance.readDict.has_key(i) ] + downcoordinate = [-i for i in range(start,end+1) if instance.readDict.has_key(-i) ] + for i in upcoordinate: + subinstance.readDict[i]=instance.readDict[i] + for i in downcoordinate: + subinstance.readDict[i]=instance.readDict[i] + return subinstance + +class HandleSmRNAwindows: + def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0): + self.biosample = biosample + self.alignmentFile = alignmentFile + self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam" + self.genomeRefFile = genomeRefFile + self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource" + self.alignedReads = 0 + self.instanceDict = {} + self.size_inf=size_inf + self.size_sup=size_sup + self.norm=norm + if genomeRefFormat == "bowtieIndex": + self.itemDict = get_fasta (genomeRefFile) + elif genomeRefFormat == "fastaSource": + self.itemDict = get_fasta_from_history (genomeRefFile) + for item in self.itemDict: + self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create as many instances as there is items + self.readfile() + + def readfile (self) : + if self.alignmentFileFormat == "tabular": + F = open (self.alignmentFile, "r") + for line in F: + fields = line.split() + polarity = fields[1] + gene = fields[2] + offset = int(fields[3]) + size = len (fields[4]) + if self.size_inf: + if (size>=self.size_inf and size<= self.size_sup): + self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow + self.alignedReads += 1 + else: + self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow + self.alignedReads += 1 + F.close() + return self.instanceDict +# elif self.alignmentFileFormat == "sam": +# F = open (self.alignmentFile, "r") +# dict = {"0":"+", "16":"-"} +# for line in F: +# if line[0]=='@': +# continue +# fields = line.split() +# if fields[2] == "*": continue +# polarity = dict[fields[1]] +# gene = fields[2] +# offset = int(fields[3]) +# size = len (fields[9]) +# if self.size_inf: +# if (size>=self.size_inf and size<= self.size_sup): +# self.instanceDict[gene].addread (polarity, offset, size) +# self.alignedReads += 1 +# else: +# self.instanceDict[gene].addread (polarity, offset, size) +# self.alignedReads += 1 +# F.close() + elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam": + import pysam + samfile = pysam.Samfile(self.alignmentFile) + for read in samfile: + if read.tid == -1: + continue # filter out unaligned reads + if read.is_reverse: + polarity="-" + else: + polarity="+" + gene = samfile.getrname(read.tid) + offset = read.pos + size = read.qlen + if self.size_inf: + if (size>=self.size_inf and size<= self.size_sup): + self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow + self.alignedReads += 1 + else: + self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow + self.alignedReads += 1 + return self.instanceDict + + def size_histogram (self): + size_dict={} + size_dict['F']= defaultdict (int) + size_dict['R']= defaultdict (int) + size_dict['both'] = defaultdict (int) + for item in self.instanceDict: + buffer_dict_F = self.instanceDict[item].size_histogram()['F'] + buffer_dict_R = self.instanceDict[item].size_histogram()['R'] + for size in buffer_dict_F: + size_dict['F'][size] += buffer_dict_F[size] + for size in buffer_dict_R: + size_dict['R'][size] -= buffer_dict_R[size] + allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) ) + for size in allSizeKeys: + size_dict['both'][size] = size_dict['F'][size] + size_dict['R'][size] + return size_dict + + def CountFeatures (self, GFF3="path/to/file"): + featureDict = defaultdict(int) + F = open (GFF3, "r") + for line in F: + if line[0] == "#": continue + fields = line[:-1].split() + chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6] + featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive") + F.close() + return featureDict + +class SmRNAwindow: + + def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0): + self.biosample = biosample + self.sequence = sequence + self.gene = gene + self.windowoffset = windowoffset + self.size = len(sequence) + self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...} + self.matchedreadsUp = 0 + self.matchedreadsDown = 0 + self.norm=norm + + def addread (self, polarity, offset, size): + '''ATTENTION ATTENTION ATTENTION''' + ''' We removed the conversion from 0 to 1 based offset, as we do this now during readparsing.''' + if polarity == "+": + self.readDict[offset].append(size) + self.matchedreadsUp += 1 + else: + self.readDict[-(offset + size -1)].append(size) + self.matchedreadsDown += 1 + return + + def barycenter (self, upstream_coord=None, downstream_coord=None): + '''refactored 24-12-2013 to save memory and introduce offset filtering see readcount method for further discussion on that + In this version, attempt to replace the dictionary structure by a list of tupple to save memory too''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + window_size = downstream_coord - upstream_coord +1 + def weigthAverage (TuppleList): + weightSum = 0 + PonderWeightSum = 0 + for tuple in TuppleList: + PonderWeightSum += tuple[0] * tuple[1] + weightSum += tuple[1] + if weightSum > 0: + return PonderWeightSum / float(weightSum) + else: + return 0 + forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window + reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window + Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size + Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size + return Fbarycenter, Rbarycenter + + def correlation_mapper (self, reference, window_size): + '''to map correlation with a sliding window 26-2-2013''' + if window_size > self.size: + return [] + F=open(reference, "r") + reference_forward = [] + reference_reverse = [] + for line in F: + fields=line.split() + reference_forward.append(int(float(fields[1]))) + reference_reverse.append(int(float(fields[2]))) + F.close() + local_object_forward=[] + local_object_reverse=[] + ## Dict to list for the local object + for i in range(1, self.size+1): + local_object_forward.append(len(self.readDict[i])) + local_object_reverse.append(len(self.readDict[-i])) + ## start compiling results by slides + results=[] + for coordinate in range(self.size - window_size): + local_forward=local_object_forward[coordinate:coordinate + window_size] + local_reverse=local_object_reverse[coordinate:coordinate + window_size] + if sum(local_forward) == 0 or sum(local_reverse) == 0: + continue + try: + reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward) + reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse) + if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2): + results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]]) + except: + pass + return results + + def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"): + '''refactored 24-12-2013 to save memory and introduce offset filtering + take a look at the defaut parameters that cannot be defined relatively to the instance are they are defined before instanciation + the trick is to pass None and then test + polarity parameter can take "both", "forward" or "reverse" as value''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both": + return self.matchedreadsUp + self.matchedreadsDown + if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward": + return self.matchedreadsUp + if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse": + return self.matchedreadsDown + n=0 + if polarity == "both": + for offset in xrange(upstream_coord, downstream_coord+1): + if self.readDict.has_key(offset): + for read in self.readDict[offset]: + if (read>=size_inf and read<= size_sup): + n += 1 + if method != "conservative": + del self.readDict[offset] ## Carefull ! precludes re-use on the self.readDict dictionary !!!!!! TEST + if self.readDict.has_key(-offset): + for read in self.readDict[-offset]: + if (read>=size_inf and read<= size_sup): + n += 1 + if method != "conservative": + del self.readDict[-offset] + return n + elif polarity == "forward": + for offset in xrange(upstream_coord, downstream_coord+1): + if self.readDict.has_key(offset): + for read in self.readDict[offset]: + if (read>=size_inf and read<= size_sup): + n += 1 + return n + elif polarity == "reverse": + for offset in xrange(upstream_coord, downstream_coord+1): + if self.readDict.has_key(-offset): + for read in self.readDict[-offset]: + if (read>=size_inf and read<= size_sup): + n += 1 + return n + + def readsizes (self): + '''return a dictionary of number of reads by size (the keys)''' + dicsize = {} + for offset in self.readDict: + for size in self.readDict[offset]: + dicsize[size] = dicsize.get(size, 0) + 1 + for offset in range (min(dicsize.keys()), max(dicsize.keys())+1): + dicsize[size] = dicsize.get(size, 0) # to fill offsets with null values + return dicsize + + def size_histogram(self): + norm=self.norm + hist_dict={} + hist_dict['F']={} + hist_dict['R']={} + for offset in self.readDict: + for size in self.readDict[offset]: + if offset < 0: + hist_dict['R'][size] = hist_dict['R'].get(size, 0) - 1*norm + else: + hist_dict['F'][size] = hist_dict['F'].get(size, 0) + 1*norm + return hist_dict + + def statsizes (self, upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates + see the readcount method for further discussion''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + L = [] + for offset in self.readDict: + if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue + for size in self.readDict[offset]: + L.append(size) + meansize = mean(L) + stdv = std(L) + mediansize = median(L) + return meansize, mediansize, stdv + + def foldEnergy (self, upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates + see the readcount method for further discussion''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ]) + return float(Energy[self.sequence[upstream_coord-1:downstream_coord]]) + + def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates + see the readcount method for further discussion. size_scope must be an interable''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0} + convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} + for offset in self.readDict: + if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue + for size in self.readDict[offset]: + if size in size_scope: + startbase = self.sequence[abs(offset)-self.windowoffset] + if offset < 0: + startbase = convertDic[startbase] + freqDic[startbase] += 1 + base_sum = float ( sum( freqDic.values()) ) + if base_sum == 0: + return "." + else: + return freqDic["T"] / base_sum * 100 + + def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates + see the readcount method for further discussion. size_scope must be an interable + This method is similar to the Ufreq method but take strandness into account''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0} + convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"} + for offset in self.readDict: + if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue + for size in self.readDict[offset]: + if size in size_scope: + startbase = self.sequence[abs(offset)-self.windowoffset] + if offset < 0: + startbase = convertDic[startbase] + freqDic[startbase+"rev"] += 1 + else: + freqDic[startbase+"for"] += 1 + forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"]) + reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"]) + if forward_sum == 0 and reverse_sum == 0: + return ". | ." + elif reverse_sum == 0: + return "%s | ." % (freqDic["Tfor"] / forward_sum * 100) + elif forward_sum == 0: + return ". | %s" % (freqDic["Trev"] / reverse_sum * 100) + else: + return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100) + + + def readplot (self): + norm=self.norm + readmap = {} + for offset in self.readDict.keys(): + readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm ) + mylist = [] + for offset in sorted(readmap): + if readmap[offset][1] != 0: + mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") ) + if readmap[offset][0] != 0: + mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") ) + return mylist + + def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None): + '''Use by MirParser tool''' + upstream_coord = upstream_coord or 1 + downstream_coord = downstream_coord or self.size + windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord) + forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)]) + totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward") + totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse") + if totalforward > totalreverse: + majorcoverage = "forward" + for offset in self.readDict.keys(): + if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ): + for read in self.readDict[offset]: + for i in xrange(read): + try: + forORrev_coverage[offset-upstream_coord+1+i] += 1 + except KeyError: + continue # a sense read may span over the downstream limit + else: + majorcoverage = "reverse" + for offset in self.readDict.keys(): + if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ): + for read in self.readDict[offset]: + for i in xrange(read): + try: + forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage + except KeyError: + continue # an antisense read may span over the upstream limit + output_list = [] + maximum = max (forORrev_coverage.values()) or 1 + for n in sorted (forORrev_coverage): + output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage)) + return "\n".join(output_list) + + + def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates + see the readcount method for further discussion + scope must be a python iterable; scope define the *relative* offset range to be computed''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + query_range = range (minquery, maxquery+1) + target_range = range (mintarget, maxtarget+1) + Query_table = {} + Target_table = {} + frequency_table = dict ([(i, 0) for i in scope]) + for offset in self.readDict: + if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue + for size in self.readDict[offset]: + if size in query_range: + Query_table[offset] = Query_table.get(offset, 0) + 1 + if size in target_range: + Target_table[offset] = Target_table.get(offset, 0) + 1 + for offset in Query_table: + for i in scope: + frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0)) + if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py + frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table]) + if zscore == "yes": + z_mean = mean(frequency_table.values() ) + z_std = std(frequency_table.values() ) + if z_std == 0: + frequency_table = dict([(i,0) for i in frequency_table] ) + else: + frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] ) + return frequency_table + + def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None): + ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion + note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed''' + upstream_coord = upstream_coord or self.windowoffset + downstream_coord = downstream_coord or self.windowoffset+self.size-1 + query_range = range (minquery, maxquery+1) + target_range = range (mintarget, maxtarget+1) + Query_table = {} + Target_table = {} + Total_Query_Numb = 0 + general_frequency_table = dict ([(i,0) for i in scope]) + ## filtering the appropriate reads for the study + for offset in self.readDict: + if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue + for size in self.readDict[offset]: + if size in query_range: + Query_table[offset] = Query_table.get(offset, 0) + 1 + Total_Query_Numb += 1 + if size in target_range: + Target_table[offset] = Target_table.get(offset, 0) + 1 + for offset in Query_table: + frequency_table = dict ([(i,0) for i in scope]) + number_of_targets = 0 + for i in scope: + frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0) + number_of_targets += Target_table.get(-offset -i +1, 0) + for i in scope: + try: + general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i] + except ZeroDivisionError : + continue + return general_frequency_table + + def phasing (self, size_range, scope): + ''' to calculate autocorelation like signal - scope must be an python iterable''' + read_table = {} + total_read_number = 0 + general_frequency_table = dict ([(i, 0) for i in scope]) + ## read input filtering + for offset in self.readDict: + for size in self.readDict[offset]: + if size in size_range: + read_table[offset] = read_table.get(offset, 0) + 1 + total_read_number += 1 + ## per offset read phasing computing + for offset in read_table: + frequency_table = dict ([(i, 0) for i in scope]) # local frequency table + number_of_targets = 0 + for i in scope: + if offset > 0: + frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0) + number_of_targets += read_table.get(offset + i, 0) + else: + frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0) + number_of_targets += read_table.get(offset - i, 0) + ## inclusion of local frequency table in the general frequency table (all offsets average) + for i in scope: + try: + general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i] + except ZeroDivisionError : + continue + return general_frequency_table + + + + def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope): + '''Must do: from numpy import mean, std, to use this method; scope must be a python iterable and defines the relative offsets to compute''' + frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope) + z_table = {} + frequency_list = [frequency_table[i] for i in sorted (frequency_table)] + if std(frequency_list): + meanlist = mean(frequency_list) + stdlist = std(frequency_list) + z_list = [(i-meanlist)/stdlist for i in frequency_list] + return dict (zip (sorted(frequency_table), z_list) ) + else: + return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) ) + + def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope): + frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope) + total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery)+range(mintarget,maxtarget))]) ) + if total == 0: + return dict( [(i,0) for i in scope]) + return dict( [(i, frequency_table[i]/total*100) for i in scope]) + + def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget): + queryhash = defaultdict(list) + targethash = defaultdict(list) + query_range = range (int(minquery), int(maxquery)+1) + target_range = range (int(mintarget), int(maxtarget)+1) + paired_sequences = [] + for offset in self.readDict: # selection of data + for size in self.readDict[offset]: + if size in query_range: + queryhash[offset].append(size) + if size in target_range: + targethash[offset].append(size) + for offset in queryhash: + if offset >= 0: matched_offset = -offset - overlap + 1 + else: matched_offset = -offset - overlap + 1 + if targethash[matched_offset]: + paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) ) + if offset >= 0: + for i in range (paired): + paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) ) + paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) ) + if offset < 0: + for i in range (paired): + paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) ) + paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) ) + return paired_sequences + + def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget): + queryhash = defaultdict(list) + targethash = defaultdict(list) + query_range = range (int(minquery), int(maxquery)+1) + target_range = range (int(mintarget), int(maxtarget)+1) + paired_sequences = [] + + for offset in self.readDict: # selection of data + for size in self.readDict[offset]: + if size in query_range: + queryhash[offset].append(size) + if size in target_range: + targethash[offset].append(size) + + for offset in queryhash: + matched_offset = -offset - overlap + 1 + if targethash[matched_offset]: + if offset >= 0: + for i in queryhash[offset]: + paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) ) + for i in targethash[matched_offset]: + paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) ) + if offset < 0: + for i in queryhash[offset]: + paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) ) + for i in targethash[matched_offset]: + paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) ) + return paired_sequences + + def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget): + ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie is 1-based cooordinates whereas python strings are 0-based coordinates''' + queryhash = defaultdict(list) + targethash = defaultdict(list) + query_range = range (int(minquery), int(maxquery)+1) + target_range = range (int(mintarget), int(maxtarget)+1) + bowtie_output = [] + + for offset in self.readDict: # selection of data + for size in self.readDict[offset]: + if size in query_range: + queryhash[offset].append(size) + if size in target_range: + targethash[offset].append(size) + counter = 0 + for offset in queryhash: + matched_offset = -offset - overlap + 1 + if targethash[matched_offset]: + if offset >= 0: + for i in queryhash[offset]: + counter += 1 + bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # attention a la base 1-0 de l'offset + if offset < 0: + for i in queryhash[offset]: + counter += 1 + bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # attention a la base 1-0 de l'offset + return bowtie_output + + +def __main__(bowtie_index_path, bowtie_output_path): + sequenceDic = get_fasta (bowtie_index_path) + objDic = {} + F = open (bowtie_output_path, "r") # F is the bowtie output taken as input + for line in F: + fields = line.split() + polarity = fields[1] + gene = fields[2] + offset = int(fields[3]) + size = len (fields[4]) + try: + objDic[gene].addread (polarity, offset, size) + except KeyError: + objDic[gene] = SmRNAwindow(gene, sequenceDic[gene]) + objDic[gene].addread (polarity, offset, size) + F.close() + for gene in objDic: + print gene, objDic[gene].pairer(19,19,23,19,23) + +if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sRbowtie.fa Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,40 @@ +>1 +GTATTGTAAGTGGCAGAGTGGC +>2 +TGGAATGTAAAGAAGTATGGAG +>3 +GTGGGGAGTTTGGATGGGGCGGCA +>4 +AATGGCACTGGAAGAATTCACGG +>5 +GTACGGACAAGGGGAATC +>6 +TTGGGTTCTGGGGGGAGTATGG +>7 +GTGGGGAGTTTCGCTGGGGCGGCA +>8 +TAAGGGTCGGGTAGTGAGGGC +>9 +AGCTGGGACTGAGGACTG +>10 +AGCTGGGACTGAGGACTGC +>11 +AAGGGTCGGGTAGTGAGG +>12 +GTCGGGTAGTGAGGGCCTT +>13 +TGGTGGGGCTTGGAACAATTGGAGGGC +>14 +TGACGGAAGGGCACCACC +>15 +TGGAATGTAAAGAAGTATGGAG +>16 +TTGGGTTCTGGGGGGAGT +>17 +TCAGGTGGGGAGTTTGGCTGGGGCGGCACA +>18 +TTGGGTATAGGGGCGAAAGA +>19 +AGCGGGCGTGCTTGTGGAC +>20 +GTCGAATTTGGGTATAGGGGC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sRbowtie.out Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,5 @@ +2 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG +4 - 2L 11953463 CCGTGAATTCTTCCAGTGCCATT +15 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG +14 - Uextra 7115665 GGTGGTGCCCTTCCGTCA +18 + Uextra 7726410 TTGGGTATAGGGGCGAAAGA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac.fastq Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,40 @@ +@SRR290479.1 HWI-EAS285:2:1:66:28/1 +TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG ++ +B@BBCBCCBCBCCC8A<@################## +@SRR290479.2 HWI-EAS285:2:1:67:348/1 +AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG ++ +BAA@7?A@@A@@B<'25?6>59:;7#<?######## +@SRR290479.3 HWI-EAS285:2:1:68:826/1 +TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG ++ +BB@BBCCBCCBBB:%%83/>B7@44#;;324'117? +@SRR290479.4 HWI-EAS285:2:1:68:65/1 +ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT ++ +BBB@@ABAAB?9B42&9;################## +@SRR290479.5 HWI-EAS285:2:1:69:594/1 +AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG ++ +AB?5;3>/=?>=;416481################# +@SRR290479.6 HWI-EAS285:2:1:70:700/1 +TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT ++ +BCB=:ACCBB=>BB8<-################### +@SRR290479.7 HWI-EAS285:2:1:70:1679/1 +TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC ++ +BCBBCCBCCCBCCA?AB>:B@><>############ +@SRR290479.8 HWI-EAS285:2:1:71:1400/1 +AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT ++ +7@BC>>@55CCBCA3CBA14B.A16#*;9359B### +@SRR290479.9 HWI-EAS285:2:1:71:795/1 +TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG ++ +BBBBBCBBCB;>AA',9=18?1:7:#<;57###### +@SRR290479.10 HWI-EAS285:2:1:71:596/1 +TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG ++ +B@B>7>9A@<46B@79972#################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac.out Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,12 @@ +>1 +TGTAAACATCCCCGACTGGCAGC +>2 +AAAGTGCTACTACTTTTGAGTCT +>3 +ACTGGACTTGGAGTCCGAAGGC +>4 +AAGTGCCGCCAGGTTTTGAGTGG +>5 +TATTGCACTTGTCCCGGCCTGAATCNCGT +>6 +TAGCTTATCAGACTGATGTTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,67 @@ +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/all_fasta.loc" /> + </table> + <!-- Locations of indexes in the BFAST mapper format --> + <table name="bfast_indexes" comment_char="#"> + <columns>value, dbkey, formats, name, path</columns> + <file path="tool-data/bfast_indexes.loc" /> + </table> + <!-- Locations of nucleotide (mega)blast databases --> + <table name="blastdb" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/blastdb.loc" /> + </table> + <!-- Locations of protein (mega)blast databases --> + <table name="blastdb_p" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/blastdb_p.loc" /> + </table> + <!-- Locations of indexes in the BWA mapper format --> + <table name="bwa_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bwa_index.loc" /> + </table> + <!-- Locations of indexes in the BWA color-space mapper format --> + <table name="bwa_indexes_color" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bwa_index_color.loc" /> + </table> + <!-- Locations of MAF files that have been indexed with bx-python --> + <table name="indexed_maf_files"> + <columns>name, value, dbkey, species</columns> + <file path="tool-data/maf_index.loc" /> + </table> + <!-- Locations of fasta files appropriate for NGS simulation --> + <table name="ngs_sim_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/ngs_sim_fasta.loc" /> + </table> + <!-- Locations of PerM base index files --> + <table name="perm_base_indexes" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/perm_base_index.loc" /> + </table> + <!-- Locations of PerM color-space index files --> + <table name="perm_color_indexes" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/perm_color_index.loc" /> + </table> + <!-- Location of Picard dict file and other files --> + <table name="picard_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/picard_index.loc" /> + </table> + <!-- Location of SRMA dict file and other files --> + <table name="srma_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/picard_index.loc" /> + </table> + <!-- Location of Mosaik files --> + <table name="mosaik_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/mosaik_index.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/yac.py Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,83 @@ +#!/usr/bin/python +# yac = yet another clipper +# v 1.0.0 +# Usage yac.py $input $output $adapter_to_clip $min $max $Nmode +# Christophe Antoniewski <drosofff@gmail.com> + +import sys, string + +class Clip: + def __init__(self, inputfile, outputfile, adapter, minsize, maxsize): + self.inputfile = inputfile + self.outputfile = outputfile + self.adapter = adapter + self.minsize = int(minsize) + self.maxsize = int(maxsize) + def motives (sequence): + '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' + sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match + dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"} + for pos in enumerate(sequence[:6]): + for subst in dicsubst[pos[1]]: + sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7]) + return sequencevariants + self.adaptmotifs= motives(self.adapter) + + def scanadapt(self, adaptmotives=[], sequence=""): + '''scans sequence for adapter motives''' + if sequence.rfind(adaptmotives[0]) != -1: + return sequence[:sequence.rfind(adaptmotives[0])] + for motif in adaptmotives[1:]: + if sequence.rfind(motif) != -1: + return sequence[:sequence.rfind(motif)] + return sequence + + def clip_with_N (self): + '''clips adapter sequences from inputfile. + Reads containing N are retained.''' + iterator = 0 + id = 0 + F = open (self.inputfile, "r") + O = open (self.outputfile, "w") + for line in F: + iterator += 1 + if iterator % 4 == 2: + trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) + if self.minsize <= len(trim) <= self.maxsize: + id += 1 + print >> O, ">%i\n%s" % (id, trim) + F.close() + O.close() + def clip_without_N (self): + '''clips adapter sequences from inputfile. + Reads containing N are rejected.''' + iterator = 0 + id = 0 + F = open (self.inputfile, "r") + O = open (self.outputfile, "w") + for line in F: + iterator += 1 + if iterator % 4 == 2: + trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) + if "N" in trim: continue + if self.minsize <= len(trim) <= self.maxsize: + id += 1 + print >> O, ">%i\n%s" % (id, trim) + F.close() + O.close() + +def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode): + instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize) + if Nmode == "accept": + instanceClip.clip_with_N() + else: + instanceClip.clip_without_N() + +if __name__ == "__main__" : + input = sys.argv[1] + output = sys.argv[2] + adapter = sys.argv[3] + minsize = sys.argv[4] + maxsize = sys.argv[5] + Nmode = sys.argv[6] + __main__(input, output, adapter, minsize, maxsize, Nmode)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/yac.xml Tue Jun 24 12:16:43 2014 -0400 @@ -0,0 +1,58 @@ + <tool id="yac" name="Clip adapter" version="1.0.0"> + <description></description> + <command interpreter="python">yac.py $input $output $clip_source.clip_sequence $min $max $Nmode</command> + <inputs> + <param format="fastq" name="input" type="data" label="Source file"/> + <param name="min" type="integer" size="4" value="15" label="min size"/> + <param name="max" type="integer" size="4" value="36" label="max size"/> + <param name="Nmode" type="select" label="Reads containing Nst"> + <option value="accept" selected="True">accept</option> + <option value="reject">reject</option> + </param> + <conditional name="clip_source"> + <param name="clip_source_list" type="select" label="Source" help="Built-in adapters or User-provided"> + <option value="prebuilt" selected="True">Use a built-in adapter (select from the list below)</option> + <option value="user">Use custom sequence</option> + </param> + <when value="prebuilt"> + <param name="clip_sequence" type="select" label="Select Adapter to clip" help="if your adapter is not listed, input your own sequence"> + <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option> + <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option> + <option value="TGGAATTCTCGGGTGCCAAG" selected="True">Illumina TruSeq TGGAATTCTCGGGTGCCAAG</option> + <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option> + </param> + </when> + <when value="user"> + <param name="clip_sequence" type="text" size="35" label="Enter your Sequence" value="GAATCC"/> + </when> + </conditional> + </inputs> + <outputs> + <data format="fasta" name="output" metadata="input" /> + </outputs> + + <help> +<!-- write a decent doc ! --> +This tool clips adapter sequences from a fastq file and fasta file of clipped reads with renumbered fasta headers. + +Clipped sequences with Ns can be discarded. + +Min size and max size filter clipped reads on their size. + +Note that unclipped reads that satisfy the min and max size conditions are kept. + </help> + +<!-- write a <test> section --> + <tests> + <test> + <param name="input" value="yac.fastq" ftype="fastqsanger"/> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <output name="output" file="yac.out" /> + </test> + </tests> + +</tool>