# HG changeset patch
# User drosofff
# Date 1403621933 14400
# Node ID f9032a86667566332fc70b4feded940b4897c9e2
# Parent f777cbc82f982ddfe70a908c707320c055e71283
Deleted selected files
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/FeaturesParser.py
--- a/mississippi_gcc/FeaturesParser.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-# python parser module to analyse Features in sRbowtie alignments (guided by a GFF3 file)
-# version 0.9
-# Usage FeaturesParser.py <1:index source> <2:extraction directive> <3:output> <4:GFF3 guide file> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib>
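-# A hypothetical invocation (paths and labels are illustrative only):
-# FeaturesParser.py dmel.fasta --do_not_extract_index counts.tab dmel.gff3 sample1.tab tabular WT sample2.tab tabular Mut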
-
-import sys
-from smRtools import *
-from collections import *
-
-IndexSource = sys.argv[1]
-ExtractionDirective = sys.argv[2]
-if ExtractionDirective == "--do_not_extract_index":
- genomeRefFormat = "fastaSource"
-elif ExtractionDirective == "--extract_index":
- genomeRefFormat = "bowtieIndex"
-Output = sys.argv[3]
-GFF3_file = sys.argv[4]
-Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)]
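-# e.g. sys.argv[5:] == ["s1.tab","tabular","WT","s2.tab","tabular","Mut"] yields [["s1.tab","tabular","WT"], ["s2.tab","tabular","Mut"]]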
-MasterListOfGenomes = {}
-FeatureDict = defaultdict(dict)
-
-for [filePath, FileExt, FileLabel] in Triplets:
- MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat)
- FeatureDict[FileLabel] = MasterListOfGenomes[FileLabel].CountFeatures(GFF3=GFF3_file)
-
-# pick up the GFF3 features in their order of appearance.
-F = open(GFF3_file, "r")
-featureList = []
-for line in F:
- if line[0] == "#": continue
- feature = line.split()[2]
- if feature not in featureList:
- featureList.append(feature)
-F.close()
-
-header = ["#Feature"]
-for [filePath, FileExt, FileLabel] in Triplets:
- header.append(FileLabel)
-
-F = open (sys.argv[3], "w")
-print >> F, "\t".join(header)
-for feature in featureList:
- line=[feature]
- for sample in header[1:]:
- count = str (FeatureDict[sample][feature])
-# uncomment to get percentage in addition to counts
-# percent = float(FeatureDict[sample][feature]) / MasterListOfGenomes[sample].alignedReads
-# value = "%s | %0.2f" % (count, percent)
-# line.append(value)
- line.append(count)
- print >> F, "\t".join(line )
-line = ["Unfeatured"]
-for sample in header[1:]:
- matched = 0
- for feature in FeatureDict[sample]:
- matched += FeatureDict[sample][feature]
- unmatched = MasterListOfGenomes[sample].alignedReads - matched
-# uncomment to get percentage in addition to counts
-# percent = float (unmatched) / (matched + unmatched)
-# value = "%s | %0.2f" % (unmatched, percent)
-# line.append(value)
- line.append("%s" % unmatched)
-print >> F, "\t".join(line)
-F.close()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/FeaturesParser.xml
--- a/mississippi_gcc/FeaturesParser.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-
- in sRbowtie alignment
- bowtie-inspect
-
-
- FeaturesParser.py
- #if $refGenomeSource.genomeSource == "history":
- $refGenomeSource.ownFile ## index source ## 1
- --do_not_extract_index ## 2
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- $reference ## index source ## 1
- --extract_index ## 2
- #end if
- $output ## 3
- $gff3 ## 4
- #for $i in $refGenomeSource.input_list
- $i $i.ext "$i.name"
- #end for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Parses feature counts from one or several sRbowtie alignments (in tabular, SAM or BAM format).
-
-Both sense and antisense alignments are counted.
-
-The library labels are inferred from the input dataset names in the Galaxy history.
-
-**It is thus essential that input datasets are appropriately renamed**
-
-**It is preferable not to put any spaces in these input dataset names. You may edit the names in the history**
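-
-A hypothetical output (tab-separated; features and counts are illustrative only)::
-
- #Feature	WT	Mut
- exon	10432	9876
- intron	2210	2302
- Unfeatured	1158	1093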
-
-
-
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/MDS_wrapper.py
--- a/mississippi_gcc/MDS_wrapper.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-#!/usr/bin/env python
-# MirDeep* python wrapper
-# a python refactoring of MDS_wrapper.pl
-# Usage: MDS_wrapper.py <1:input fasta> <2:MDS genome> <3:GFF3 output> <4:data.result output> <5:data.cluster output>
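-# A hypothetical invocation (paths and genome name are illustrative only):
-# MDS_wrapper.py reads.fa dme_r5 output.gff3 data.result data.cluster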
-
-import sys, re, os, subprocess, shlex, tempfile, shutil
-
-def stop_err( msg ):
- sys.stderr.write( '%s\n' % msg )
- sys.exit()
-
-MDS_path = "/home/galaxy/bin/MDS_command_line_v32/MDS_command_line"
-
-input_full_path = sys.argv[1]
-MDS_genome = sys.argv[2]
-gff3_output = sys.argv[3]
-dataresult = sys.argv[4]
-datacluster = sys.argv[5]
-
-tmp_MDS_work_dir = tempfile.mkdtemp(dir = MDS_path) # make temp directory for MDS analysis
-os.symlink(input_full_path, tmp_MDS_work_dir+"/data.fa" ) # symlink between the fasta source file and the required "data.fa" input
-os.symlink(MDS_path+"/MDS_command_line.jar", tmp_MDS_work_dir+"/MDS_command_line.jar" ) # symlink to jar source in working directory
-os.symlink(MDS_path+"/genome", tmp_MDS_work_dir+"/genome")
-os.symlink(MDS_path+"/targetScan", tmp_MDS_work_dir+"/targetScan")
-os.symlink(MDS_path+"/targetScan_files", tmp_MDS_work_dir+"/targetScan_files")
-
-# execute MirDeep*
-command_line = "java -Xmx4g -jar " + MDS_path + "/MDS_command_line.jar -r 5 -g " + MDS_genome + " data.fa" # -Xmx12g
-print command_line
-#tmp = tempfile.NamedTemporaryFile( dir=tmp_MDS_work_dir ).name
-#tmp_stderr = open( tmp, 'wb' )
-
-try:
- os.chdir(tmp_MDS_work_dir)
- p = subprocess.Popen(args=command_line, cwd=tmp_MDS_work_dir, shell=True, stderr=sys.stderr)
- returncode = p.wait()
- shutil.copy2 ("data.result", dataresult)
- shutil.copy2 ("data.cluster", datacluster)
- dataFILE = open("data.result", "r")
- datafile = dataFILE.readlines()
- dataFILE.close()
- GFF3OUT = open(gff3_output, "w")
- print >> GFF3OUT,"##gff-version 3"
- print >> GFF3OUT, "##Seqid Source Type Start End Score Strand Phase Attributes"
- print >> GFF3OUT, "##"
- for line in datafile[1:]:
- fields = line.split("\t")
- Seqid, Source, Type, Start, End, Score, Strand, Phase = fields[2], "MirDeep*", "hairPin_loci", fields[4].split("-")[0], fields[4].split("-")[1], fields[1], fields[3], "."
- ID = "ID=%s;%s_reads;%s;%s;mature_seq:%s" % (fields[0],fields[5],fields[7],fields[8],fields[9])
- print >> GFF3OUT, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (Seqid, Source, Type, Start, End, Score, Strand, Phase, ID)
- GFF3OUT.close()
- if os.path.exists( tmp_MDS_work_dir ):
- shutil.rmtree( tmp_MDS_work_dir )
- else:
- print "Error in cleaning tmp working directory"
-
-except Exception, e:
- # clean up temp dir
- if os.path.exists( tmp_MDS_work_dir ):
- shutil.rmtree( tmp_MDS_work_dir )
- stop_err( 'Error running MDS_command_line.jar\n' + str( e ) )
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/MirDeepStar.xml
--- a/mississippi_gcc/MirDeepStar.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,32 +0,0 @@
-
-
-MDS_wrapper.py $input $genome $output $output2 $output3
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-MirDeep* wrapper
-
-
-
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/MirParser.py
--- a/mississippi_gcc/MirParser.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-#!/usr/bin/python
-# python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3
-# version 0.0.9 (1-6-2014)
-# Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
-# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
-# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
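-# A hypothetical invocation (paths and labels are illustrative only):
-# MirParser.py dme.fasta --do_not_extract_index premirs.tab matures.tab dme_mirbase.gff3 dummy_dataframe_path dummy_plotCode dummy_latticePDF sample1.tab tabular WT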
-
-import sys, subprocess
-from smRtools import *
-
-IndexSource = sys.argv[1]
-ExtractionDirective = sys.argv[2]
-if ExtractionDirective == "--do_not_extract_index":
- genomeRefFormat = "fastaSource"
-elif ExtractionDirective == "--extract_index":
- genomeRefFormat = "bowtieIndex"
-OutputPre_mirs = sys.argv[3]
-OutputMature_Mirs = sys.argv[4]
-GFF3_file = sys.argv[5]
-lattice = sys.argv[6]
-Rcode = sys.argv[7]
-latticePDF = sys.argv[8]
-Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)]
-MasterListOfGenomes = {}
-
-for [filePath, FileExt, FileLabel] in Triplets:
- print FileLabel
- MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel)
-
-header = ["gene"]
-for [filePath, FileExt, FileLabel] in Triplets:
- header.append(FileLabel)
-
-hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, etc., separated by tabs
-
-## read GFF3 to subinstantiate
-gff3 = open (GFF3_file, "r")
-lattice_dataframe = []
-for line in gff3:
- if line[0] == "#": continue
- gff_fields = line[:-1].split("\t")
- chrom = gff_fields[0]
- gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
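- # e.g. "ID=MI0005821;Alias=MI0005821;Name=dme-mir-965" -> "dme-mir-965"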
- item_upstream_coordinate = int(gff_fields[3])
- item_downstream_coordinate = int(gff_fields[4])
- if gff_fields[6] == "+":
- item_polarity = "forward"
- else:
- item_polarity = "reverse"
- item_line = [gff_name]
- for sample in header[1:]:
- count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity)
- item_line.append(str(count))
- ## subtreatment for lattice
- if lattice != "dummy_dataframe_path":
- if ("5p" not in gff_name) and ("3p" not in gff_name):
- lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) )
- ## end of subtreatment for lattice
- hit_table.append("\t".join(item_line) )
-gff3.close()
-
-Fpremirs = open (OutputPre_mirs, "w")
-print >> Fpremirs, hit_table[0]
-finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)]
-print >> Fpremirs, "\n".join(finalPreList )
-Fpremirs.close()
-
-Fmaturemires = open (OutputMature_Mirs, "w")
-print >> Fmaturemires, hit_table[0]
-finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)]
-print >> Fmaturemires, "\n".join(finalMatureList )
-Fmaturemires.close()
-
-if lattice != "dummy_dataframe_path":
- Flattice = open(lattice, "w")
- print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity")
- print >> Flattice, "\n".join(lattice_dataframe)
- Flattice.close()
- R_command="Rscript "+ Rcode
- process = subprocess.Popen(R_command.split())
- process.wait()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/MirParser.xml
--- a/mississippi_gcc/MirParser.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,148 +0,0 @@
-
- from sRbowtie alignment
- bowtie-inspect
-
-
- MirParser.py
- #if $refGenomeSource.genomeSource == "history":
- $refGenomeSource.ownFile ## index source sys.argv[1]
- --do_not_extract_index ## sys.argv[2]
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- $reference ## sys.argv[1]
- --extract_index ## sys.argv[2]
- #end if
- $output1 ## for pre-mirs ## sys.argv[3]
- $output2 ## for mature mirs ## sys.argv[4]
- $GFF3 ## sys.argv[5]
- #if $plotting.plottingOption == "yes":
- $lattice_dataframe ## sys.argv[6]
- $plotCode ## sys.argv[7]
- $latticePDF ## sys.argv[8]
- #else:
- "dummy_dataframe_path" ## sys.argv[6]
- "dummy_plotCode" ## sys.argv[7]
- "dummy_latticePDF" ## sys.argv[8]
- #end if
- #for $i in $refGenomeSource.input_list
- $i $i.ext "$i.name" ## sys.argv[9,10,11] modulo 3
- #end for
- #silent plottingoption = $plotting.plottingOption
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- #if $plotting.plottingOption == "yes":
- graph_type = "${plotting.display}" ## "relative" or "absolute"
- ## Setup R error handling to go to stderr
- options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
- library(lattice)
- coverage = read.delim("${lattice_dataframe}", header=T)
- Numb_of_biosamples = length(levels(coverage\$sample))
- if (graph_type=="relative") {
- graph = xyplot(countsNorm~offsetNorm | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1,
- scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps")
- } else {
- graph = xyplot(counts~offset | mir, data=coverage, groups=polarity, col=c("red", "blue"), type="l", lwd=1,
- scales=list(x=list(cex=.5), y=list(cex=.5)), par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), layout=c(Numb_of_biosamples,15), as.table=TRUE, main="miRNA coverage maps")
- }
- ## pdf output
- pdf(file="${latticePDF}", paper="special", height=11.69, width=8.2677)
- plot(graph, newpage = T)
- dev.off()
- #end if
-
-
-
-
-
-
-
- plotting['plottingOption'] == "yes"
-
-
- plotting['plottingOption'] == "yes"
-
-
-
-
-**What it does**
-
-This tool uses a specie-specific GFF3 file from mirBase_ to guide the parsing of an alignment file produced with the sRbowtie tool.
-
-.. _mirBase: ftp://mirbase.org/pub/mirbase/CURRENT/genomes/
-
-------
-
-.. class:: warningmark
-
-The guide GFF3 file must be in the following format:
-
-2L . miRNA_primary_transcript 243035 243141 . - . ID=MI0005821;Alias=MI0005821;Name=dme-mir-965
-
-2L . miRNA 243055 243076 . - . ID=MIMAT0005480;Alias=MIMAT0005480;Name=dme-miR-965-3p;Derives_from=MI0005821
-
-2L . miRNA 243096 243118 . - . ID=MIMAT0020861;Alias=MIMAT0020861;Name=dme-miR-965-5p;Derives_from=MI0005821
-
-2L . miRNA_primary_transcript 857542 857632 . + . ID=MI0005813;Alias=MI0005813;Name=dme-mir-375
-
-2L . miRNA 857596 857617 . + . ID=MIMAT0005472;Alias=MIMAT0005472;Name=dme-miR-375-3p;Derives_from=MI0005813
-
-2L . miRNA 857556 857579 . + . ID=MIMAT0020853;Alias=MIMAT0020853;Name=dme-miR-375-5p;Derives_from=MI0005813
-
-2L . miRNA_primary_transcript 1831685 1831799 . - . ID=MI0011290;Alias=MI0011290;Name=dme-mir-2280
-
-With the Name attribute of mature miRNAs (3rd column = miRNA) containing either the -3p or -5p string (Name=dme-miR-965-3p, for instance)
-
-------
-
-**Input formats**
-
-1. One or several alignment files generated with the sRbowtie tool and **renamed** according to the name of the biosample (avoid spaces in biosample labels)
-
-.. class:: warningmark
-
-Alignment datasets generated with sRbowtie must be renamed according to a biosample name
-
-2. A GFF3 file retrieved from mirBase_
-
-------
-
-**Outputs**
-
-Two count files, listing reads aligned to pre-mirs and to mature miRNAs respectively.
-
-A PDF of pre-mir coverage plots. Red coverage indicates that the mir gene lies on the genomic forward strand, blue coverage that it lies on the reverse strand.
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/annotation_collector.py
--- a/mississippi_gcc/annotation_collector.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-#!/usr/bin/env python
-# By drosofff@gmail.com
-# Usage: annotation_collector.py <output> <input1> <label1> <input2> <label2> ... <title>
-
-import sys, os
-
-def countlineinfile(file):
- F = open (file, "r")
- count = 0
- for line in F:
- count += 1
- F.close()
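- # fasta records span two lines (header + sequence), hence count/2 (assumes unwrapped sequences)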
- return count/2
-results = []
-
-for file, label in zip (sys.argv[2:-1:2], sys.argv[3:-1:2]):
- results.append ( (countlineinfile(file), label) )
-
-Fout = open (sys.argv[1], "w")
-
-print >> Fout, "# %s" % (sys.argv[-1])
-for filecount, label in results:
- print >> Fout, "%s\t%s\t%.2f %%" % (label, filecount, filecount/float(results[0][0])*100 )
-print >> Fout
-Fout.close()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/bowtie_indices.loc.sample
--- a/mississippi_gcc/bowtie_indices.loc.sample Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-#This is a sample file distributed with Galaxy that enables tools
-#to use a directory of Bowtie indexed sequences data files. You will
-#need to create these data files and then create a bowtie_indices.loc
-#file similar to this one (store it in this directory) that points to
-#the directories in which those files are stored. The bowtie_indices.loc
-#file has this format (longer white space characters are TAB characters):
-#
-#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>
-#
-#So, for example, if you had hg18 indexed stored in
-#/depot/data2/galaxy/bowtie/hg18/,
-#then the bowtie_indices.loc entry would look like this:
-#
-#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18
-#
-#and your /depot/data2/galaxy/bowtie/hg18/ directory
-#would contain hg18.*.ebwt files:
-#
-#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt
-#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt
-#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt
-#...etc...
-#
-#Your bowtie_indices.loc file should include an entry per line for each
-#index set you have stored. The "file" in the path does not actually
-#exist, but it is the prefix for the actual index files. For example:
-#
-#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon
-#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full
-#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19
-#...etc...
-#
-#Note that for backwards compatibility with workflows, the unique ID of
-#an entry must be the path that was in the original loc file, because that
-#is the value stored in the workflow for that parameter. That is why the
-#hg19 entry above looks odd. New genomes can be better-looking.
-#
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/piRNAsignature.py
--- a/mississippi_gcc/piRNAsignature.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-#!/usr/bin/python
-# script for computing overlap signatures from a bowtie output
-# Christophe Antoniewski
-# Usage piRNAsignature.py <1:input> <2:format of input> <3:minsize query> <4:maxsize query> <5:minsize target> <6:maxsize target>
-# <7:minscope> <8:maxscope> <9:output> <10:bowtie index> <11:procedure option> <12: graph (global or lattice)>
-# <13: R code>
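-# A hypothetical invocation (paths and sizes are illustrative only):
-# piRNAsignature.py input.tab tabular 23 29 23 29 1 26 signature.tab dme.fasta --do_not_extract_index global plot.R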
-
-import sys, subprocess
-from smRtools import *
-from collections import defaultdict # test whether it is required
-
-if sys.argv[11] == "--extract_index":
- if sys.argv[2] == "tabular":
- Genome = HandleSmRNAwindows (sys.argv[1],"tabular",sys.argv[10],"bowtieIndex")
- elif sys.argv[2] == "sam":
- Genome = HandleSmRNAwindows (sys.argv[1],"sam",sys.argv[10],"bowtieIndex")
- else:
- Genome = HandleSmRNAwindows (sys.argv[1],"bam",sys.argv[10],"bowtieIndex")
-else:
- if sys.argv[2] == "tabular":
- Genome = HandleSmRNAwindows (sys.argv[1],"tabular",sys.argv[10],"fastaSource")
- elif sys.argv[2] == "sam":
- Genome = HandleSmRNAwindows (sys.argv[1],"sam",sys.argv[10],"fastaSource")
- else:
- Genome = HandleSmRNAwindows (sys.argv[1],"bam",sys.argv[10],"fastaSource")
-# this decision tree may be simplified if sam and bam inputs are treated the same way by pysam
-
-objDic = Genome.instanceDict # short alias for the dictionary of genome item instances
-
-minquery = int(sys.argv[3])
-maxquery = int(sys.argv[4])
-mintarget = int(sys.argv[5])
-maxtarget = int(sys.argv[6])
-minscope = int(sys.argv[7])
-maxscope = int(sys.argv[8]) + 1
-general_frequency_table = dict ([(i,0) for i in range(minscope,maxscope)])
-general_percent_table = dict ([(i,0) for i in range(minscope,maxscope)])
-OUT = open (sys.argv[9], "w")
-
-if sys.argv[12] == "global":
- ###### for normalized summing of local_percent_table(s)
- readcount_dic = {}
- Total_read_in_objDic = 0
- for item in objDic:
- readcount_dic[item] = objDic[item].readcount(minquery, maxquery)
- Total_read_in_objDic += readcount_dic[item]
- ######
- for x in (objDic):
- local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
- local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
- try:
- for overlap in local_frequency_table.keys():
- general_frequency_table[overlap] = general_frequency_table.get(overlap, 0) + local_frequency_table[overlap]
- except:
- pass
- try:
- for overlap in local_percent_table.keys():
- general_percent_table[overlap] = general_percent_table.get(overlap, 0) + (1./Total_read_in_objDic*readcount_dic[x]*local_percent_table[overlap])
- except:
- pass
- print >> OUT, "overlap\tnum of pairs\tprobability"
- for classe in sorted(general_frequency_table):
- print >> OUT, "%i\t%i\t%f" % (classe, general_frequency_table[classe], general_percent_table[classe])
-
-else:
- print >> OUT, "overlap\tnum of pairs\tprobability\titem"
- for x in (objDic):
- local_frequency_table = objDic[x].signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
- local_percent_table = objDic[x].hannon_signature( minquery, maxquery, mintarget, maxtarget, range(minscope,maxscope) )
- for classe in range(minscope,maxscope):
- print >> OUT, "%i\t%i\t%f\t%s" % (classe, local_frequency_table[classe], local_percent_table[classe], x)
-
-OUT.close()
-
-## Run the R script that is defined in the xml using the Rscript binary provided with R.
-R_command="Rscript "+ sys.argv[13]
-process = subprocess.Popen(R_command.split())
-process.wait()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/piRNAsignature.xml
--- a/mississippi_gcc/piRNAsignature.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,122 +0,0 @@
-
-
-
- piRNAsignature.py $refGenomeSource.input $refGenomeSource.input.ext $minquery $maxquery $mintarget $maxtarget $minscope $maxscope $output
- #if $refGenomeSource.genomeSource == "history":
- $refGenomeSource.ownFile
- --do_not_extract_index
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- $reference
- --extract_index
- #end if
- $graph_type
- $sigplotter
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- graph_type = "${graph_type}"
-
- globalgraph = function () {
- ## Setup R error handling to go to stderr
- options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
- signature = read.delim("${output}", header=TRUE)
- ## Open output1 PDF file
- pdf( "${output1}" )
- signaturez=(signature[,2] -mean(signature[,2]))/sd(signature[,2])
- plot(signaturez, type = "b", main="signature", cex.main=2, xlab="overlap (nt)", ylab="z-score", pch=19, cex=0.8, col="darkslateblue", lwd=3, cex.lab=1.5, cex.axis=1.4, xaxt="n")
- axis(1, at=seq(from=1, to=length(signature[,1]), by=3) )
- devname = dev.off()
- ## Close the PDF file
- ## Open output2 PDF file
- pdf( "${output2}" )
- YLIM=max(signature[,2])
- plot(signature[,1:2], type = "h", xlab="overlap (nt)", ylim=c(0,YLIM), ylab="number of pairs", col="darkslateblue", lwd=7)
- devname = dev.off()
- ## Close the PDF file
- ## Open output3 PDF file
- pdf( "${output3}" )
- plot(signature[,1], signature[,3]*100, type = "l", main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs",
- cex.main=1, xlab="overlap (nt)", ylab="ping-pong signal [%]", ylim=c(0,50),
- pch=19, col="darkslateblue", lwd =4, cex.lab=1.2, cex.axis=1, xaxt="n")
- axis(1, at=seq(from=1, to=length(signature[,1]), by=3) )
- devname = dev.off()
- ## Close the PDF file
- }
-
- treillisgraph = function () {
- ## Open output3 PDF file
- pdf( "${output3}", paper="special", height=11.69, width=8.2677 )
- signature = read.delim("${output}", header=TRUE)
- options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
- library(lattice)
- print (xyplot(signature[,3]*100~signature[,1]|signature[,4], type = "l", xlim=c(1,26), main="ping-pong Signature of ${minquery}-${maxquery} against ${mintarget}-${maxtarget}nt small RNAs",
- par.strip.text=list(cex=.5), strip=strip.custom(which.given=1, bg="lightblue"), scales=list(cex=0.5),
- cex.main=1, cex=.5, xlab="overlap (nt)", ylab="ping-pong signal [%]",
- pch=19, col="darkslateblue", lwd =1.5, cex.lab=1.2, cex.axis=1.2,
- layout=c(4,12), as.table=TRUE, newpage = T) )
- devname = dev.off()
- }
-
- if (graph_type=="global") {
- globalgraph()
-
- }
- if(graph_type=="lattice") {
- treillisgraph()
- }
-
-
-
-
-
-
- (graph_type == "global")
-
-
- (graph_type == "global")
-
-
-
-
-
-
-**What it does**
-
-This tool computes, from a bowtie output file, the number of read pairs per overlap class (in nt), the z-score calculated from these numbers of pairs, and the ping-pong signal as described in Brennecke et al. (2009) Science.
-The numerical options set the min and max sizes of both the query and the target small RNA classes.
-Three types of signals are plotted in separate PDF files: the number of pairs found, the z-score calculated from these numbers of pairs, and the ping-pong signal as described in Brennecke et al. (2009) Science.
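-
-For example, a 23 nt read on the forward strand and a 21 nt read on the reverse strand whose 5' ends overlap by 10 nt contribute one pair to overlap class 10; a z-score peak at class 10 is the classical ping-pong signature.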
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/readmap.py
--- a/mississippi_gcc/readmap.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,116 +0,0 @@
-#!/usr/bin/python
-# python parser module for readmaps and size distributions, guided by GFF3
-# version 0.9.1 (1-6-2014)
-# Usage: readmap.py, see Parser() below for valid arguments
-
-import sys, subprocess, argparse
-from smRtools import *
-from collections import OrderedDict, defaultdict
-import os
-
-def Parser():
- the_parser = argparse.ArgumentParser()
- the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe")
- the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
- the_parser.add_argument('--reference_fasta', action="store", type=str, help="reference fasta file")
- the_parser.add_argument('--reference_bowtie_index',action='store', help="path to the bowtie index of the reference")
- the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
- the_parser.add_argument('--ext',nargs='+', help="input file type")
- the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
- the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
- the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
- the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
- the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
- the_parser.add_argument('--rcode', type=str, help="R script")
- args = the_parser.parse_args()
- return args
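-# A hypothetical invocation (paths and labels are illustrative only):
-# readmap.py --input s1.tab --ext tabular --label WT --normalization_factor 1.0 \
-#   --reference_fasta dmel.fasta --output_readmap readmap.tab \
-#   --output_size_distribution size.tab --minquery 18 --maxquery 28 --rcode plot.R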
-
-args=Parser()
-if args.reference_fasta:
- genomeRefFormat = "fastaSource"
- genomeRefFile = args.reference_fasta
-if args.reference_bowtie_index:
- genomeRefFormat = "bowtieIndex"
- genomeRefFile = args.reference_bowtie_index
-readmap_file=args.output_readmap
-size_distribution_file=args.output_size_distribution
-minquery=args.minquery
-maxquery=args.maxquery
-Rcode = args.rcode
-filePath=args.input
-fileExt=args.ext
-fileLabel=args.label
-normalization_factor=args.normalization_factor
-
-MasterListOfGenomes = OrderedDict()
-
-def process_samples(filePath):
- for i, filePath in enumerate(filePath):
- norm=normalization_factor[i]
- print fileLabel[i]
- MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
- biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
- return MasterListOfGenomes
-
-def write_readplot_dataframe(readDict, readmap_file):
- with open(readmap_file, 'w') as readmap:
- print >>readmap, "gene\tcoord\tcount\tpolarity\tsample"
- for sample in readDict.keys():
- if args.gff:
- dict=readDict[sample]
- else:
- dict=readDict[sample].instanceDict
- for gene in dict.keys():
- plottable = dict[gene].readplot()
- for line in plottable:
- print >>readmap, "%s\t%s" % (line, sample)
-
-def write_size_distribution_dataframe(readDict, size_distribution_file):
- with open(size_distribution_file, 'w') as size_distrib:
- print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
- for sample in readDict.keys():
- if args.gff:
- dict=readDict[sample]
- else:
- dict=readDict[sample].instanceDict
- for gene in dict.keys():
- histogram = dict[gene].size_histogram()
- for polarity in histogram.keys():
- for item in histogram[polarity].iteritems():
- print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
-
-def gff_item_subinstances(readDict, gff3):
- GFFinstanceDict=OrderedDict()
- with open(gff3) as gff:
- for line in gff:
- if line[0] == "#": continue
- gff_fields = line[:-1].split("\t")
- chrom = gff_fields[0]
- gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
- item_upstream_coordinate = int(gff_fields[3])
- item_downstream_coordinate = int(gff_fields[4])
- item_polarity = gff_fields[6]
- for sample in readDict.keys():
- if not GFFinstanceDict.has_key(sample):
- GFFinstanceDict[sample]={}
- subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
- if item_polarity == '-':
- subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
- subinstance.gene=gff_name
- GFFinstanceDict[sample][gff_name]=subinstance
- return GFFinstanceDict
-
-MasterListOfGenomes=process_samples(filePath)
-
-if args.gff:
- MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
-
-write_readplot_dataframe(MasterListOfGenomes, readmap_file)
-write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
-
-R_command="Rscript "+ Rcode
-process = subprocess.Popen(R_command.split())
-process.wait()
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/readmap.xml
--- a/mississippi_gcc/readmap.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-
- from sRbowtie alignment
- bowtie-inspect
-
-
- readmap.py
- #if $refGenomeSource.genomeSource == "history":
- --reference_fasta ## sys.argv[2]
- $refGenomeSource.ownFile ## index source
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- --reference_bowtie_index
- $reference
- #end if
- --rcode
- $plotCode
- --output_readmap
- $readmap_dataframe
- --output_size_distribution
- $size_distribution_dataframe
- --minquery
- $minquery
- --maxquery
- $maxquery
- --input
- #for $i in $refGenomeSource.series
- $i.input
- #end for
- --ext
- #for $i in $refGenomeSource.series
- $i.input.ext
- #end for
- --label
- #for $i in $refGenomeSource.series
- "$i.input.name"
- #end for
- --normalization_factor
- #for $i in $refGenomeSource.series
- $i.norm
- #end for
- #if $gff:
- --gff
- $gff
- #end if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ## Setup R error handling to go to stderr
- options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
- library(RColorBrewer)
- library(lattice)
- library(latticeExtra)
- library(grid)
- library(gridExtra)
- ##cheetahtemplate data frame implementation
-
- rm=read.delim("${readmap_dataframe}", header=T, row.names=NULL)
- pdf(file="${readmap_PDF}", paper="special", height=11.69, width=8.2677)
- n_samples=length(unique(rm\$sample))
-
- genes=unique(levels(rm\$gene))
- per_gene_readmap=lapply(genes, function(x) subset(rm, gene==x))
- n_genes=length(per_gene_readmap)
-
-
- par.settings.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
- par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
- par.settings.combination.readmap=list(layout.heights=list(top.padding=0, bottom.padding=-3), fontsize = list(text=96/${rows_per_page}, points=8))
- par.settings.combination.size=list(layout.heights=list(top.padding=-2, bottom.padding=-0.5), fontsize = list(text=96/${rows_per_page}, points=8))
-
-
- plot_readmap=function(df, ...) {
- combineLimits(xyplot(count~coord|factor(sample, levels=unique(sample))+reorder(gene, count, function(x) -sum(abs(x))),
- data=df,
- type='h',
- scales= list(relation="free", x=list(rot=0, cex=0.75, axs="i", tck=0.5), y=list(tick.number=4, rot=90, cex=0.75)),
- xlab=NULL, main=NULL, ylab=NULL,
- as.table=T,
- origin = 0,
- horizontal=FALSE,
- group=polarity,
- col=c("red","blue"),
- ...))
- }
-
- plot_size_distribution= function(df, ...) {
- smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);}
- bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0,
- horizontal=FALSE,
- group=polarity,
- stack=TRUE,
- col=c('red', 'blue'),
- cex=0.75,
- scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75),
- prepanel=smR.prepanel,
- xlab = NULL,
- ylab = NULL,
-# par.settings=list(layout.heights=list(top.padding=-2, bottom.padding=-3), fontsize = list(text=8, points=8)),
- main = NULL , as.table=TRUE, newpage = T, ...)
- combineLimits(bc)
- }
-
- for (i in seq(1,n_genes,${rows_per_page})) {
- start=i
- end=i+${rows_per_page}-1
- if (end>n_genes) {end=n_genes}
- readmap_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.readmap))
- args.list=c(readmap_plot.list, list(nrow=${rows_per_page}, ncol=1, main="${title}", left="${ylabel}", sub="readmap coordinate"))
- do.call(grid.arrange, args.list)
- }
-
- devname=dev.off()
-
- size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL)
- per_gene_size=lapply(genes, function(x) subset(size, gene==x))
-
- pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677)
-
- for (i in seq(1,n_genes,${rows_per_page})) {
- start=i
- end=i+${rows_per_page}-1
- if (end>n_genes) {end=n_genes}
- plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, par.settings=par.settings.size))
- args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="${title}", left="${ylabel}", sub="readsize in nucleotides"))
- do.call(grid.arrange, args.list)
- }
-
- devname=dev.off()
-
- pdf(file="${combi_PDF}", paper="special", height=11.69, width=8.2677)
-
- for (i in seq(1,n_genes,${rows_per_page}/2)) {
- start=i
- end=i+${rows_per_page}/2-1
- if (end>n_genes) {end=n_genes}
- read_plot.list=lapply(per_gene_readmap[start:end], function(x) plot_readmap(x, par.settings=par.settings.combination.readmap))
- size_plot.list=lapply(per_gene_size[start:end], function(x) plot_size_distribution(x, strip=FALSE, par.settings=par.settings.combination.size))
- plot.list=rbind(read_plot.list, size_plot.list )
- args.list=c(plot.list, list(nrow=${rows_per_page}, ncol=1, main="${title}", left="${ylabel}", sub="${xlabel}"))
- do.call(grid.arrange, args.list)
- }
-
- devname=dev.off()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces a "Readmap",
-where by default for each "chromosome" the position of the read is recorded on the x-axis, and the y-axis indicates
-the number of reads per position. Reads that map in sense are on the top, reads that map antisense are on the bottom.
-
-
-.. class:: warningmark
-
-'''TIP''' The input data can be produced using the sRbowtie tool.
-
-----
-
-'''Example'''
-
-For a SAM file such as the following::
-
- 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0
-
- 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0
-
- 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0
-
-the tool produces a plot like this:
-
-----
-
-.. image:: static/images/readmap.png
- :height: 800
- :width: 500
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtie.py
--- a/mississippi_gcc/sRbowtie.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,109 +0,0 @@
-#!/usr/bin/env python
-# small RNA oriented bowtie wrapper
-# version 1.01 29-5-2014
-# Usage sRbowtie.py <1 input_fasta_file> <2 alignment method> <3 -v mismatches> <4 out_type> <5 buildIndexIfHistory> <6 fasta/bowtie index> <7 bowtie output> <8 ali_fasta> <9 unali_fasta> <10 --num-threads \${GALAXY_SLOTS:-4}>
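-# A hypothetical invocation (paths are illustrative only):
-# sRbowtie.py reads.fa multiple 1 bam history dmel-all-chromosome-r5.49.fasta out.bam aligned.fa unaligned.fa 4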
-# current rev: for bowtie --norc, moved from --suppress 2,6,7,8 to --suppress 6,7,8. Future parsers must be updated to take this standardisation into account
-# To Do:
-# implement an arg parser
-# Christophe Antoniewski
-
-import sys, os, subprocess, tempfile, shutil
-
-def stop_err( msg ):
- sys.stderr.write( '%s\n' % msg )
- sys.exit()
-
-def bowtieCommandLiner (alignment_method, v_mis, out_type, aligned, unaligned, input, index, output, pslots="12"):
- if alignment_method=="RNA":
- x = "-v %s -M 1 --best --strata -p %s --norc --suppress 6,7,8" % (v_mis, pslots)
- elif alignment_method=="unique":
- x = "-v %s -m 1 -p %s --suppress 6,7,8" % (v_mis, pslots)
- elif alignment_method=="multiple":
- x = "-v %s -M 1 --best --strata -p %s --suppress 6,7,8" % (v_mis, pslots)
- elif alignment_method=="k_option":
- x = "-v %s -k 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots)
- elif alignment_method=="n_option":
- x = "-n %s -M 1 --best -p %s --suppress 6,7,8" % (v_mis, pslots)
- elif alignment_method=="a_option":
- x = "-v %s -a --best -p %s --suppress 6,7,8" % (v_mis, pslots)
- if aligned == "None" and unaligned == "None": fasta_command = ""
- elif aligned != "None" and unaligned == "None": fasta_command= " --al %s" % aligned
- elif aligned == "None" and unaligned != "None": fasta_command = " --un %s" % unaligned
- else: fasta_command = " --al %s --un %s" % (aligned, unaligned)
- x = x + fasta_command
- if out_type == "tabular":
- return "bowtie %s %s -f %s > %s" % (x, index, input, output)
- elif out_type=="sam":
- return "bowtie %s -S %s -f %s > %s" % (x, index, input, output)
- elif out_type=="bam":
- return "bowtie %s -S %s -f %s |samtools view -bS - > %s" % (x, index, input, output)
-
-def bowtie_squash(fasta):
- tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes
- ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
- ref_file_name = ref_file.name
- ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name
- os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name
- cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir)
- try:
- FNULL = open(os.devnull, 'w')
- tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string
- tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file
- proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected to /dev/null
- returncode = proc.wait()
- tmp_stderr.close()
- FNULL.close()
- sys.stdout.write(cmd1 + "\n")
- except Exception, e:
- # clean up temp dir
- if os.path.exists( tmp_index_dir ):
- shutil.rmtree( tmp_index_dir )
- stop_err( 'Error indexing reference sequence\n' + str( e ) )
- # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment()
- index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie-style path without extension
- return tmp_index_dir, index_full_path
-
-def bowtie_alignment(command_line, flyPreIndexed=''):
- # make temp directory just for stderr
- tmp_index_dir = tempfile.mkdtemp()
- tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
- tmp_stderr = open( tmp, 'wb' )
- # conditional statement for sorted bam generation viewable in Trackster
- if "samtools" in command_line:
- target_file = command_line.split()[-1] # recover the final output file name
- path_to_unsortedBam = os.path.join(tmp_index_dir, "unsorted.bam")
- path_to_sortedBam = os.path.join(tmp_index_dir, "unsorted.bam.sorted")
- first_command_line = " ".join(command_line.split()[:-3]) + " -o " + path_to_unsortedBam + " - "
- # example: bowtie -v 0 -M 1 --best --strata -p 12 --suppress 6,7,8 -S /home/galaxy/galaxy-dist/bowtie/Dmel/dmel-all-chromosome-r5.49 -f /home/galaxy/galaxy-dist/database/files/003/dataset_3460.dat |samtools view -bS -o /tmp/tmp_PgMT0/unsorted.bam -
- second_command_line = "samtools sort %s %s" % (path_to_unsortedBam, path_to_sortedBam) # generates an "unsorted.bam.sorted.bam file", NOT an "unsorted.bam.sorted" file
- p = subprocess.Popen(args=first_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno()) # fileno() method return the file descriptor number of tmp_stderr
- returncode = p.wait()
- sys.stdout.write("%s\n" % first_command_line + str(returncode))
- p = subprocess.Popen(args=second_command_line, cwd=tmp_index_dir, shell=True, stderr=tmp_stderr.fileno())
- returncode = p.wait()
- sys.stdout.write("\n%s\n" % second_command_line + str(returncode))
- if os.path.isfile(path_to_sortedBam + ".bam"):
- shutil.copy2(path_to_sortedBam + ".bam", target_file)
- else:
- p = subprocess.Popen(args=command_line, shell=True, stderr=tmp_stderr.fileno())
- returncode = p.wait()
- sys.stdout.write(command_line + "\n")
- tmp_stderr.close()
- ## cleaning if the index was created on the fly
- if os.path.exists( flyPreIndexed ):
- shutil.rmtree( flyPreIndexed )
- # cleaning tmp files and directories
- if os.path.exists( tmp_index_dir ):
- shutil.rmtree( tmp_index_dir )
- return
-
-def __main__():
- F = open (sys.argv[7], "w")
- if sys.argv[5] == "history":
- tmp_dir, index_path = bowtie_squash(sys.argv[6])
- else:
- tmp_dir, index_path = "dummy/dummy", sys.argv[6]
- command_line = bowtieCommandLiner(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[8], sys.argv[9], sys.argv[1], index_path, sys.argv[7], sys.argv[10])
- bowtie_alignment(command_line, flyPreIndexed=tmp_dir)
- F.close()
-if __name__=="__main__": __main__()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtie.xml
--- a/mississippi_gcc/sRbowtie.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,210 +0,0 @@
-
- for FASTA small reads
-
- bowtie
-
-
- sRbowtie.py $input
- $method
- $v_mismatches
- $output_type
- $refGenomeSource.genomeSource
- ## the very source of the index (indexed or fasta file)
- #if $refGenomeSource.genomeSource == "history":
- $refGenomeSource.ownFile
- #else:
- $refGenomeSource.index.fields.path
- #end if
- $output
- $aligned
- $unaligned
- \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- additional_fasta == "al" or additional_fasta == "al_and_unal"
-
-
- additional_fasta == "unal" or additional_fasta == "al_and_unal"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient. It is developed by Ben Langmead and Cole Trapnell. Please cite: Langmead B, Trapnell C, Pop M, Salzberg SL. Ultrafast and memory-efficient alignment of short DNA sequences to the human genome. Genome Biology 10:R25.
-
-.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml
-
-A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.
-
-However, this Bowtie wrapper tool only takes FASTQ files as inputs.
-
-The sRbowtie wrapper specifically works with short-read FASTA inputs (bowtie -v mode).
-
-------
-
-**OPTIONS**
-
-.. class:: infomark
-
-This script uses Bowtie to match reads on a reference index.
-
-Depending on the type of matching, different bowtie options are used:
-
-**Match on sense strand RNA reference index, multiple mappers randomly matched at a single position**
-
-Match on RNA reference, SENSE strand, randomly attributing multiple mappers to targets with the least mismatches; the polarity column is suppressed in the bowtie tabular report:
-
-*-v [0,1,2,3] -M 1 --best --strata -p 12 --norc --suppress 2,6,7,8*
-
-**Match unique mappers on DNA reference index**
-
-Match ONLY unique mappers on DNA reference index
-
-*-v [0,1,2,3] -m 1 -p 12 --suppress 6,7,8*
-
-Note that using this option with -v values other than 0 is questionable...
-
-**Match on DNA, multiple mappers randomly matched at a single position**
-
-Match multiple mappers, randomly attributing each multiple mapper to a target with the least mismatches; the number of mismatches allowed is specified by the -v option:
-
-*-v [0,1,2,3] -M 1 --best --strata -p 12 --suppress 6,7,8*
-
-**Match on DNA as fast as possible, without regard for mapping issues (for raw annotation of reads)**
-
-Match at the highest speed, trading the guarantee of the best hit for speed:
-
-*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*
-
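-For instance, with the "multiple mappers" option and tabular output, the wrapper assembles a bowtie command of the following form (index and file paths are illustrative)::
-
- bowtie -v 1 -M 1 --best --strata -p 12 --suppress 6,7,8 /path/to/index -f reads.fa > output.tab
-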
-
------
-
-**Input formats**
-
-.. class:: warningmark
-
-*The only accepted format for the script is a raw fasta list of reads, clipped of their adapters*
-
------
-
-**OUTPUTS**
-
-If you choose tabular as the output format, you will obtain the matched reads in standard bowtie output format, having the following columns::
-
- Column Description
- -------- --------------------------------------------------------
- 1 FastaID fasta identifier
- 2 polarity + or - depending on whether the match was reported on the forward or reverse strand
- 3 target name of the matched target
- 4 Offset 0-based coordinate of the match on the target sequence
- 5 Seq sequence of the matched read
-
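-A hypothetical tabular line (values are illustrative only)::
-
- seq42 + 2L 102345 TGATTGTTCGTTACGCAGTA
-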
-If you choose SAM, you will get the output in unordered SAM format.
-
-.. class:: warningmark
-
-If you choose BAM, the output will be in sorted BAM format.
-To be viewable in Trackster, several conditions must be fulfilled:
-
-.. class:: infomark
-
-Reads must have been matched to a genome whose chromosome names are compatible with Trackster genome indexes
-
-.. class:: infomark
-
-The database/build (dbkey) indicated for the dataset (pencil icon - Database/Build field) must match a Trackster genome index.
-
-Please contact the Galaxy instance administrator if your genome is not referenced.
-
-**Matched and unmatched fasta reads can be retrieved, for further analyses**
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtieCascade.py
--- a/mississippi_gcc/sRbowtieCascade.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,136 +0,0 @@
-#!/usr/bin/env python
-# small RNA oriented bowtie wrapper in cascade for small RNA data set genome annotation
-# version 0.9 13-6-2014
-# Usage sRbowtieCascade.py, see Parser() for valid arguments
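-# A hypothetical invocation (paths and labels are illustrative only):
-# sRbowtieCascade.py --output annotation.tab --num-threads 4 --mismatch 1 \
-#   --input s1.fa s2.fa --label "WT" "Mut" \
-#   --index /path/to/genome_index mirnas.fa --indexing-flags indexed history \
-#   --indexName "genome" "miRNAs"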
-# Christophe Antoniewski
-
-import sys, os, subprocess, tempfile, shutil, argparse
-from collections import defaultdict
-
-def Parser():
- the_parser = argparse.ArgumentParser()
- the_parser.add_argument('--output', action="store", type=str, help="output file")
- the_parser.add_argument('--num-threads', dest="num_threads", action="store", type=str, help="number of bowtie threads")
- the_parser.add_argument('--mismatch', action="store", type=str, help="number of mismatches allowed")
- the_parser.add_argument('--indexing-flags', dest="indexing_flags", nargs='+', help="whether or not the index should be generated by bowtie-build")
- the_parser.add_argument('--index',nargs='+', help="paths to indexed or fasta references")
- the_parser.add_argument('--indexName',nargs='+', help="Names of the indexes")
- the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
- the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
- args = the_parser.parse_args()
- return args
-
-def stop_err( msg ):
- sys.stderr.write( '%s\n' % msg )
- sys.exit()
-
-def bowtie_squash(fasta):
- tmp_index_dir = tempfile.mkdtemp() # make temp directory for bowtie indexes
- ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
- ref_file_name = ref_file.name
- ref_file.close() # by default, delete the temporary file, but ref_file.name is now stored in ref_file_name
- os.symlink( fasta, ref_file_name ) # symlink between the fasta source file and the deleted ref_file name
- cmd1 = 'bowtie-build -f %s %s' % (ref_file_name, ref_file_name ) # bowtie command line, which will work after changing dir (cwd=tmp_index_dir)
- try:
- FNULL = open(os.devnull, 'w')
- tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name # a path string for a temp file in tmp_index_dir. Just a string
- tmp_stderr = open( tmp, 'wb' ) # creates and open a file handler pointing to the temp file
- proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=FNULL, stdout=FNULL ) # both stderr and stdout of bowtie-build are redirected to /dev/null
- returncode = proc.wait()
- tmp_stderr.close()
- FNULL.close()
- sys.stdout.write(cmd1 + "\n")
- except Exception, e:
- # clean up temp dir
- if os.path.exists( tmp_index_dir ):
- shutil.rmtree( tmp_index_dir )
- stop_err( 'Error indexing reference sequence\n' + str( e ) )
- # no Cleaning if no Exception, tmp_index_dir has to be cleaned after bowtie_alignment()
- index_full_path = os.path.join(tmp_index_dir, ref_file_name) # bowtie-style path without extension
- return index_full_path
-
-def make_working_dir():
- working_dir = tempfile.mkdtemp()
- return working_dir
-
-def Clean_TempDir(directory):
- if os.path.exists( directory ):
- shutil.rmtree( directory )
- return
-
-def bowtie_alignment(command_line="None", working_dir = ""):
- FNULL = open(os.devnull, 'w')
- p = subprocess.Popen(args=command_line, cwd=working_dir, shell=True, stderr=FNULL, stdout=FNULL)
- returncode = p.wait()
- sys.stdout.write("%s\n" % command_line)
- FNULL.close()
- #p = subprocess.Popen(["wc", "-l", "%s/al.fasta"%working_dir], cwd=working_dir, stdout=subprocess.PIPE)
- #aligned = p.communicate()[0].split()[0]
- aligned = 0
- F = open ("%s/al.fasta" % working_dir, "r")
- for line in F:
- aligned += 1
- F.close()
- sys.stdout.write("Aligned: %s\n" % aligned)
- return aligned/2
-
-def CommandLiner (v_mis="1", pslots="12", index="dum/my", input="dum/my", working_dir=""):
- return "bowtie -v %s -k 1 --best -p %s --al %s/al.fasta --un %s/unal.fasta --suppress 1,2,3,4,5,6,7,8 %s -f %s" % (v_mis, pslots, working_dir, working_dir, index, input)
-
-def __main__():
- args = Parser()
- ## first we make all indexes available. They can be already available or be squashed by bowtie-build
- ## we keep them in a list that alternates indexPath and "toClear" or "DoNotDelete"
- BowtieIndexList = []
- for indexing_flags, bowtiePath in zip (args.indexing_flags, args.index):
- if indexing_flags == "history":
- BowtieIndexList.append ( bowtie_squash (bowtiePath) )
- BowtieIndexList.append ( "toClear" )
- else:
- BowtieIndexList.append ( bowtiePath )
- BowtieIndexList.append ( "DoNotDelete")
-###### temporary indexes are generated. They must be deleted at the end (after stripping the file name from the temp path)
- ResultDict = defaultdict(list)
- for label, input in zip(args.label, args.input): ## the main cascade, iterating over samples and bowtie indexes
- workingDir = make_working_dir()
- cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[0], input=input, working_dir=workingDir)
- ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) ) # first step of the cascade
- if len(BowtieIndexList) > 2: # is there a second step to perform ?
- os.rename("%s/al.fasta"%workingDir, "%s/toAlign.fasta"%workingDir) ## end of first step. the aligned reads are the input of the next step
- cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexList[2], input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
- ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )## second step of the cascade
- if len(BowtieIndexList) > 4: ## remaining steps
- for BowtieIndexPath in BowtieIndexList[4::2]:
- os.rename("%s/unal.fasta"%workingDir, "%s/toAlign.fasta"%workingDir)
- cmd = CommandLiner (v_mis=args.mismatch, pslots=args.num_threads, index=BowtieIndexPath, input="%s/toAlign.fasta"%workingDir, working_dir=workingDir)
- ResultDict[label].append( bowtie_alignment(command_line=cmd, working_dir = workingDir) )
- Fun = open("%s/unal.fasta"%workingDir, "r") ## to finish, compute the number of unmatched reads
- n = 0
- for line in Fun:
- n += 1
- ResultDict[label].append(n/2)
- Fun.close()
- Clean_TempDir (workingDir) # clean the sample working directory
- ## cleaning
- for IndexPath, IndexFlag in zip(BowtieIndexList[::2], BowtieIndexList[1::2]):
- if IndexFlag == "toClear":
- Clean_TempDir ("/".join(IndexPath.split("/")[:-1]))
- ## end of cleaning
-
-
-
- F = open (args.output, "w")
- print >> F, "alignment reference\t%s" % "\t".join(args.label)
- for i, reference in enumerate(args.indexName):
- F.write ("%s" % reference)
- for sample in args.label:
- F.write ("\t%s" % "{:,}".format(ResultDict[sample][i]) )
- print >> F
- F.write ("Remaining Unmatched")
- for sample in args.label:
- F.write ("\t%s" % "{:,}".format(ResultDict[sample][-1]) )
- print >> F
-
- F.close()
-
-if __name__=="__main__": __main__()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtieCascade.xml
--- a/mississippi_gcc/sRbowtieCascade.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,142 +0,0 @@
-
- Using iterative sRbowtie Alignments
-
- bowtie
-
-
- sRbowtieCascade.py --output $output
- --num-threads \${GALAXY_SLOTS:-4} ## number of processors to be handled by bowtie
- --mismatch $mismatches
- --input
- #for $i in $input:
- $i
- #end for
- --label
- #for $i in $input:
- "$i.name"
- #end for
- --index
- #if $refGenomeSource1.genomeSource == "history":
- $refGenomeSource1.ownFile
- #else:
- $refGenomeSource1.index.fields.path
- #end if
- #for $i in $AdditionalQueries:
- #if $i.refGenomeSource.genomeSource == "history":
- $i.refGenomeSource.ownFile
- #else:
- $i.refGenomeSource.index.fields.path
- #end if
- #end for
- --indexing-flags
- $refGenomeSource1.genomeSource
- #for $i in $AdditionalQueries:
- $i.refGenomeSource.genomeSource
- #end for
- --indexName
- #if $refGenomeSource1.genomeSource == "history":
- "$refGenomeSource1.ownFile.name"
- #else:
- "$refGenomeSource1.index.fields.name"
- #end if
- #for $i in $AdditionalQueries:
- #if $i.refGenomeSource.genomeSource == "history":
- "$i.refGenomeSource.ownFile.name"
- #else:
- "$i.refGenomeSource.index.fields.name"
- #end if
- #end for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**Intro**
-
-Bowtie_ is a short read aligner designed to be ultrafast and memory-efficient.
-A generic "Map with Bowtie for Illumina" Galaxy tool is available in the main Galaxy distribution.
-However, this Bowtie wrapper tool only takes FASTQ files as inputs.
-
-Here, the sRbowtie wrapper specifically works with short-read FASTA inputs (bowtie -v mode, with -k 1).
-
-.. _Bowtie: http://bowtie-bio.sourceforge.net/index.shtml
-
-
-------
-
-**What it does**
-
-.. class:: infomark
-
-This script uses the sRbowtie wrapper to iteratively match reads against a series of reference indexes.
-
-Reads are matched on DNA references as fast as possible, without regard for mapping issues:
-
-*-v [0,1,2,3] -k 1 --best -p 12 --suppress 6,7,8*
-
-Unaligned reads at step N are used as input for sRbowtie at step N+1.
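-
-For example, with indexes genome, miRNAs and transposons (an illustrative choice), each read is annotated by the first reference it matches, and the final table reports one row of per-sample read counts per index plus a "Remaining Unmatched" row.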
-
------
-
-**Input formats**
-
-.. class:: warningmark
-
-*The only accepted format for the script is a raw fasta list of reads, clipped of their adapters*
-
------
-
-**OUTPUTS**
-
-**Annotation table**
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtieParser.py
--- a/mississippi_gcc/sRbowtieParser.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-#!/usr/bin/python
-# python parser module to analyse sRbowtie alignments
-# version 0.9
-# Usage sRbowtieParser.py <1:index source> <2:extraction directive> <3:output> <4:polarity> <5:6:7 filePath:FileExt:FileLabel> <.. ad lib>
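-# A hypothetical invocation (paths and labels are illustrative only):
-# sRbowtieParser.py dme.fasta --do_not_extract_index counts.tab both sample1.tab tabular WT sample2.tab tabular Mut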
-
-import sys
-from smRtools import *
-
-IndexSource = sys.argv[1]
-ExtractionDirective = sys.argv[2]
-if ExtractionDirective == "--do_not_extract_index":
- genomeRefFormat = "fastaSource"
-elif ExtractionDirective == "--extract_index":
- genomeRefFormat = "bowtieIndex"
-Output = sys.argv[3]
-Polarity = sys.argv[4] # may be "both", "forward" or "reverse"
-Triplets = [sys.argv[5:][i:i+3] for i in xrange(0, len(sys.argv[5:]), 3)]
-MasterListOfGenomes = {}
-
-for [filePath, FileExt, FileLabel] in Triplets:
- MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (filePath, FileExt, IndexSource, genomeRefFormat)
-
-header = ["gene"]
-for [filePath, FileExt, FileLabel] in Triplets:
- header.append(FileLabel)
-
-F = open (sys.argv[3], "w")
-print >> F, "\t".join(header)
-for item in sorted (MasterListOfGenomes[header[1]].instanceDict.keys() ):
- line=[item]
- for sample in header[1:]:
- count = str (MasterListOfGenomes[sample].instanceDict[item].readcount(polarity=Polarity))
- line.append(count)
- print >> F, "\t".join(line )
-F.close()
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/sRbowtieParser.xml
--- a/mississippi_gcc/sRbowtieParser.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,67 +0,0 @@
-
-
- bowtie-inspect
-
-
- sRbowtieParser.py
- #if $refGenomeSource.genomeSource == "history":
- $refGenomeSource.ownFile ## index source
- --do_not_extract_index
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $input_list.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- $reference ## index source
- --extract_index
- #end if
- $output
- $polarity
- #for $i in $refGenomeSource.input_list
- $i $i.ext "$i.name"
- #end for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Parses read counts from one or several sRBowtie alignments (in tabular, Sam or Bam format).
-
-Here, a bowtie match against an index composed of a set of items is parsed and expressed as a hit list over those items.
-
-Sense, antisense or both sense and antisense alignments can be counted
-
-The library labels are inferred from the input dataset names in the Galaxy history.
-
-**It is thus essential that input datasets are appropriately renamed**
-
-**It is preferable not to put any spaces in these input dataset names; you may edit the names in the history.**
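-
-A minimal sketch of the counting logic, assuming tabular bowtie output
-(read, strand, reference item, offset, sequence) and a hypothetical file name;
-the actual tool delegates this work to smRtools::
-
-    from collections import defaultdict
-
-    def count_hits(bowtie_tabular, polarity="both"):
-        """Count alignments per reference item, optionally restricted by strand."""
-        counts = defaultdict(int)
-        with open(bowtie_tabular) as f:
-            for line in f:
-                fields = line.split()
-                strand, item = fields[1], fields[2]
-                if (polarity == "both"
-                        or (polarity == "forward" and strand == "+")
-                        or (polarity == "reverse" and strand == "-")):
-                    counts[item] += 1
-        return counts
-
-    print(count_hits("sRbowtie.out"))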
-
-
-
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/size_histogram.py
--- a/mississippi_gcc/size_histogram.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-#!/usr/bin/python
-# python parser module for size distributions, guided by GFF3
-# version 0.9.1 (1-6-2014)
-# Usage: size_histogram.py with argparse options (see Parser() below), e.g.
-# --input --ext --label --normalization_factor --gff --minquery --maxquery
-# --rcode --output_size_distribution --global_size --collapse
-
-import sys, subprocess, argparse
-from smRtools import *
-from collections import OrderedDict, defaultdict
-import os
-
-def Parser():
- the_parser = argparse.ArgumentParser()
- the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
- the_parser.add_argument('--reference_fasta', action="store", type=str, help="reference fasta file")
- the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
- the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
- the_parser.add_argument('--ext',nargs='+', help="input file type")
- the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
- the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
- the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
- the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
- the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
- the_parser.add_argument('--rcode', type=str, help="R script")
- the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calculated for the sum of all items")
- the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed")
- args = the_parser.parse_args()
- return args
-
-args=Parser()
-if args.reference_fasta:
- genomeRefFormat = "fastaSource"
- genomeRefFile = args.reference_fasta
-if args.reference_bowtie_index:
- genomeRefFormat = "bowtieIndex"
- genomeRefFile = args.reference_bowtie_index
-size_distribution_file=args.output_size_distribution
-minquery=args.minquery
-maxquery=args.maxquery
-Rcode = args.rcode
-filePath=args.input
-fileExt=args.ext
-fileLabel=args.label
-normalization_factor=args.normalization_factor
-global_size=args.global_size
-collapse=args.collapse
-
-if collapse:
- pol=["both"]
-else:
- pol=["F", "R"]
-
-MasterListOfGenomes = OrderedDict()
-
-def process_samples(filePaths):
- for i, filePath in enumerate(filePaths):
- norm=normalization_factor[i]
- print fileLabel[i]
- MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
- biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
- return MasterListOfGenomes
-
-def write_size_distribution_dataframe(readDict, size_distribution_file):
- with open(size_distribution_file, 'w') as size_distrib:
- print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
- for sample in readDict.keys():
- if args.gff:
- item_dict=readDict[sample] # named to avoid shadowing the dict builtin
- else:
- item_dict=readDict[sample].instanceDict
- for gene in item_dict.keys():
- histogram = item_dict[gene].size_histogram()
- for polarity in histogram.keys():
- for item in histogram[polarity].iteritems():
- print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
-
-def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]):
- with open(size_distribution_file, 'w') as size_distrib:
- print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
- for sample in readDict.keys():
- histogram = readDict[sample].size_histogram()
- gene="sample"
- for polarity in pol:
- for item in histogram[polarity].iteritems():
- if polarity=="R":
- print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], -item[1], sample)
- else:
- print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, item[0], item[1], sample)
-
-def gff_item_subinstances(readDict, gff3):
- GFFinstanceDict=OrderedDict()
- with open(gff3) as gff:
- for line in gff:
- if line[0] == "#": continue
- gff_fields = line[:-1].split("\t")
- chrom = gff_fields[0]
- gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
- item_upstream_coordinate = int(gff_fields[3])
- item_downstream_coordinate = int(gff_fields[4])
- item_polarity = gff_fields[6]
- for sample in readDict.keys():
- if not GFFinstanceDict.has_key(sample):
- GFFinstanceDict[sample]={}
- subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
- if item_polarity == '-':
- subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
-# subinstance.readDict.setdefault(key, [])
- subinstance.gene=gff_name
- GFFinstanceDict[sample][gff_name]=subinstance
- return GFFinstanceDict
-
-MasterListOfGenomes=process_samples(filePath)
-
-if args.gff:
- MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
-
-if global_size:
- write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol)
-else:
- write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
-
-R_command="Rscript "+ Rcode
-process = subprocess.Popen(R_command.split())
-process.wait()
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/size_histogram.xml
--- a/mississippi_gcc/size_histogram.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,200 +0,0 @@
-
- from sRbowtie alignment
- bowtie-inspect
-
-
- size_histogram.py
- #if $refGenomeSource.genomeSource == "history":
- --reference_fasta ## sys.argv[2]
- $refGenomeSource.ownFile ## index source
- #else:
- #silent reference= filter( lambda x: str( x[0] ) == str( $refGenomeSource.series[0].input.dbkey ), $__app__.tool_data_tables[ 'bowtie_indexes' ].get_fields() )[0][-1]
- --reference_bowtie_index
- $reference
- #end if
- --rcode
- $plotCode
- --output_size_distribution
- $size_distribution_dataframe
- --minquery
- $minquery
- --maxquery
- $maxquery
- --input
- #for $i in $refGenomeSource.series
- $i.input
- #end for
- --ext
- #for $i in $refGenomeSource.series
- $i.input.ext
- #end for
- --label
- #for $i in $refGenomeSource.series
- "$i.input.name"
- #end for
- --normalization_factor
- #for $i in $refGenomeSource.series
- $i.norm
- #end for
- #if $gff:
- --gff
- $gff
- #end if
- #if $global.value == 'yes':
- --global_size
- #end if
- #if $collapsestrands.value == 'yes':
- --collapse
- #end if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ## Setup R error handling to go to stderr
- options( show.error.messages=F,
- error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
- library(RColorBrewer)
- library(lattice)
- library(latticeExtra)
- library(grid)
- library(gridExtra)
- ##cheetahtemplate data frame implementation
-
- size=read.delim("${size_distribution_dataframe}", header=T, row.names=NULL)
-
- n_samples=length(unique(size\$sample))
- genes=unique(levels(size\$gene))
- n_genes=length(genes)
-
- par.settings.size=list(layout.heights=list(top.padding=-1, bottom.padding=-3, strip = .75), fontsize = list(text=96/${rows_per_page}, points=8))
- smR.prepanel=function(x,y,...){; yscale=c(-max(abs(y)), max(abs(y)));list(ylim=yscale);}
-
- plot_size_distribution= function(df, ...) {
- bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample))+gene, data = df, origin = 0,
- horizontal=FALSE,
- group=polarity,
- stack=TRUE,
- col=c('red', 'blue'),
- strip = strip.custom(par.strip.text = list(cex = 0.5)),
- cex=0.75,
- scales=list(y=list(tick.number=4, rot=90, relation="free"), cex=0.75),
- xlab = "readsize in nucleotides",
- ylab = "${ylabel}",
- main="${title}" ,
- as.table=TRUE, newpage = T, ...)
- combineLimits(update(useOuterStrips(bc), layout=c(n_samples,${rows_per_page})), margin.x=F, margin.y=1)
- }
-
- per_gene_size=lapply(genes, function(x) subset(size, gene==x))
-
- global = "no"
- #if $global.value == 'yes':
- global = "yes"
- #end if
-
- if (global=="no") {
- options(warn=-1)
- pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677*n_samples/4)
- plot_size_distribution(size, par.settings=par.settings.size, prepanel=smR.prepanel)
- } else {
- pdf(file="${size_PDF}", paper="special", height=11.69, width=8.2677)
- bc= barchart(count~as.factor(size)|factor(sample, levels=unique(sample)), data = size, origin = 0,
- horizontal=FALSE,
- group=polarity,
- stack=TRUE,
- col=c('red', 'blue'),
- cex=0.75,
- par.settings=list(fontsize = list(text=8, points=8)),
- scales=list(y=list(tick.number=4, rot=90, relation="same"), cex=0.75),
- xlab = "readsize in nucleotides",
- ylab = "${ylabel}",
- main="${title}" , as.table=TRUE, newpage = T,
- aspect=0.5)
- #layout=c(n_samples, ${rows_per_page}))
- bc
- }
- devname=dev.off()
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Takes one or more alignment files (BAM, SAM or tabular bowtie output) as input and produces a histogram of read sizes;
-by default, one histogram is drawn per "chromosome".
-Reads that map in sense are on the top (red); reads that map antisense are on the bottom (blue).
-
-
-.. class:: warningmark
-
-'''TIP''' The input data can be produced using the sRbowtie tool.
-
-----
-
-'''Example'''
-
-For a SAM input such as the following::
-
- 5 16 2L_79 24393 255 17M * 0 0 CCTTCATCTTTTTTTTT IIIIIIIIIIIIIIIII XA:i:0 MD:Z:17 NM:i:0
-
- 11 0 2R_1 12675 255 21M * 0 0 AAAAAAAACGCGTCCTTGTGC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0
-
- 2 16 2L_5 669 255 23M * 0 0 TGTTGCTGCATTTCTTTTTTTTT IIIIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:23 NM:i:0
-
-the tool produces a plot like this:
-
-----
-
-.. image:: static/images/size_histogram.png
- :height: 800
- :width: 500
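-
-For illustration, a sketch of what the histogram counts from these three
-records (the tool itself parses SAM/BAM with pysam in size_histogram.py)::
-
-    sam_records = [("5", 16, "CCTTCATCTTTTTTTTT"),
-                   ("11", 0, "AAAAAAAACGCGTCCTTGTGC"),
-                   ("2", 16, "TGTTGCTGCATTTCTTTTTTTTT")]
-    for name, flag, seq in sam_records:
-        polarity = "R" if flag & 16 else "F"     # FLAG bit 0x10 = reverse strand
-        print("%s\t%s\t%d" % (name, polarity, len(seq)))
-    # sizes 17 and 23 nt map antisense (blue, plotted downward);
-    # 21 nt maps sense (red, plotted upward)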
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/smRtools.py
--- a/mississippi_gcc/smRtools.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,704 +0,0 @@
-#!/usr/bin/python
-# version 1 7-5-2012 unification of the SmRNAwindow class
-
-import sys, subprocess
-from collections import defaultdict
-from numpy import mean, median, std
-from scipy import stats
-
-def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
- '''This function will return a dictionary containing fasta identifiers as keys and the
- sequence as values. Index must be the path to a fasta file.'''
- p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
- outputlines = p.stdout.readlines()
- p.wait()
- item_dic = {}
- for line in outputlines:
- if (line[0] == ">"):
- try:
- item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
- except: pass
- current_item = line[1:].rstrip().split()[0] #take the first word before space because bowtie splits headers !
- item_dic[current_item] = ""
- stringlist=[]
- else:
- stringlist.append(line.rstrip() )
- item_dic[current_item] = "".join(stringlist) # for the last item
- return item_dic
-
-def get_fasta_headers (index):
- p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
- outputlines = p.stdout.readlines()
- p.wait()
- item_dic = {}
- for line in outputlines:
- header = line.rstrip().split()[0] #take the first word before space because bowtie splits headers !
- item_dic[header] = 1
- return item_dic
-
-
-def get_file_sample (file, numberoflines):
- '''import random to use this function'''
- F=open(file)
- fullfile = F.read().splitlines()
- F.close()
- if len(fullfile) < numberoflines:
- return "sample size exceeds file size"
- return random.sample(fullfile, numberoflines)
-
-def get_fasta_from_history (file):
- F = open (file, "r")
- item_dic = {}
- for line in F:
- if (line[0] == ">"):
- try:
- item_dic[current_item] = "".join(stringlist) # to dump the sequence of the previous item - try because of the keyerror of the first item
- except: pass
- current_item = line[1:-1].split()[0] #take the first word before space because bowtie splits headers !
- item_dic[current_item] = ""
- stringlist=[]
- else:
- stringlist.append(line[:-1])
- item_dic[current_item] = "".join(stringlist) # for the last item
- return item_dic
-
-def antipara (sequence):
- antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
- revseq = sequence[::-1]
- return "".join([antidict[i] for i in revseq])
-
-def RNAtranslate (sequence):
- return "".join([i if i in "AGCN" else "U" for i in sequence])
-def DNAtranslate (sequence):
- return "".join([i if i in "AGCN" else "T" for i in sequence])
-
-def RNAfold (sequence_list):
- thestring= "\n".join(sequence_list)
- p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- output=p.communicate(thestring)[0]
- p.wait()
- output=output.split("\n")
- if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
- buffer=[]
- for line in output:
- if line[0] in ["N","A","T","U","G","C"]:
- buffer.append(DNAtranslate(line))
- if line[0] in ["(",".",")"]:
- fields=line.split("(")
- energy= fields[-1]
- energy = energy[:-1] # remove the ) parenthesis
- energy=float(energy)
- buffer.append(str(energy))
- return dict(zip(buffer[::2], buffer[1::2]))
-
-def extractsubinstance (start, end, instance):
- ''' Testing whether this can be a function external to the class to save memory'''
- subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
- subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
- upcoordinate = [i for i in range(start,end+1) if instance.readDict.has_key(i) ]
- downcoordinate = [-i for i in range(start,end+1) if instance.readDict.has_key(-i) ]
- for i in upcoordinate:
- subinstance.readDict[i]=instance.readDict[i]
- for i in downcoordinate:
- subinstance.readDict[i]=instance.readDict[i]
- return subinstance
-
-class HandleSmRNAwindows:
- def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
- self.biosample = biosample
- self.alignmentFile = alignmentFile
- self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
- self.genomeRefFile = genomeRefFile
- self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
- self.alignedReads = 0
- self.instanceDict = {}
- self.size_inf=size_inf
- self.size_sup=size_sup
- self.norm=norm
- if genomeRefFormat == "bowtieIndex":
- self.itemDict = get_fasta (genomeRefFile)
- elif genomeRefFormat == "fastaSource":
- self.itemDict = get_fasta_from_history (genomeRefFile)
- for item in self.itemDict:
- self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create as many instances as there is items
- self.readfile()
-
- def readfile (self) :
- if self.alignmentFileFormat == "tabular":
- F = open (self.alignmentFile, "r")
- for line in F:
- fields = line.split()
- polarity = fields[1]
- gene = fields[2]
- offset = int(fields[3])
- size = len (fields[4])
- if self.size_inf:
- if (size>=self.size_inf and size<= self.size_sup):
- self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
- self.alignedReads += 1
- else:
- self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
- self.alignedReads += 1
- F.close()
- return self.instanceDict
-# elif self.alignmentFileFormat == "sam":
-# F = open (self.alignmentFile, "r")
-# dict = {"0":"+", "16":"-"}
-# for line in F:
-# if line[0]=='@':
-# continue
-# fields = line.split()
-# if fields[2] == "*": continue
-# polarity = dict[fields[1]]
-# gene = fields[2]
-# offset = int(fields[3])
-# size = len (fields[9])
-# if self.size_inf:
-# if (size>=self.size_inf and size<= self.size_sup):
-# self.instanceDict[gene].addread (polarity, offset, size)
-# self.alignedReads += 1
-# else:
-# self.instanceDict[gene].addread (polarity, offset, size)
-# self.alignedReads += 1
-# F.close()
- elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
- import pysam
- samfile = pysam.Samfile(self.alignmentFile)
- for read in samfile:
- if read.tid == -1:
- continue # filter out unaligned reads
- if read.is_reverse:
- polarity="-"
- else:
- polarity="+"
- gene = samfile.getrname(read.tid)
- offset = read.pos
- size = read.qlen
- if self.size_inf:
- if (size>=self.size_inf and size<= self.size_sup):
- self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
- self.alignedReads += 1
- else:
- self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
- self.alignedReads += 1
- return self.instanceDict
-
- def size_histogram (self):
- size_dict={}
- size_dict['F']= defaultdict (int)
- size_dict['R']= defaultdict (int)
- size_dict['both'] = defaultdict (int)
- for item in self.instanceDict:
- buffer_dict_F = self.instanceDict[item].size_histogram()['F']
- buffer_dict_R = self.instanceDict[item].size_histogram()['R']
- for size in buffer_dict_F:
- size_dict['F'][size] += buffer_dict_F[size]
- for size in buffer_dict_R:
- size_dict['R'][size] -= buffer_dict_R[size]
- allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
- for size in allSizeKeys:
- size_dict['both'][size] = size_dict['F'][size] + size_dict['R'][size]
- return size_dict
-
- def CountFeatures (self, GFF3="path/to/file"):
- featureDict = defaultdict(int)
- F = open (GFF3, "r")
- for line in F:
- if line[0] == "#": continue
- fields = line[:-1].split()
- chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
- featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
- F.close()
- return featureDict
-
-class SmRNAwindow:
-
- def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
- self.biosample = biosample
- self.sequence = sequence
- self.gene = gene
- self.windowoffset = windowoffset
- self.size = len(sequence)
- self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...}
- self.matchedreadsUp = 0
- self.matchedreadsDown = 0
- self.norm=norm
-
- def addread (self, polarity, offset, size):
- '''ATTENTION: we removed the conversion from 0- to 1-based offsets, as this is now done during read parsing.'''
- if polarity == "+":
- self.readDict[offset].append(size)
- self.matchedreadsUp += 1
- else:
- self.readDict[-(offset + size -1)].append(size)
- self.matchedreadsDown += 1
- return
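-
- # worked example of the signed-key convention above (illustration only):
- #   w = SmRNAwindow("toy", sequence="A"*200)
- #   w.addread("+", 100, 23)   -> stored under key +100 (5' end, plus strand)
- #   w.addread("-", 100, 23)   -> stored under key -(100 + 23 - 1) = -122,
- #                                the 5' end of the minus-strand read, negated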
-
- def barycenter (self, upstream_coord=None, downstream_coord=None):
- '''refactored 24-12-2013 to save memory and introduce offset filtering; see the readcount method for further discussion.
- In this version, attempt to replace the dictionary structure by a list of tuples to save memory too'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- window_size = downstream_coord - upstream_coord +1
- def weigthAverage (TuppleList):
- weightSum = 0
- PonderWeightSum = 0
- for tuple in TuppleList:
- PonderWeightSum += tuple[0] * tuple[1]
- weightSum += tuple[1]
- if weightSum > 0:
- return PonderWeightSum / float(weightSum)
- else:
- return 0
- forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both forward and in the proper offset window
- reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # both reverse and in the proper offset window
- Fbarycenter = (weigthAverage (forwardTuppleList) - upstream_coord) / window_size
- Rbarycenter = (weigthAverage (reverseTuppleList) - upstream_coord) / window_size
- return Fbarycenter, Rbarycenter
-
- def correlation_mapper (self, reference, window_size):
- '''to map correlation with a sliding window 26-2-2013'''
- if window_size > self.size:
- return []
- F=open(reference, "r")
- reference_forward = []
- reference_reverse = []
- for line in F:
- fields=line.split()
- reference_forward.append(int(float(fields[1])))
- reference_reverse.append(int(float(fields[2])))
- F.close()
- local_object_forward=[]
- local_object_reverse=[]
- ## Dict to list for the local object
- for i in range(1, self.size+1):
- local_object_forward.append(len(self.readDict[i]))
- local_object_reverse.append(len(self.readDict[-i]))
- ## start compiling results by slides
- results=[]
- for coordinate in range(self.size - window_size):
- local_forward=local_object_forward[coordinate:coordinate + window_size]
- local_reverse=local_object_reverse[coordinate:coordinate + window_size]
- if sum(local_forward) == 0 or sum(local_reverse) == 0:
- continue
- try:
- reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
- reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
- if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
- results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
- except:
- pass
- return results
-
- def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
- '''refactored 24-12-2013 to save memory and introduce offset filtering
- note that the default parameters cannot refer to the instance, as they are evaluated before instantiation;
- the trick is to pass None and then test for it
- polarity parameter can take "both", "forward" or "reverse" as value'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both":
- return self.matchedreadsUp + self.matchedreadsDown
- if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
- return self.matchedreadsUp
- if upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
- return self.matchedreadsDown
- n=0
- if polarity == "both":
- for offset in xrange(upstream_coord, downstream_coord+1):
- if self.readDict.has_key(offset):
- for read in self.readDict[offset]:
- if (read>=size_inf and read<= size_sup):
- n += 1
- if method != "conservative":
- del self.readDict[offset] ## careful! this precludes re-use of the self.readDict dictionary
- if self.readDict.has_key(-offset):
- for read in self.readDict[-offset]:
- if (read>=size_inf and read<= size_sup):
- n += 1
- if method != "conservative":
- del self.readDict[-offset]
- return n
- elif polarity == "forward":
- for offset in xrange(upstream_coord, downstream_coord+1):
- if self.readDict.has_key(offset):
- for read in self.readDict[offset]:
- if (read>=size_inf and read<= size_sup):
- n += 1
- return n
- elif polarity == "reverse":
- for offset in xrange(upstream_coord, downstream_coord+1):
- if self.readDict.has_key(-offset):
- for read in self.readDict[-offset]:
- if (read>=size_inf and read<= size_sup):
- n += 1
- return n
-
- def readsizes (self):
- '''return a dictionary of number of reads by size (the keys)'''
- dicsize = {}
- for offset in self.readDict:
- for size in self.readDict[offset]:
- dicsize[size] = dicsize.get(size, 0) + 1
- for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
- dicsize[size] = dicsize.get(size, 0) # to fill missing sizes with null values
- return dicsize
-
- def size_histogram(self):
- norm=self.norm
- hist_dict={}
- hist_dict['F']={}
- hist_dict['R']={}
- for offset in self.readDict:
- for size in self.readDict[offset]:
- if offset < 0:
- hist_dict['R'][size] = hist_dict['R'].get(size, 0) - 1*norm
- else:
- hist_dict['F'][size] = hist_dict['F'].get(size, 0) + 1*norm
- return hist_dict
-
- def statsizes (self, upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates
- see the readcount method for further discussion'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- L = []
- for offset in self.readDict:
- if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
- for size in self.readDict[offset]:
- L.append(size)
- meansize = mean(L)
- stdv = std(L)
- mediansize = median(L)
- return meansize, mediansize, stdv
-
- def foldEnergy (self, upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates
- see the readcount method for further discussion'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
- return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])
-
- def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates
- see the readcount method for further discussion. size_scope must be an iterable'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
- convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
- for offset in self.readDict:
- if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
- for size in self.readDict[offset]:
- if size in size_scope:
- startbase = self.sequence[abs(offset)-self.windowoffset]
- if offset < 0:
- startbase = convertDic[startbase]
- freqDic[startbase] += 1
- base_sum = float ( sum( freqDic.values()) )
- if base_sum == 0:
- return "."
- else:
- return freqDic["T"] / base_sum * 100
-
- def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates
- see the readcount method for further discussion. size_scope must be an iterable
- This method is similar to the Ufreq method but take strandness into account'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
- convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
- for offset in self.readDict:
- if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
- for size in self.readDict[offset]:
- if size in size_scope:
- startbase = self.sequence[abs(offset)-self.windowoffset]
- if offset < 0:
- startbase = convertDic[startbase]
- freqDic[startbase+"rev"] += 1
- else:
- freqDic[startbase+"for"] += 1
- forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
- reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
- if forward_sum == 0 and reverse_sum == 0:
- return ". | ."
- elif reverse_sum == 0:
- return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
- elif forward_sum == 0:
- return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
- else:
- return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)
-
-
- def readplot (self):
- norm=self.norm
- readmap = {}
- for offset in self.readDict.keys():
- readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm )
- mylist = []
- for offset in sorted(readmap):
- if readmap[offset][1] != 0:
- mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
- if readmap[offset][0] != 0:
- mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
- return mylist
-
- def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
- '''Used by the MirParser tool'''
- upstream_coord = upstream_coord or 1
- downstream_coord = downstream_coord or self.size
- windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
- forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+2)]) # one slot per position, 1 through the window length inclusive
- totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
- totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
- if totalforward > totalreverse:
- majorcoverage = "forward"
- for offset in self.readDict.keys():
- if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
- for read in self.readDict[offset]:
- for i in xrange(read):
- try:
- forORrev_coverage[offset-upstream_coord+1+i] += 1
- except KeyError:
- continue # a sense read may span over the downstream limit
- else:
- majorcoverage = "reverse"
- for offset in self.readDict.keys():
- if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
- for read in self.readDict[offset]:
- for i in xrange(read):
- try:
- forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
- except KeyError:
- continue # an antisense read may span over the upstream limit
- output_list = []
- maximum = max (forORrev_coverage.values()) or 1
- for n in sorted (forORrev_coverage):
- output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
- return "\n".join(output_list)
-
-
- def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates
- see the readcount method for further discussion
- scope must be a python iterable; scope define the *relative* offset range to be computed'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- query_range = range (minquery, maxquery+1)
- target_range = range (mintarget, maxtarget+1)
- Query_table = {}
- Target_table = {}
- frequency_table = dict ([(i, 0) for i in scope])
- for offset in self.readDict:
- if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
- for size in self.readDict[offset]:
- if size in query_range:
- Query_table[offset] = Query_table.get(offset, 0) + 1
- if size in target_range:
- Target_table[offset] = Target_table.get(offset, 0) + 1
- for offset in Query_table:
- for i in scope:
- frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
- if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
- frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
- if zscore == "yes":
- z_mean = mean(frequency_table.values() )
- z_std = std(frequency_table.values() )
- if z_std == 0:
- frequency_table = dict([(i,0) for i in frequency_table] )
- else:
- frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
- return frequency_table
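-
- # worked example of the target lookup above (illustration only): for a relative
- # overlap i, a query stored at key k pairs with targets at key -k - i + 1.
- # With a query 5' end at +100 and i = 10, targets are read from key
- # -100 - 10 + 1 = -109, i.e. reverse reads whose 5' end sits at position 109,
- # a 10-nt 5' overlap (the classical ping-pong signal).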
-
- def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
- ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion
- note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed'''
- upstream_coord = upstream_coord or self.windowoffset
- downstream_coord = downstream_coord or self.windowoffset+self.size-1
- query_range = range (minquery, maxquery+1)
- target_range = range (mintarget, maxtarget+1)
- Query_table = {}
- Target_table = {}
- Total_Query_Numb = 0
- general_frequency_table = dict ([(i,0) for i in scope])
- ## filtering the appropriate reads for the study
- for offset in self.readDict:
- if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
- for size in self.readDict[offset]:
- if size in query_range:
- Query_table[offset] = Query_table.get(offset, 0) + 1
- Total_Query_Numb += 1
- if size in target_range:
- Target_table[offset] = Target_table.get(offset, 0) + 1
- for offset in Query_table:
- frequency_table = dict ([(i,0) for i in scope])
- number_of_targets = 0
- for i in scope:
- frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
- number_of_targets += Target_table.get(-offset -i +1, 0)
- for i in scope:
- try:
- general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
- except ZeroDivisionError :
- continue
- return general_frequency_table
-
- def phasing (self, size_range, scope):
- ''' to calculate an autocorrelation-like signal - scope must be a python iterable'''
- read_table = {}
- total_read_number = 0
- general_frequency_table = dict ([(i, 0) for i in scope])
- ## read input filtering
- for offset in self.readDict:
- for size in self.readDict[offset]:
- if size in size_range:
- read_table[offset] = read_table.get(offset, 0) + 1
- total_read_number += 1
- ## per-offset read phasing computation
- for offset in read_table:
- frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
- number_of_targets = 0
- for i in scope:
- if offset > 0:
- frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
- number_of_targets += read_table.get(offset + i, 0)
- else:
- frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
- number_of_targets += read_table.get(offset - i, 0)
- ## inclusion of local frequency table in the general frequency table (all offsets average)
- for i in scope:
- try:
- general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
- except ZeroDivisionError :
- continue
- return general_frequency_table
-
-
-
- def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
- '''requires "from numpy import mean, std"; scope must be a python iterable and defines the relative offsets to compute'''
- frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
- z_table = {}
- frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
- if std(frequency_list):
- meanlist = mean(frequency_list)
- stdlist = std(frequency_list)
- z_list = [(i-meanlist)/stdlist for i in frequency_list]
- return dict (zip (sorted(frequency_table), z_list) )
- else:
- return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )
-
- def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
- frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
- total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery+1)+range(mintarget,maxtarget+1))]) ) # inclusive bounds, consistent with query_range and target_range in signature
- if total == 0:
- return dict( [(i,0) for i in scope])
- return dict( [(i, frequency_table[i]/total*100) for i in scope])
-
- def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
- queryhash = defaultdict(list)
- targethash = defaultdict(list)
- query_range = range (int(minquery), int(maxquery)+1)
- target_range = range (int(mintarget), int(maxtarget)+1)
- paired_sequences = []
- for offset in self.readDict: # selection of data
- for size in self.readDict[offset]:
- if size in query_range:
- queryhash[offset].append(size)
- if size in target_range:
- targethash[offset].append(size)
- for offset in queryhash:
- matched_offset = -offset - overlap + 1 # the same formula holds for both polarities
- if targethash[matched_offset]:
- paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
- if offset >= 0:
- for i in range (paired):
- paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
- paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
- if offset < 0:
- for i in range (paired):
- paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
- paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
- return paired_sequences
-
- def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
- queryhash = defaultdict(list)
- targethash = defaultdict(list)
- query_range = range (int(minquery), int(maxquery)+1)
- target_range = range (int(mintarget), int(maxtarget)+1)
- paired_sequences = []
-
- for offset in self.readDict: # selection of data
- for size in self.readDict[offset]:
- if size in query_range:
- queryhash[offset].append(size)
- if size in target_range:
- targethash[offset].append(size)
-
- for offset in queryhash:
- matched_offset = -offset - overlap + 1
- if targethash[matched_offset]:
- if offset >= 0:
- for i in queryhash[offset]:
- paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
- for i in targethash[matched_offset]:
- paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
- if offset < 0:
- for i in queryhash[offset]:
- paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
- for i in targethash[matched_offset]:
- paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
- return paired_sequences
-
- def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
- ''' revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie uses 1-based coordinates whereas python strings are 0-based)'''
- queryhash = defaultdict(list)
- targethash = defaultdict(list)
- query_range = range (int(minquery), int(maxquery)+1)
- target_range = range (int(mintarget), int(maxtarget)+1)
- bowtie_output = []
-
- for offset in self.readDict: # selection of data
- for size in self.readDict[offset]:
- if size in query_range:
- queryhash[offset].append(size)
- if size in target_range:
- targethash[offset].append(size)
- counter = 0
- for offset in queryhash:
- matched_offset = -offset - overlap + 1
- if targethash[matched_offset]:
- if offset >= 0:
- for i in queryhash[offset]:
- counter += 1
- bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # attention a la base 1-0 de l'offset
- if offset < 0:
- for i in queryhash[offset]:
- counter += 1
- bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # attention a la base 1-0 de l'offset
- return bowtie_output
-
-
-def __main__(bowtie_index_path, bowtie_output_path):
- sequenceDic = get_fasta (bowtie_index_path)
- objDic = {}
- F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
- for line in F:
- fields = line.split()
- polarity = fields[1]
- gene = fields[2]
- offset = int(fields[3])
- size = len (fields[4])
- try:
- objDic[gene].addread (polarity, offset, size)
- except KeyError:
- objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
- objDic[gene].addread (polarity, offset, size)
- F.close()
- for gene in objDic:
- print gene, objDic[gene].pairer(19,19,23,19,23)
-
-if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/test-data/sRbowtie.fa
--- a/mississippi_gcc/test-data/sRbowtie.fa Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
->1
-GTATTGTAAGTGGCAGAGTGGC
->2
-TGGAATGTAAAGAAGTATGGAG
->3
-GTGGGGAGTTTGGATGGGGCGGCA
->4
-AATGGCACTGGAAGAATTCACGG
->5
-GTACGGACAAGGGGAATC
->6
-TTGGGTTCTGGGGGGAGTATGG
->7
-GTGGGGAGTTTCGCTGGGGCGGCA
->8
-TAAGGGTCGGGTAGTGAGGGC
->9
-AGCTGGGACTGAGGACTG
->10
-AGCTGGGACTGAGGACTGC
->11
-AAGGGTCGGGTAGTGAGG
->12
-GTCGGGTAGTGAGGGCCTT
->13
-TGGTGGGGCTTGGAACAATTGGAGGGC
->14
-TGACGGAAGGGCACCACC
->15
-TGGAATGTAAAGAAGTATGGAG
->16
-TTGGGTTCTGGGGGGAGT
->17
-TCAGGTGGGGAGTTTGGCTGGGGCGGCACA
->18
-TTGGGTATAGGGGCGAAAGA
->19
-AGCGGGCGTGCTTGTGGAC
->20
-GTCGAATTTGGGTATAGGGGC
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/test-data/sRbowtie.out
--- a/mississippi_gcc/test-data/sRbowtie.out Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-2 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG
-4 - 2L 11953463 CCGTGAATTCTTCCAGTGCCATT
-15 + 2L 20487495 TGGAATGTAAAGAAGTATGGAG
-14 - Uextra 7115665 GGTGGTGCCCTTCCGTCA
-18 + Uextra 7726410 TTGGGTATAGGGGCGAAAGA
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/test-data/yac.fastq
--- a/mississippi_gcc/test-data/yac.fastq Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-@SRR290479.1 HWI-EAS285:2:1:66:28/1
-TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
-+
-B@BBCBCCBCBCCC8A<@##################
-@SRR290479.2 HWI-EAS285:2:1:67:348/1
-AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG
-+
-BAA@7?A@@A@@B<'25?6>59:;7#########
-@SRR290479.3 HWI-EAS285:2:1:68:826/1
-TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG
-+
-BB@BBCCBCCBBB:%%83/>B7@44#;;324'117?
-@SRR290479.4 HWI-EAS285:2:1:68:65/1
-ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT
-+
-BBB@@ABAAB?9B42&9;##################
-@SRR290479.5 HWI-EAS285:2:1:69:594/1
-AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG
-+
-AB?5;3>/=?>=;416481#################
-@SRR290479.6 HWI-EAS285:2:1:70:700/1
-TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT
-+
-BCB=:ACCBB=>BB8<-###################
-@SRR290479.7 HWI-EAS285:2:1:70:1679/1
-TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC
-+
-BCBBCCBCCCBCCA?AB>:B@><>############
-@SRR290479.8 HWI-EAS285:2:1:71:1400/1
-AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT
-+
-7@BC>>@55CCBCA3CBA14B.A16#*;9359B###
-@SRR290479.9 HWI-EAS285:2:1:71:795/1
-TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG
-+
-BBBBBCBBCB;>AA',9=18?1:7:#<;57######
-@SRR290479.10 HWI-EAS285:2:1:71:596/1
-TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
-+
-B@B>7>9A@<46B@79972#################
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/test-data/yac.out
--- a/mississippi_gcc/test-data/yac.out Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
->1
-TGTAAACATCCCCGACTGGCAGC
->2
-AAAGTGCTACTACTTTTGAGTCT
->3
-ACTGGACTTGGAGTCCGAAGGC
->4
-AAGTGCCGCCAGGTTTTGAGTGG
->5
-TATTGCACTTGTCCCGGCCTGAATCNCGT
->6
-TAGCTTATCAGACTGATGTTGAC
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/tool_data_table_conf.xml.sample
--- a/mississippi_gcc/tool_data_table_conf.xml.sample Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,112 +0,0 @@
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, formats, name, path
-
-
-
-
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- name, value, dbkey, species
-
-
-
-
- value, dbkey, name, path
-
-
-
-
-
-
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path, tools_valid_for
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
- value, dbkey, name, path
-
-
-
-
-
-
- value, dbkey, name, path
-
-
-
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/yac.py
--- a/mississippi_gcc/yac.py Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-#!/usr/bin/python
-# yac = yet another clipper
-# v 1.0.0
-# Usage yac.py $input $output $adapter_to_clip $min $max $Nmode
-# Christophe Antoniewski
-
-import sys, string
-
-class Clip:
- def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
- self.inputfile = inputfile
- self.outputfile = outputfile
- self.adapter = adapter
- self.minsize = int(minsize)
- self.maxsize = int(maxsize)
- def motives (sequence):
- '''return a list of motifs for a perfect (6 nt) or imperfect (7 nt with one mismatch) adapter search'''
- sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match
- dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"}
- for pos in enumerate(sequence[:6]):
- for subst in dicsubst[pos[1]]:
- sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7])
- return sequencevariants
- self.adaptmotifs= motives(self.adapter)
-
- def scanadapt(self, adaptmotives=[], sequence=""):
- '''scans sequence for adapter motives'''
- if sequence.rfind(adaptmotives[0]) != -1:
- return sequence[:sequence.rfind(adaptmotives[0])]
- for motif in adaptmotives[1:]:
- if sequence.rfind(motif) != -1:
- return sequence[:sequence.rfind(motif)]
- return sequence
-
- def clip_with_N (self):
- '''clips adapter sequences from inputfile.
- Reads containing N are retained.'''
- iterator = 0
- id = 0
- F = open (self.inputfile, "r")
- O = open (self.outputfile, "w")
- for line in F:
- iterator += 1
- if iterator % 4 == 2:
- trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
- if self.minsize <= len(trim) <= self.maxsize:
- id += 1
- print >> O, ">%i\n%s" % (id, trim)
- F.close()
- O.close()
- def clip_without_N (self):
- '''clips adapter sequences from inputfile.
- Reads containing N are rejected.'''
- iterator = 0
- id = 0
- F = open (self.inputfile, "r")
- O = open (self.outputfile, "w")
- for line in F:
- iterator += 1
- if iterator % 4 == 2:
- trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
- if "N" in trim: continue
- if self.minsize <= len(trim) <= self.maxsize:
- id += 1
- print >> O, ">%i\n%s" % (id, trim)
- F.close()
- O.close()
-
-def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
- instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize)
- if Nmode == "accept":
- instanceClip.clip_with_N()
- else:
- instanceClip.clip_without_N()
-
-if __name__ == "__main__" :
- input = sys.argv[1]
- output = sys.argv[2]
- adapter = sys.argv[3]
- minsize = sys.argv[4]
- maxsize = sys.argv[5]
- Nmode = sys.argv[6]
- __main__(input, output, adapter, minsize, maxsize, Nmode)
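-
-# minimal usage sketch (illustration only; the adapter sequence and file names
-# are hypothetical). Nmode "accept" keeps reads containing N, anything else
-# rejects them:
-#   python yac.py reads.fastq clipped.fa TGGAATTCTCGGGTGCCAAGG 18 30 accept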
diff -r f777cbc82f98 -r f9032a866675 mississippi_gcc/yac.xml
--- a/mississippi_gcc/yac.xml Tue Jun 24 10:58:46 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-
-
- yac.py $input $output $clip_source.clip_sequence $min $max $Nmode
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-This tool clips adapter sequences from a fastq file and returns a fasta file of clipped reads with renumbered fasta headers.
-
-Clipped sequences with Ns can be discarded.
-
-The min size and max size parameters filter clipped reads by size.
-
-Note that unclipped reads that satisfy the min and max size conditions are kept.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-