Mercurial > repos > mzytnicki > s_mart

Binary file SMART/Java/Python/__init__.pyc has changed
--- a/SMART/Java/Python/clusterizeBySlidingWindows.py	Fri Jan 10 09:04:03 2014 -0500
+++ b/SMART/Java/Python/clusterizeBySlidingWindows.py	Thu Jan 30 08:54:40 2014 -0500
@@ -28,317 +28,196 @@
 # The fact that you are presently reading this means that you have had
 # knowledge of the CeCILL license and that you accept its terms.
 #
-import re
-from commons.core.writer.WriterChooser import WriterChooser
 """
 Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.
 """

-import os, os.path
+import os, os.path, re
 from optparse import OptionParser
 from SMART.Java.Python.structure.Transcript import Transcript
-from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
-from SMART.Java.Python.misc.RPlotter import RPlotter
 from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.WriterChooser import WriterChooser
 from commons.core.writer.Gff3Writer import Gff3Writer

 class ClusterizeBySlidingWindows(object):

-    def __init__(self, verbosity = 0):
-        self.verbosity = verbosity
-        self.strands   = (0, )
-        self.normalize = False
-        self.plot      = None
-        self.excel     = None
-        self.outputFileName = ''
-        self.defaultValue = None
-
-    def __del__(self):
-        pass
-
-    def setInputFile(self, fileName, format):
-        self.parser = TranscriptContainer(fileName, format, self.verbosity)
-
-    def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
-        writerChooser = WriterChooser(self.verbosity)
-        writerChooser.findFormat(format)
-        self.writer = writerChooser.getWriter(fileName)
-        self.writer.setTitle(title)
-        self.writer.setFeature(feature)
-        self.writer.setFeaturePart(featurePart)
-#        self.outputFileName = fileName
-#        self.outputFormat = format
-
-    def setWindowSize(self, size):
-        self.size = size
+	def __init__(self, verbosity = 0):
+		self.verbosity	    = verbosity
+		self.strands		= (0, )
+		self.outputFileName = ''
+		self.defaultValue	= self.tag = self.operation = None
+		self.outputTagNames = self.size = self.overlap = None  # compared with None in setInputFile()/checkOptions(), so they must exist before any setter is called
+		self.valuesPerStrand = {}

-    def setWindowOverlap(self, overlap):
-        self.overlap = overlap
-
-    def setTag(self, tag):
-        self.tag = tag
-
-    def setOperation(self, operation):
-        self.operation = operation
-
-    def setBothStrands(self, bothStrands):
-        if bothStrands:
-            self.strands = (-1, 1)
+	def setInputFile(self, fileNames, format):
+		parserChooser = ParserChooser(self.verbosity)
+		parserChooser.findFormat(format)
+		if self.outputTagNames is None:
+			if len(fileNames) == 1:
+				self.outputTagNames = ["nbElements"]
+			else:
+				operation = "nbElements" if self.operation is None else self.operation.lower()
+				self.outputTagNames = ["%s%s" % (operation, os.path.splitext(os.path.basename(fileName))[0].title()) for fileName in fileNames]
+		self.parsers = dict(zip(self.outputTagNames, [parserChooser.getParser(fileName) for fileName in fileNames]))

-    def setNormalize(self, normalize):
-        self.normalize = normalize
-
-    def setPlot(self, plot):
-        self.plot = plot
+	def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
+		writerChooser = WriterChooser(self.verbosity)
+		writerChooser.findFormat(format)
+		self.writer = writerChooser.getWriter(fileName)
+		self.writer.setTitle(title)
+		self.writer.setFeature(feature)
+		self.writer.setFeaturePart(featurePart)

-    def setExcel(self, excel):
-        self.excel = excel
+	def setWindowSize(self, size):
+		self.size = size

-    def setOutputTag(self, tag):
-        self.outputTagName = tag
-
-    def setDefaultValue(self, defaultValue):
-        self.defaultValue = defaultValue
+	def setWindowOverlap(self, overlap):
+		self.overlap = overlap

-    def checkOptions(self):
-#        if self.operation != None:
-#            raise Exception("Trying to combine the values without specifying tag! Aborting...")
-        if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
-            raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))
+	def setTag(self, tag):
+		self.tag = tag
+
+	def setOperation(self, operation):
+		self.operation = operation

-    def getChromosomeSizes(self):
-        self.sizes = {}
-        progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
-        for transcript in self.parser.getIterator():
-            self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
-            progress.inc()
-        progress.done()
+	def setBothStrands(self, bothStrands):
+		if bothStrands:
+			self.strands = (-1, 1)

-    def getBinsFromPos(self, pos):
-        bin = (pos - 1) / (self.size - self.overlap)
-        if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
-            return (bin - 1, bin)
-        return (bin, )
-
-    def getPosFromBin(self, bin):
-        return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)
+	def setOutputTag(self, tags):
+		self.outputTagNames = tags
+
+	def setDefaultValue(self, defaultValue):
+		self.defaultValue = defaultValue

-    def initializeBins(self):
-        self.binsPerStrand        = {}
-        self.sumsPerStrand        = {}
-        self.valuesPerStrand      = {}
-        self.toBePlottedPerStrand = {}
-        for strand in self.strands:
-            self.binsPerStrand[strand]        = {}
-            self.sumsPerStrand[strand]        = {}
-            self.valuesPerStrand[strand]      = {}
-            self.toBePlottedPerStrand[strand] = {}
-            for chromosome in self.sizes:
-                binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
-                self.binsPerStrand[strand][chromosome]        = dict([[i, 0]   for i in binRange])
-                self.sumsPerStrand[strand][chromosome]        = dict([[i, 0.0] for i in binRange])
-                self.valuesPerStrand[strand][chromosome]      = dict([[i, []]  for i in binRange])
-                self.toBePlottedPerStrand[strand][chromosome] = dict([[i, 0] for i in binRange])
+	def checkOptions(self):
+		if self.operation != None and self.operation not in ("sum", "avg", "med", "min", "max"):
+			raise Exception("Do not understand tag '%s'! Aborting..." % (self.operation))
+		if self.size is None:
+			raise Exception("Please mention a windows size! Aborting...")
+		if self.overlap is None:
+			raise Exception("Please mention a windows overlap size! Aborting...")

-    def getNbElements(self, transcript):
-        nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
-        nbElements    = 1 if "nbElements"    not in transcript.getTagNames() else transcript.getTagValue("nbElements")
-        nbOccurrences = float(nbOccurrences)
-        nbElements = float(nbElements)
-        nbElements /= float(nbOccurrences)
-        return nbElements
+	def getBinsFromPos(self, pos):
+		bin = (pos - 1) / (self.size - self.overlap)
+		if bin >= 1 and pos <= bin * (self.size - self.overlap) + self.overlap:
+			return (bin - 1, bin)
+		return (bin, )

-    def setBins(self):
-        progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
-        for transcript in self.parser.getIterator():
-            nbElements = self.getNbElements(transcript)
-            strand     = transcript.getDirection() if len(self.strands) == 2 else 0
-            for bin in self.getBinsFromPos(transcript.getStart()):
-                self.binsPerStrand[strand][transcript.getChromosome()][bin] += nbElements
-                if self.tag != None:
-                    if self.tag not in transcript.getTagNames():
-                        if self.defaultValue is None:
-                            raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
-                        value = self.defaultValue
-                    else:
-                        value = float(transcript.getTagValue(self.tag))
-                    self.sumsPerStrand[strand][transcript.getChromosome()][bin] += value
-                    self.valuesPerStrand[strand][transcript.getChromosome()][bin].append(value)
-            progress.inc()
-        progress.done()
+	def getPosFromBin(self, bin):
+		return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)

-    def aggregateData(self):
-        if self.operation == "sum":
-            self.computeSumData()
-        elif self.operation == "avg":
-            self.computeAvgData()
-        elif self.operation == "med":
-            self.computeMedData()
-        elif self.operation == "min":
-            self.computeMinData()
-        elif self.operation == "max":
-            self.computeMaxData()
-        elif self.operation == "GCpercent":
-            self.computeGCPercent()
-        else:
-            self.toBePlottedPerStrand = self.binsPerStrand
-
-    def computeSumData(self):
-        self.toBePlottedPerStrand = self.sumsPerStrand
+	def getNbElements(self, transcript):
+		nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
+		nbElements	= 1 if "nbElements"	not in transcript.getTagNames() else transcript.getTagValue("nbElements")
+		nbOccurrences = float(nbOccurrences)
+		nbElements	= float(nbElements)
+		nbElements   /= float(nbOccurrences)
+		return nbElements

-    def computeAvgData(self):
-        for strand in self.strands:
-            for chromosome in self.binsPerStrand[strand]:
-                for bin in self.binsPerStrand[strand][chromosome]:
-                    if self.binsPerStrand[strand][chromosome][bin] != 0:
-                        self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]
-
-    def computeMedData(self):
-        for strand in self.strands:
-            for chromosome in self.binsPerStrand[strand]:
-                for bin in self.binsPerStrand[strand][chromosome]:
-                    if self.valuesPerStrand[strand][chromosome][bin]:
-                        self.valuesPerStrand[strand][chromosome][bin].sort()
-                        size = len(self.valuesPerStrand[strand][chromosome][bin])
-                        if size % 2 == 1:
-                            self.toBePlottedPerStrand[strand][chromosome][bin] = self.valuesPerStrand[strand][chromosome][bin][(size - 1) / 2]
-                        else:
-                            self.toBePlottedPerStrand[strand][chromosome][bin] = (self.valuesPerStrand[strand][chromosome][bin][size / 2 - 1] + self.valuesPerStrand[strand][chromosome][bin][size / 2]) / 2.0
-
-    def computeMinData(self):
-        for strand in self.strands:
-            for chromosome in self.binsPerStrand[strand]:
-                for bin in self.binsPerStrand[strand][chromosome]:
-                    if self.valuesPerStrand[strand][chromosome][bin]:
-                        self.toBePlottedPerStrand[strand][chromosome][bin] = min(self.valuesPerStrand[strand][chromosome][bin])
+	def setBins(self, name, parser):
+		"""Record one value per transcript of parser in each window containing the transcript start.
+		Values are appended to self.valuesPerStrand[strand][chromosome][bin][name]. The value is the
+		result of getNbElements() when no input tag is set, else the float value of that tag, else the
+		default value; an Exception is raised when the tag is missing and no default value is set."""
+		progress = UnlimitedProgress(10000, "Setting bins of file %s" % (parser.fileName), self.verbosity)
+		for transcript in parser.getIterator():
+			nbElements = self.getNbElements(transcript)
+			strand	   = transcript.getDirection() if len(self.strands) == 2 else 0
+			chromosome = transcript.getChromosome()
+			perStrand     = self.valuesPerStrand.setdefault(strand, {})
+			perChromosome = perStrand.setdefault(chromosome, {})
+			# getBinsFromPos() returns two bins when the start lies in the overlap of two windows.
+			for bin in self.getBinsFromPos(transcript.getStart()):
+				if self.tag is None:
+					value = nbElements
+				else:
+					# The tag name is the attribute self.tag; a bare "tag" is not defined in this scope.
+					if self.tag in transcript.getTagNames():
+						value = float(transcript.getTagValue(self.tag))
+					else:
+						if self.defaultValue is None:
+							raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
+						value = self.defaultValue
+				perChromosome.setdefault(bin, {}).setdefault(name, []).append(value)
+			progress.inc()
+		progress.done()

-    def computeMaxData(self):
-        for strand in self.strands:
-            for chromosome in self.binsPerStrand[strand]:
-                for bin in self.binsPerStrand[strand][chromosome]:
-                    if self.valuesPerStrand[strand][chromosome][bin]:
-                        self.toBePlottedPerStrand[strand][chromosome][bin] = max(self.valuesPerStrand[strand][chromosome][bin])
-
-    def computeGCPercent(self):
-        for strand in self.strands:
-            for chromosome in self.binsPerStrand[strand]:
-                for bin in self.binsPerStrand[strand][chromosome]:
-                    if self.valuesPerStrand[strand][chromosome][bin]:
-                        subSequence = self.valuesPerStrand[strand][chromosome][bin]
-                        NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
-                        if NPercent >= 50:
-                            currentGCpercent = "NA"
-                        else:
-                            currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()
-
-                        self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent
-        #TODO: see if a map method could be used for the various "compute" methods
-        #return currentGCpercent, NPercent
-
-    def plotData(self):
-        if self.plot != None:
-            for strand in self.strands:
-                adjunct = ""
-                if strand != 0:
-                    adjunct = "Strand%d" % (strand)
-                for chromosome in self.toBePlottedPerStrand[strand]:
-                    if len(self.toBePlottedPerStrand[strand][chromosome].keys()) > 0:
-                        plotter = RPlotter(self.plot, self.verbosity)
-                        plotter.setFill(0)
-                        plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
-                        plotter.plot()
+	def aggregateData(self, values):
+		"""Combine the values of one window with self.operation (None means "sum"); an unknown operation returns None."""
+		if self.operation is None or self.operation == "sum":
+			return sum(values)
+		if self.operation == "avg":
+			return sum(values) / float(len(values))
+		if self.operation == "med":
+			sortedValues = sorted(values)
+			middle		 = len(sortedValues) // 2  # floor division keeps the index an int
+			if len(sortedValues) % 2 == 1:
+				return sortedValues[middle]
+			return (sortedValues[middle - 1] + sortedValues[middle]) / 2.0
+		if self.operation == "min":
+			return min(values)
+		if self.operation == "max":
+			return max(values)
+		if self.operation == "GCpercent":
+			# NOTE(review): values must offer countNt()/getSize() here; checkOptions() rejects "GCpercent", so run() never reaches this branch.
+			NPercent = 100 * (values.countNt("N") / float(values.getSize()))
+			if NPercent >= 50:
+				return "NA"
+			return values.getGCpercentageInSequenceWithoutCountNInLength()
-    def writeExcel(self):
-        if self.excel != None:
-            excelFile = open(self.excel, "w")
-            for strand in self.strands:
-                maxBin = max([max(self.toBePlottedPerStrand[strand][chromosome].keys()) for chromosome in self.binsPerStrand[strand]])
-                for bin in range(0, maxBin + 1):
-                    excelFile.write(",%d-%d" % self.getPosFromBin(bin))
-                excelFile.write("\n")
-                for chromosome in self.toBePlottedPerStrand[strand]:
-                    excelFile.write("%s" % (chromosome))
-                    for bin in self.toBePlottedPerStrand[strand][chromosome]:
-                        excelFile.write(",%f" % (self.toBePlottedPerStrand[strand][chromosome][bin]))
-                    excelFile.write("\n")
-            excelFile.close()
+	def printRegions(self):  # write one region per window stored in self.valuesPerStrand, then close the writer
+		cpt		   = 1
+		for strand in self.strands:
+			for chromosome in sorted(self.valuesPerStrand.get(strand, {})):  # a strand without any data is skipped instead of raising KeyError
+				for bin in sorted(self.valuesPerStrand[strand][chromosome]):
+					transcript = Transcript()
+					transcript.setName("region%d" % cpt)
+					transcript.setChromosome(chromosome)
+					transcript.setStart(self.getPosFromBin(bin)[0])
+					transcript.setEnd(self.getPosFromBin(bin)[1])
+					transcript.setDirection(1 if strand == 0 else strand)
+					for name in self.valuesPerStrand[strand][chromosome][bin]:
+						transcript.setTagValue(name, self.aggregateData(self.valuesPerStrand[strand][chromosome][bin][name]))
+					self.writer.addTranscript(transcript)
+					cpt += 1
+		self.writer.close()

-    def printRegions(self):
-        cpt           = 1
-        tagOp         = "nb"
-        tagName       = "Elements"
-        outputTagName = "nbElements"
-        if self.operation != None:
-            tagOp = self.operation.lower()
-        if self.tag != None:
-            tagName = self.tag.title()
-        if self.outputTagName != None:
-            outputTagName = self.outputTagName
-
-
-        #writer = Gff3Writer(self.outputFileName, self.verbosity)
-
-        for strand in self.strands:
-            for chromosome in self.toBePlottedPerStrand[strand]:
-                for bin in self.toBePlottedPerStrand[strand][chromosome]:
-                    transcript = Transcript()
-                    transcript.setName("region%d" % cpt)
-                    transcript.setChromosome(chromosome)
-                    transcript.setStart(self.getPosFromBin(bin)[0])
-                    transcript.setEnd(self.getPosFromBin(bin)[1])
-                    transcript.setDirection(1 if strand == 0 else strand)
-                    transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
-                    transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
-                    self.writer.addTranscript(transcript)
-                    cpt += 1
-        self.writer.close()
-
-    def run(self):
-        self.checkOptions()
-        self.getChromosomeSizes()
-        self.initializeBins()
-        self.setBins()
-        self.aggregateData()
-        if self.excel:
-            self.writeExcel()
-        if self.plot:
-            self.plotData()
-        self.printRegions()
-
+	def run(self):
+		self.checkOptions()
+		for key, parser in self.parsers.iteritems():
+			self.setBins(key, parser)
+		self.printRegions()

 if __name__ == "__main__":
-
-    # parse command line
-    description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"
+
+	# parse command line
+	description = "Clusterize by Sliding Windows v1.0.2: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"

-    parser = OptionParser(description = description)
-    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
-    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
-    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
-    parser.add_option("-u", "--outputFormat", dest="outputFormat",  action="store",     default="gff",  type="string", help="format of the output file [format: transcript file format]")
-    parser.add_option("-s", "--size",        dest="size",           action="store",                     type="int",    help="size of the regions [compulsory] [format: int]")
-    parser.add_option("-e", "--overlap",     dest="overlap",        action="store",                     type="int",    help="overlap between two consecutive regions [compulsory] [format: int]")
-    parser.add_option("-m", "--normalize",   dest="normalize",      action="store_true", default=False,                help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
-    parser.add_option("-g", "--tag",         dest="tag",            action="store",      default=None,  type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
-    parser.add_option("-r", "--operation",   dest="operation",      action="store",      default=None,  type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
-    parser.add_option("-d", "--defaultValue",dest="defaultValue",   action="store",                     type="float",    help="default value for input tag [format: float]")
-    parser.add_option("-w", "--write",       dest="writeTag",       action="store",      default=None,  type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")
-    parser.add_option("-2", "--strands",     dest="strands",        action="store_true", default=False,                help="consider the two strands separately [format: bool] [default: false]")
-    parser.add_option("-p", "--plot",        dest="plot",           action="store",      default=None,  type="string", help="plot regions to the given file [format: output file in PNG format]")
-    parser.add_option("-x", "--excel",       dest="excel",          action="store",      default=None,  type="string", help="write an Excel file to the given file [format: output file in Excel format]")
-    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
-    (options, args) = parser.parse_args()
+	parser = OptionParser(description = description)
+	parser.add_option("-i", "--input",        dest="inputFileName",  action="store",					 type="string", help="input files, separated by commas [compulsory] [format: string]")
+	parser.add_option("-f", "--inputFormat",  dest="inputFormat",    action="store",					 type="string", help="format of the input file [compulsory] [format: transcript file format]")
+	parser.add_option("-o", "--output",	      dest="outputFileName", action="store",					 type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
+	parser.add_option("-u", "--outputFormat", dest="outputFormat",   action="store",      default="gff", type="string", help="format of the output file [format: transcript file format]")
+	parser.add_option("-s", "--size",         dest="size",	         action="store",					 type="int",	help="size of the regions [compulsory] [format: int]")
+	parser.add_option("-e", "--overlap",	  dest="overlap",        action="store",					 type="int",	help="overlap between two consecutive regions [compulsory] [format: int]")
+	parser.add_option("-g", "--tag",          dest="tag",            action="store",      default=None,  type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
+	parser.add_option("-r", "--operation",    dest="operation",      action="store",      default=None,  type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
+	parser.add_option("-d", "--defaultValue", dest="defaultValue",   action="store",					 type="float",  help="default value for input tag [format: float]")
+	parser.add_option("-w", "--write",	      dest="writeTag",       action="store",      default=None,  type="string", help="print the result in the given tags, separated by commas (default usually is 'nbElements') [format: string]")
+	parser.add_option("-2", "--strands",      dest="strands",		 action="store_true", default=False,                help="consider the two strands separately [format: bool] [default: false]")
+	parser.add_option("-v", "--verbosity",    dest="verbosity",	     action="store",      default=1,	 type="int",	help="trace level [format: int] [default: 1]")
+	(options, args) = parser.parse_args()

-    cbsw = ClusterizeBySlidingWindows(options.verbosity)
-    cbsw.setInputFile(options.inputFileName, options.inputFormat)
-    cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
-    cbsw.setWindowSize(options.size)
-    cbsw.setWindowOverlap(options.overlap)
-    cbsw.setTag(options.tag)
-    cbsw.setDefaultValue(options.defaultValue)
-    cbsw.setOperation(options.operation)
-    cbsw.setOutputTag(options.writeTag)
-    cbsw.setBothStrands(options.strands)
-    cbsw.setPlot(options.plot)
-    cbsw.setExcel(options.excel)
-    cbsw.run()
+	cbsw = ClusterizeBySlidingWindows(options.verbosity)
+	cbsw.setOperation(options.operation)
+	cbsw.setTag(options.tag)
+	cbsw.setOutputTag(None if options.writeTag is None else options.writeTag.split(","))
+	cbsw.setInputFile(options.inputFileName.split(","), options.inputFormat)
+	cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
+	cbsw.setWindowSize(options.size)
+	cbsw.setWindowOverlap(options.overlap)
+	cbsw.setDefaultValue(options.defaultValue)
+	cbsw.setBothStrands(options.strands)
+	cbsw.run()
Binary file SMART/Java/Python/misc/Progress.pyc has changed
Binary file SMART/Java/Python/misc/RPlotter.pyc has changed
Binary file SMART/Java/Python/misc/UnlimitedProgress.pyc has changed
Binary file SMART/Java/Python/misc/Utils.pyc has changed
Binary file SMART/Java/Python/misc/__init__.pyc has changed
Binary file SMART/Java/Python/mySql/MySqlExonTable.pyc has changed
Binary file SMART/Java/Python/mySql/MySqlTable.pyc has changed
Binary file SMART/Java/Python/mySql/MySqlTranscriptTable.pyc has changed
Binary file SMART/Java/Python/mySql/__init__.pyc has changed
Binary file SMART/Java/Python/ncList/NCIndex.pyc has changed
Binary file SMART/Java/Python/ncList/NCList.pyc has changed
Binary file SMART/Java/Python/ncList/NCListCursor.pyc has changed
Binary file SMART/Java/Python/ncList/NCListFilePickle.pyc has changed
Binary file SMART/Java/Python/ncList/__init__.pyc has changed
Binary file SMART/Java/Python/structure/Bins.pyc has changed
Binary file SMART/Java/Python/structure/Interval.pyc has changed
Binary file SMART/Java/Python/structure/Mapping.pyc has changed
Binary file SMART/Java/Python/structure/Sequence.pyc has changed
Binary file SMART/Java/Python/structure/SequenceList.pyc has changed
Binary file SMART/Java/Python/structure/SubMapping.pyc has changed
Binary file SMART/Java/Python/structure/Transcript.pyc has changed
Binary file SMART/Java/Python/structure/TranscriptContainer.pyc has changed
Binary file SMART/Java/Python/structure/TranscriptList.pyc has changed
Binary file SMART/Java/Python/structure/__init__.pyc has changed
Binary file SMART/Java/__init__.pyc has changed
Binary file SMART/__init__.pyc has changed
Binary file SMART/galaxy/.clusterizeBySlidingWindows.xml.swp has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/galaxy/.nfs0000000009a6075300000027	Thu Jan 30 08:54:40 2014 -0500
@@ -0,0 +1,137 @@
+<tool id="clusterizeBySlidingWindows" name="clusterize by sliding windows">
+	<description>Produces a GFF3 file that clusters a list of transcripts using a sliding window. Cluster the data into regions (defined by size and overlap with next region).</description>
+	<requirements>
+		<requirement type="set_environment">PYTHONPATH</requirement>
+	</requirements>
+	<command interpreter="python">
+		../Java/Python/clusterizeBySlidingWindows.py -i $formatType.inputFileName
+		#if $formatType.FormatInputFileName == 'bed':
+			-f bed
+		#elif $formatType.FormatInputFileName == 'gff':
+			-f gff
+		#elif $formatType.FormatInputFileName == 'gff2':
+			-f gff2
+		#elif $formatType.FormatInputFileName == 'gff3':
+			-f gff3
+		#elif $formatType.FormatInputFileName == 'sam':
+			-f sam
+		#elif $formatType.FormatInputFileName == 'gtf':
+			-f gtf
+		#end if
+		-s $size
+		-e $overlap
+		-o $outputFileGff
+		$strands
+
+		#if $OptionTag.tag == "Yes":
+			-g $OptionTag.value
+		#end if
+
+		#if $OptionsOperation.operation == "Yes":
+			-r $OptionsOperation.value
+		#end if
+
+		#if $OptionWriteTag.writeTag == "Yes":
+			-w $OptionWriteTag.value
+		#end if
+
+	</command>
+
+	<inputs>
+		<repeat name="inputFileNames" title="inputFileNames">
+			<conditional name="formatType">
+				<param name="FormatInputFileName" type="select" label="Input File Format">
+					<option value="bed">bed</option>
+					<option value="gff">gff</option>
+					<option value="gff2">gff2</option>
+					<option value="gff3">gff3</option>
+					<option value="sam">sam</option>
+					<option value="gtf">gtf</option>
+					<option value="bam">bam</option>
+				</param>
+				<when value="bed">
+					<param name="inputFileName" format="bed" type="data" label="Input File"/>
+				</when>
+				<when value="gff">
+					<param name="inputFileName" format="gff" type="data" label="Input File"/>
+				</when>
+				<when value="gff2">
+					<param name="inputFileName" format="gff2" type="data" label="Input File"/>
+				</when>
+				<when value="gff3">
+					<param name="inputFileName" format="gff3" type="data" label="Input File"/>
+				</when>
+				<when value="sam">
+					<param name="inputFileName" format="sam" type="data" label="Input File"/>
+				</when>
+				<when value="gtf">
+					<param name="inputFileName" format="gtf" type="data" label="Input File"/>
+				</when>
+				<when value="bam">
+					<param name="inputFileName" format="bam" type="data" label="Input File"/>
+				</when>
+			</conditional>
+		</inputFileNames>
+
+
+		<param name="size" type="text" value="50000" label="Size option" help="Size of the regions."/>
+		<param name="overlap" type="text" value="50" label="Overlap option" help="Overlap between two consecutive regions."/>
+		<param name="strands" type="boolean" truevalue="-2" falsevalue="" checked="false" label="Consider the two strands separately"/>
+
+		<conditional name="OptionTag">
+			<param name="tag" type="select" label="Use a given tag as input (instead of summing number of features)">
+				<option value="Yes">Yes</option>
+				<option value="No" selected="true">No</option>
+			</param>
+			<when value="Yes">
+				<param name="value" type="select" label="tag name"/>
+			</when>
+			<when value="No">
+			</when>
+		</conditional>
+
+		<conditional name="OptionsOperation">
+			<param name="operation" type="select" label="combine tag value with given operation">
+				<option value="Yes">Yes</option>
+				<option value="No" selected="true">No</option>
+			</param>
+			<when value="Yes">
+				<param name="value" type="select" label="operation" help="You can ONLY choose one of following operation : sum, avg, med, min, max.">
+					<option value="sum">sum</option>
+					<option value="avg">average</option>
+					<option value="med">median</option>
+					<option value="min">minimum</option>
+					<option value="max">maximum</option>
+				</param>
+			</when>
+			<when value="No">
+			</when>
+		</conditional>
+
+
+		<conditional name="OptionWriteTag">
+			<param name="writeTag" type="select" label="write a new tag in output file">
+				<option value="Yes">Yes</option>
+				<option value="No" selected="true">No</option>
+			</param>
+			<when value="Yes">
+				<param name="value" type="text" value="nbElements" label="write tag option" help="print the result in the given tag (default usually is 'nbElements')"/>
+			</when>
+			<when value="No">
+			</when>
+		</conditional>
+
+	</inputs>
+
+	<outputs>
+		<data name="outputFileGff" format="gff3"/>
+	</outputs>
+
+	<help>
+Sliding windows are a convenient way to clusterize data mapped on the genome. There are two important parameters of a sliding window: the size of the window and the size of the overlap.
+
+By default, sliding windows count the number of reads in each window. However, you can basically merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads.
+
+The output file is a GFF3 file, where each element is a window. There is a special tag for each window, whose name is **nbElements** if you counted the number of transcripts per sliding window. However, if you performed a **min** (resp. **max**, **sum**, **median**, **average**) operation on the tags **value** of the transcripts, then the tag of the window will be **minValue** (resp. **maxValue**, **sumValue**, **medValue**, **avgValue**). You can also specify the name of your tag (which is actually advised: **nbReadsInSample1** will always be more informative than **nbElements**).
+	</help>
+</tool>
--- a/SMART/galaxy/clusterizeBySlidingWindows.xml	Fri Jan 10 09:04:03 2014 -0500
+++ b/SMART/galaxy/clusterizeBySlidingWindows.xml	Thu Jan 30 08:54:40 2014 -0500
@@ -4,7 +4,8 @@
 		<requirement type="set_environment">PYTHONPATH</requirement>
 	</requirements>
 	<command interpreter="python">
-		../Java/Python/clusterizeBySlidingWindows.py -i $formatType.inputFileName
+		#set $inputFiles = ",".join(["%s" % (s) for s in $formatType.inputFileName])
+		../Java/Python/clusterizeBySlidingWindows.py -i $inputFiles
 		#if $formatType.FormatInputFileName == 'bed':
 			-f bed
 		#elif $formatType.FormatInputFileName == 'gff':
@@ -21,7 +22,6 @@
 		-s $size
 		-e $overlap
 		-o $outputFileGff
-		$normalize
 		$strands

 		#if $OptionTag.tag == "Yes":
@@ -33,14 +33,10 @@
 		#end if

 		#if $OptionWriteTag.writeTag == "Yes":
-			-w $OptionWriteTag.value
+			#set $outputTags = ",".join(["%s" % (t["value"]) for t in $OptionWriteTag.writeTags])
+			-w $outputTags
 		#end if

-		$strand
-		$plot $plotPng
-		$excel $excelOutput
-
-
 	</command>

 	<inputs>
@@ -52,31 +48,34 @@
 				<option value="gff3">gff3</option>
 				<option value="sam">sam</option>
 				<option value="gtf">gtf</option>
+				<option value="bam">bam</option>
 			</param>
 			<when value="bed">
-				<param name="inputFileName" format="bed" type="data" label="Input File"/>
+				<param name="inputFileName" format="bed" type="data" multiple="true" label="Input File"/>
 			</when>
 			<when value="gff">
-				<param name="inputFileName" format="gff" type="data" label="Input File"/>
+				<param name="inputFileName" format="gff" type="data" multiple="true" label="Input File"/>
 			</when>
 			<when value="gff2">
-				<param name="inputFileName" format="gff2" type="data" label="Input File"/>
+				<param name="inputFileName" format="gff2" type="data" multiple="true" label="Input File"/>
 			</when>
 			<when value="gff3">
-				<param name="inputFileName" format="gff3" type="data" label="Input File"/>
+				<param name="inputFileName" format="gff3" type="data" multiple="true" label="Input File"/>
 			</when>
 			<when value="sam">
-				<param name="inputFileName" format="sam" type="data" label="Input File"/>
+				<param name="inputFileName" format="sam" type="data" multiple="true" label="Input File"/>
 			</when>
 			<when value="gtf">
-				<param name="inputFileName" format="gtf" type="data" label="Input File"/>
+				<param name="inputFileName" format="gtf" type="data" multiple="true" label="Input File"/>
+			</when>
+			<when value="bam">
+				<param name="inputFileName" format="bam" type="data" multiple="true" label="Input File"/>
 			</when>
 		</conditional>


 		<param name="size" type="text" value="50000" label="Size option" help="Size of the regions."/>
 		<param name="overlap" type="text" value="50" label="Overlap option" help="Overlap between two consecutive regions."/>
-		<param name="normalize" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Normalize option for only GFF3 file format" help="(only work if the tag nbOccurrences is set)"/>
 		<param name="strands" type="boolean" truevalue="-2" falsevalue="" checked="false" label="Consider the two strands separately"/>

 		<conditional name="OptionTag">
@@ -91,7 +90,6 @@
 			</when>
 		</conditional>

-
 		<conditional name="OptionsOperation">
 			<param name="operation" type="select" label="combine tag value with given operation">
 				<option value="Yes">Yes</option>
@@ -110,14 +108,15 @@
 			</when>
 		</conditional>

-
 		<conditional name="OptionWriteTag">
-			<param name="writeTag" type="select" label="write a new tag in output file">
+			<param name="writeTag" type="select" label="use given tag in output file">
 				<option value="Yes">Yes</option>
 				<option value="No" selected="true">No</option>
 			</param>
 			<when value="Yes">
-				<param name="value" type="text" value="nbElements" label="write tag option" help="print the result in the given tag (default usually is 'nbElements')"/>
+				<repeat name="writeTags" title="Output Tag" min="1">
+					<param name="value" type="text" value="nbElements" label="write tag option" help="print the result in the given tag (default usually is 'nbElements')"/>
+				</repeat>
 			</when>
 			<when value="No">
 			</when>
@@ -132,10 +131,8 @@
 	<help>
 Sliding windows are a convenient ways to clusterize data mapped on the genome. There are two important parameters of a sliding window: the size of the window and the size of the overlap.

-By default, sliding windows count the number of reads in each window. However, you can basically merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads.
+By default, sliding windows count the number of reads in each window of each input file. However, you can merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads.

 The output file is a GFF3 file, where each element is a window. There is a special tag for each window, whose name is **nbElements** if you counted the number of transcripts per sliding window. However, if you performed a **min** (resp. **max**, **sum**, **median**, **average**) operation on the tags **value** of the transcripts, then the tag of the window will be **minValue** (resp. **maxValue**, **sumValue**, **medValue**, **avgValue**). You can also specify the name of your tag (which is actually advised: **nbReadsInSample1** will always be more informative than **nbElements**).
-
-You also have different option, which can select the *n* % highest regions, or the regions with at least *n* features in it, or even the regions with at least *n* unique features. This last option is useful when you want to cluster the reads which have mapped only once, for instance.
 	</help>
 </tool>
Binary file commons/__init__.pyc has changed
Binary file commons/core/__init__.pyc has changed
Binary file commons/core/checker/RepetException.pyc has changed
Binary file commons/core/checker/__init__.pyc has changed
Binary file commons/core/coord/Align.pyc has changed
Binary file commons/core/coord/Map.pyc has changed
Binary file commons/core/coord/Range.pyc has changed
Binary file commons/core/coord/__init__.pyc has changed
Binary file commons/core/parsing/AxtParser.pyc has changed
Binary file commons/core/parsing/BamParser.pyc has changed
Binary file commons/core/parsing/BedParser.pyc has changed
Binary file commons/core/parsing/BlastParser.pyc has changed
Binary file commons/core/parsing/BowtieParser.pyc has changed
Binary file commons/core/parsing/CoordsParser.pyc has changed
Binary file commons/core/parsing/ElandParser.pyc has changed
Binary file commons/core/parsing/ExoParser.pyc has changed
Binary file commons/core/parsing/FastaParser.pyc has changed
Binary file commons/core/parsing/FastqParser.pyc has changed
Binary file commons/core/parsing/GffParser.pyc has changed
Binary file commons/core/parsing/GtfParser.pyc has changed
Binary file commons/core/parsing/MapParser.pyc has changed
Binary file commons/core/parsing/MapperParser.pyc has changed
Binary file commons/core/parsing/MaqParser.pyc has changed
Binary file commons/core/parsing/NCListParser.pyc has changed
Binary file commons/core/parsing/ParserChooser.pyc has changed
Binary file commons/core/parsing/PklParser.pyc has changed
Binary file commons/core/parsing/PslParser.pyc has changed
Binary file commons/core/parsing/RmapParser.pyc has changed
Binary file commons/core/parsing/SamParser.pyc has changed
Binary file commons/core/parsing/SeqmapParser.pyc has changed
Binary file commons/core/parsing/SequenceListParser.pyc has changed
Binary file commons/core/parsing/ShrimpParser.pyc has changed
Binary file commons/core/parsing/Soap2Parser.pyc has changed
Binary file commons/core/parsing/SoapParser.pyc has changed
Binary file commons/core/parsing/TranscriptListParser.pyc has changed
Binary file commons/core/parsing/WigParser.pyc has changed
Binary file commons/core/parsing/__init__.pyc has changed
Binary file commons/core/seq/Bioseq.pyc has changed
Binary file commons/core/seq/__init__.pyc has changed
Binary file commons/core/writer/BedWriter.pyc has changed
Binary file commons/core/writer/CsvWriter.pyc has changed
Binary file commons/core/writer/EmblWriter.pyc has changed
Binary file commons/core/writer/FastaWriter.pyc has changed
Binary file commons/core/writer/FastqWriter.pyc has changed
Binary file commons/core/writer/GbWriter.pyc has changed
Binary file commons/core/writer/Gff2Writer.pyc has changed
Binary file commons/core/writer/Gff3Writer.pyc has changed
Binary file commons/core/writer/GtfWriter.pyc has changed
Binary file commons/core/writer/MapWriter.pyc has changed
Binary file commons/core/writer/MySqlTranscriptWriter.pyc has changed
Binary file commons/core/writer/SamWriter.pyc has changed
Binary file commons/core/writer/SequenceListWriter.pyc has changed
Binary file commons/core/writer/TranscriptListWriter.pyc has changed
Binary file commons/core/writer/UcscWriter.pyc has changed
Binary file commons/core/writer/WigWriter.pyc has changed
Binary file commons/core/writer/WriterChooser.pyc has changed
Binary file commons/core/writer/__init__.pyc has changed