Mercurial > repos > davidvanzessen > baseline_bayesian_estimation

--- a/Baseline_Functions.r	Tue Jul 15 09:00:24 2014 -0400
+++ b/Baseline_Functions.r	Wed Aug 13 07:32:38 2014 -0400
@@ -361,6 +361,7 @@
     grepResults = gregexpr("-*",x)
     grepResultsPos = unlist(grepResults)
     grepResultsLen =  attr(grepResults[[1]],"match.length")
+    print(paste("x = '", x, "'", sep=""))
     x = s2c(x)
     if(x[1]=="-"){
       x[1:grepResultsLen[1]] = "N"
--- a/baseline.xml	Tue Jul 15 09:00:24 2014 -0400
+++ b/baseline.xml	Wed Aug 13 07:32:38 2014 -0400
@@ -1,10 +1,24 @@
 <tool id="baseline_bayesian_estimation" name="Baseline" version="1.0">
 	<description>Bayesian Estimation of Antigen-Driven Selection</description>
 	<command interpreter="bash">
-		wrapper.sh $ss $species $substitution $mutability $clonal $fixindels "$boundaries" $in_file $out_file.files_path result $out_file $out_file2
+		wrapper.sh $ss $species $substitution $mutability $clonal $fixindels "$boundaries" "
+		#for $i, $input in enumerate($inputs)
+ ${input.in_file}
+		#end for
+		"
+		"
+		#for $i, $input in enumerate($inputs)
+ ${input.id}
+		#end for
+		"
+    $reference $out_file
 	</command>
 	<inputs>
-		<param name="in_file" format="fasta" type="data" label="Data to Process" />
+		<repeat name="inputs" title="inputs" min="1" default="1">
+			<param name="in_file" type="data" label="Input excel or IMGT zip file" />
+			<param name="id" type="text" label="ID (alpha-numeric, no spaces)" />
+		</repeat>
+		<param name="reference" type="data" format="fasta" label="Reference fasta file" />
 		<param name="ss" type="select" label="Selection Statistic">
 			<option value="1">Focused</option>
 			<option value="2">Local</option>
@@ -14,13 +28,13 @@
 			<option value="2">Mouse</option>
 		</param>
 		<param name="substitution" type="select" label="Substitution Model">
+			<option value="1">Smith DS et al. 1996</option>
 			<option value="0">Uniform substitution</option>
-			<option value="1">Smith DS et al. 1996</option>
 			<option value="5">FiveS</option>
 		</param>
 		<param name="mutability" type="select" label="Mutability Model">
+			<option value="1">Tri-nucleotide (Shapiro GS et al. 2002)</option>
 			<option value="0">Uniform mutability</option>
-			<option value="1">Tri-nucleotide (Shapiro GS et al. 2002)</option>
 			<option value="5">FiveS</option>
 		</param>
 		<param name="clonal" type="select" label="Sequences are clonal">
@@ -33,13 +47,12 @@
 			<option value="1">Try and fix Indels</option>
 		</param>
 		<param name="boundaries" type="select" label="FWR/CDR3 Boundaries">
+			<option value="1:26:38:55:65:104:-">IMGT® No CDR3</option>
 			<option value="1:26:38:55:65:104:116">IMGT®</option>
-			<option value="1:26:38:55:65:104:-">IMGT® No CDR3</option>
 		</param>
 	</inputs>
 	<outputs>
-		<data format="tabular" name="out_file" label = "${tool.name} on ${on_string}: result"/>
-		<data format="data" name="out_file2" label = "${tool.name} on ${on_string}: RData"/>
+		<data format="pdf" name="out_file" label = "Baseline on ${on_string}"/>
 	</outputs>
 	<help>
 			Gur Yaari; Mohamed Uduman; Steven H. Kleinstein. Quantifying selection in high-throughput Immunoglobulin sequencing data sets. Nucleic Acids Res. 2012 May 27.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/comparePDFs.r	Wed Aug 13 07:32:38 2014 -0400
@@ -0,0 +1,225 @@
+options("warn"=-1)
+
+#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r
+# Compute p-value of two distributions
+compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){
+#print(c(length(dens1),length(dens2)))
+if(length(dens1)>1 & length(dens2)>1 ){
+	dens1<-dens1/sum(dens1)
+	dens2<-dens2/sum(dens2)
+	cum2 <- cumsum(dens2)-dens2/2
+	tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))
+	#print(tmp)
+	if(tmp>0.5)tmp<-tmp-1
+	return( tmp )
+	}
+	else {
+	return(NA)
+	}
+	#return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)
+}
+
+
+require("grid")
+arg <- commandArgs(TRUE)
+#arg <- c("300143","4","5")
+arg[!arg=="clonal"]
+input <- arg[1]
+output <- arg[2]
+rowIDs <- as.numeric(  sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } )  )
+
+numbSeqs = length(rowIDs)
+
+if ( is.na(rowIDs[1]) | numbSeqs>10 ) {
+  stop( paste("Error: Please select between one and 10 seqeunces to compare.") )
+}
+
+#load( paste("output/",sessionID,".RData",sep="") )
+load( input )
+#input
+
+xMarks = seq(-20,20,length.out=4001)
+
+plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){
+  yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1
+
+  if(length(xlim==2)){
+    xMin=xlim[1]
+    xMax=xlim[2]
+  } else {
+    xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
+    xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
+    xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
+    xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
+
+    xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]
+    xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]
+    xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]
+    xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]
+
+    xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)
+    xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)
+  }
+
+  sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x
+  grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))
+  x <- sigma
+  pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))
+  #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))
+  pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))
+  grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")
+  grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")
+  grid.rect()
+  grid.xaxis(gp = gpar(cex=cex/1.1))
+  yticks = pretty(c(-yMax,yMax),8)
+  yticks = yticks[yticks>(-yMax) & yticks<(yMax)]
+  grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))
+  if(length(listPDFs[pdf1][[1]][["CDR"]])>1){
+    ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))
+  }
+  if(length(listPDFs[pdf1][[1]][["FWR"]])>1){
+    yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))
+   }
+
+  if(length(listPDFs[pdf2][[1]][["CDR"]])>1){
+    ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd=2,lty=2))
+  }
+  if(length(listPDFs[pdf2][[1]][["FWR"]])>1){
+    yfwr2<-approx(xMarks,listPDFs[pdf2][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y
+    grid.lines(unit(x,"native"), unit(-yfwr2,"native"),gp=gpar(col=4,lwd=2,lty=2))
+   }
+
+  grid.lines(unit(c(0,1),"npc"), unit(c(0.5,0.5),"npc"),gp=gpar(col=1))
+  grid.lines(unit(c(0,0),"native"), unit(c(0,1),"npc"),gp=gpar(col=1,lwd=1,lty=3))
+
+  grid.text("Density", x = unit(-2.5, "lines"), rot = 90,gp = gpar(cex=cex))
+  grid.text( expression(paste("Selection Strength (", Sigma, ")", sep="")) , y = unit(-2.5, "lines"),gp = gpar(cex=cex))
+
+  if(pdf1==pdf2 & length(listPDFs[pdf2][[1]][["FWR"]])>1 & length(listPDFs[pdf2][[1]][["CDR"]])>1 ){
+    pCDRFWR = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf1]][["FWR"]])
+    pval = formatC(as.numeric(pCDRFWR),digits=3)
+    grid.text( substitute(expression(paste(P[CDR/FWR], "=", x, sep="")),list(x=pval))[[2]] , x = unit(0.02, "npc"),y = unit(0.98, "npc"),just=c("left", "top"),gp = gpar(cex=cex*1.2))
+  }
+  grid.text(paste("CDR"), x = unit(0.98, "npc"),y = unit(0.98, "npc"),just=c("right", "top"),gp = gpar(cex=cex*1.5))
+  grid.text(paste("FWR"), x = unit(0.98, "npc"),y = unit(0.02, "npc"),just=c("right", "bottom"),gp = gpar(cex=cex*1.5))
+  popViewport(2)
+}
+#plot_grid_s(1)
+
+
+p2col<-function(p=0.01){
+  breaks=c(-.51,-0.1,-.05,-0.01,-0.005,0,0.005,0.01,0.05,0.1,0.51)
+  i<-findInterval(p,breaks)
+  cols = c( rgb(0.8,1,0.8), rgb(0.6,1,0.6), rgb(0.4,1,0.4), rgb(0.2,1,0.2) , rgb(0,1,0),
+            rgb(1,0,0), rgb(1,.2,.2), rgb(1,.4,.4), rgb(1,.6,.6) , rgb(1,.8,.8) )
+  return(cols[i])
+}
+
+
+plot_pvals<-function(pdf1,pdf2,cex=1,upper=TRUE){
+  if(upper){
+    pCDR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf2]][["FWR"]])
+    pFWR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["FWR"]], dens2=listPDFs[[pdf2]][["FWR"]])
+    pFWR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["FWR"]])
+    pCDR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["CDR"]])
+    grid.polygon(c(0.5,0.5,1,1),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1FWR2),fill=p2col(pFWR1FWR2)),default.units="npc")
+    grid.polygon(c(0.5,0.5,1,1),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1FWR2),fill=p2col(pCDR1FWR2)),default.units="npc")
+    grid.polygon(c(0.5,0.5,0,0),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1CDR2),fill=p2col(pCDR1CDR2)),default.units="npc")
+    grid.polygon(c(0.5,0.5,0,0),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1CDR2),fill=p2col(pFWR1CDR2)),default.units="npc")
+
+    grid.lines(c(0,1),0.5,gp=gpar(lty=2,col=gray(0.925)))
+    grid.lines(0.5,c(0,1),gp=gpar(lty=2,col=gray(0.925)))
+
+    grid.text(formatC(as.numeric(pFWR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+    grid.text(formatC(as.numeric(pCDR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+    grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+    grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))
+
+
+ #   grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))
+ #   grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))
+  }
+  else{
+  }
+}
+
+
+##################################################################################
+################## The whole OCD's matrix ########################################
+##################################################################################
+
+#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
+pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)
+
+pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))
+
+for( seqOne in 1:numbSeqs+1){
+  pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))
+  if(seqOne>2){
+    grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
+    grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
+    grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")
+
+    grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")
+    grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")
+  }
+  grid.rect(gp = gpar(col=grey(0.9)))
+  grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")
+  popViewport(1)
+}
+
+for( seqOne in 1:numbSeqs+1){
+  pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))
+  if(seqOne<=numbSeqs){
+    grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")
+    grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")
+    grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")
+    grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)
+    grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)
+  }
+  grid.rect(gp = gpar(col=grey(0.9)))
+  grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")
+  popViewport(1)
+}
+
+for( seqOne in 1:numbSeqs+1){
+  for(seqTwo in 1:numbSeqs+1){
+    pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))
+    if(seqTwo>seqOne){
+      plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)
+      grid.rect()
+    }
+    popViewport(1)
+  }
+}
+
+
+xMin=0
+xMax=0.01
+for(pdf1 in rowIDs){
+  xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]
+  xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]
+  xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]
+  xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]
+  xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)
+  xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)
+}
+
+
+
+for(i in 1:numbSeqs+1){
+  for(j in (i-1):numbSeqs){
+    pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))
+    grid.rect()
+    plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)
+    popViewport(1)
+  }
+}
+
+dev.off()
+
+cat("Success", paste(rowIDs,collapse="_"),sep=":")
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script_imgt.py	Wed Aug 13 07:32:38 2014 -0400
@@ -0,0 +1,75 @@
+import xlrd
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+
+args = parser.parse_args()
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+	currentSeq = ""
+	currentId = ""
+	for line in ref.readlines():
+		if line[0] is ">":
+			if currentSeq is not "" and currentId is not "":
+				refdic[currentId[1:]] = currentSeq
+			currentId = line.rstrip()
+			currentSeq = ""
+		else:
+			currentSeq += line.rstrip()
+	refdic[currentId[1:]] = currentSeq
+
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+#						r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+#						r"(IGKV[0-3]D?-[0-9]{1,2})",
+#						r"(IGLV[0-9]-[0-9]{1,2})",
+#						r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+#						r"(TRGV[234589])",
+#						r"(TRDV[1-3])"]
+
+#vPattern = re.compile(r"|".join(vPattern))
+vPattern = re.compile("|".join(vPattern))
+
+def filterGene(s, pattern):
+    if type(s) is not str:
+        return None
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return None
+
+
+
+currentSeq = ""
+currentId = ""
+with open(args.input, 'r') as i:
+	with open(args.output, 'a') as o:
+		o.write(">>>" + args.id + "\n")
+		outputdic = dict()
+		for line in i.readlines()[1:]:
+			linesplt = line.split("\t")
+			ref = filterGene(linesplt[1], vPattern)
+			if not ref or not linesplt[2].rstrip():
+				continue
+			if ref in outputdic:
+				outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+			else:
+				outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+		#print outputdic
+
+		for k in outputdic.keys():
+			if k in refdic:
+				o.write(">>" + k + "\n")
+				o.write(refdic[k] + "\n")
+				for seq in outputdic[k]:
+					#print seq
+					o.write(">" + seq[0] + "\n")
+					o.write(seq[1] + "\n")
+			else:
+				print k + " not in reference, skipping " + k
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script_xlsx.py	Wed Aug 13 07:32:38 2014 -0400
@@ -0,0 +1,58 @@
+import xlrd
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+gene_column = 6
+id_column = 7
+seq_column = 8
+LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
+
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+	currentSeq = ""
+	currentId = ""
+	for line in ref.readlines():
+		if line[0] is ">":
+			if currentSeq is not "" and currentId is not "":
+				refdic[currentId[1:]] = currentSeq
+			currentId = line.rstrip()
+			currentSeq = ""
+		else:
+			currentSeq += line.rstrip()
+	refdic[currentId[1:]] = currentSeq
+
+currentSeq = ""
+currentId = ""
+with xlrd.open_workbook(args.input, 'r') as wb:
+	with open(args.output, 'a') as o:
+		for sheet in wb.sheets():
+			if sheet.cell(1,gene_column).value.find("IGHV") < 0:
+				print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
+				continue
+			o.write(">>>" + sheet.name + "\n")
+			outputdic = dict()
+			for rowindex in range(1, sheet.nrows):
+				ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
+				if ref in outputdic:
+					outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+				else:
+					outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+			#print outputdic
+
+			for k in outputdic.keys():
+				if k in refdic:
+					o.write(">>" + k + "\n")
+					o.write(refdic[k] + "\n")
+					for seq in outputdic[k]:
+						#print seq
+						o.write(">" + seq[0] + "\n")
+						o.write(seq[1] + "\n")
+				else:
+					print k + " not in reference, skipping " + k
--- a/wrapper.sh	Tue Jul 15 09:00:24 2014 -0400
+++ b/wrapper.sh	Wed Aug 13 07:32:38 2014 -0400
@@ -8,26 +8,62 @@
 clonal=$5
 fixIndels=$6
 region=$7
-input=$8
-outDir=$9
-#mkdir $outDir
-outID=${10}
-outFile=${11}
-outFile2=${12}
+inputs=$8
+inputs=($inputs)
+IDs=$9
+IDs=($IDs)
+ref=${10}
+output=${11}
+outID="result"
+
+echo "testID = $testID"
+echo "species = $species"
+echo "substitutionModel = $substitutionModel"
+echo "mutabilityModel = $mutabilityModel"
+echo "clonal = $clonal"
+echo "fixIndels = $fixIndels"
+echo "region = $region"
+echo "inputs = ${inputs[@]}"
+echo "IDs = ${IDs[@]}"
+echo "ref = $ref"
+echo "output = $output"
+echo "outID = $outID"
+
+fasta="$PWD/baseline.fasta"
+
+
+count=0
+for current in ${inputs[@]}
+do
+	f=$(file $current)
+	zipType="Zip archive"
+	if [[ "$f" == *"$zipType"* ]]
+	then
+		id=${IDs[$count]}
+		echo "id=$id"
+		unzip $current -d $PWD/$id/ >> $PWD/unziplog.log
+		filename="$PWD/gappednt_${id}.txt"
+		cat $PWD/$id/*/2_* | cut -f2,4,7 > $filename
+		python $dir/script_imgt.py --input $filename --ref $ref --output $fasta --id $id
+	else
+		python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
+	fi
+	count=$((count+1))
+done
+workdir="$PWD"
+cd $dir
+Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1
+
+echo "$workdir/${outID}.txt"
+
+rows=`tail -n +2 $workdir/${outID}.txt | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '`
+rows=($rows)
+unset rows[${#rows[@]}-1]

 cd $dir
-Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $input $dir/ $outID 2>&1
+Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1

-mv $dir/${outID}.txt $outFile
-mv $dir/${outID}.RData $outFile2

-#tail -n+2 $outDir/${outID}.txt > $outDir/tmp.txt
-#echo "<html><table border='1'><thead>" >> $outFile
-#echo "<tr><th>Type</th><th>ID</th><th>Observed_CDR_R</th><th>Observed_CDR_S</th><th>Observed_FWR_R</th><th>Observed_FWR_S</th><th>Expected_CDR_R</th><th>Expected_CDR_S</th><th>Expected_FWR_R</th><th>Expected_FWR_S</th><th>Focused_Sigma_CDR</th><th>Focused_CIlower_CDR</th><th>Focused_CIupper_CDR</th><th>Focused_Sigma_FWR</th><th>Focused_CIlower_FWR</th><th>Focused_CIupper_FWR</th><th>Focused_P_CDR</th><th>Focused_P_FWR</th></tr></thead><tbody>" >> $outFile
-#while read Type ID Observed_CDR_R Observed_CDR_S Observed_FWR_R Observed_FWR_S Expected_CDR_R Expected_CDR_S Expected_FWR_R Expected_FWR_S Focused_Sigma_CDR Focused_CIlower_CDR Focused_CIupper_CDR Focused_Sigma_FWR Focused_CIlower_FWR Focused_CIupper_FWR Focused_P_CDR Focused_P_FWR
-#do
-#	echo "<tr><td>$Type</td><td>$ID</td><td>$Observed_CDR_R</td><td>$Observed_CDR_S</td><td>$Observed_FWR_R</td><td>$Observed_FWR_S</td><td>$Expected_CDR_R</td><td>$Expected_CDR_S</td><td>$Expected_FWR_R</td><td>$Expected_FWR_S</td><td>$Focused_Sigma_CDR</td><td>$Focused_CIlower_CDR</td><td>$Focused_CIupper_CDR</td><td>$Focused_Sigma_FWR</td><td>$Focused_CIlower_FWR</td><td>$Focused_CIupper_FWR</td><td>$Focused_P_CDR</td><td>$Focused_P_FWR</td></tr>" >> $outFile
-#done < $outDir/tmp.txt
-#echo "</tbody></table></html>" >> $outFile
-#rm $outDir/tmp.txt
+

+