Mercurial > repos > davidvanzessen > baseline_bayesian_estimation
changeset 2:94fada165724 draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 13 Aug 2014 07:32:38 -0400 |
parents | e8dd8474aecb |
children | 38c038c17d0c |
files | Baseline_Functions.r baseline.xml comparePDFs.r script_imgt.py script_xlsx.py wrapper.sh |
diffstat | 6 files changed, 433 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/Baseline_Functions.r Tue Jul 15 09:00:24 2014 -0400 +++ b/Baseline_Functions.r Wed Aug 13 07:32:38 2014 -0400 @@ -361,6 +361,7 @@ grepResults = gregexpr("-*",x) grepResultsPos = unlist(grepResults) grepResultsLen = attr(grepResults[[1]],"match.length") + print(paste("x = '", x, "'", sep="")) x = s2c(x) if(x[1]=="-"){ x[1:grepResultsLen[1]] = "N"
--- a/baseline.xml Tue Jul 15 09:00:24 2014 -0400 +++ b/baseline.xml Wed Aug 13 07:32:38 2014 -0400 @@ -1,10 +1,24 @@ <tool id="baseline_bayesian_estimation" name="Baseline" version="1.0"> <description>Bayesian Estimation of Antigen-Driven Selection</description> <command interpreter="bash"> - wrapper.sh $ss $species $substitution $mutability $clonal $fixindels "$boundaries" $in_file $out_file.files_path result $out_file $out_file2 + wrapper.sh $ss $species $substitution $mutability $clonal $fixindels "$boundaries" " + #for $i, $input in enumerate($inputs) + ${input.in_file} + #end for + " + " + #for $i, $input in enumerate($inputs) + ${input.id} + #end for + " + $reference $out_file </command> <inputs> - <param name="in_file" format="fasta" type="data" label="Data to Process" /> + <repeat name="inputs" title="inputs" min="1" default="1"> + <param name="in_file" type="data" label="Input excel or IMGT zip file" /> + <param name="id" type="text" label="ID (alpha-numeric, no spaces)" /> + </repeat> + <param name="reference" type="data" format="fasta" label="Reference fasta file" /> <param name="ss" type="select" label="Selection Statistic"> <option value="1">Focused</option> <option value="2">Local</option> @@ -14,13 +28,13 @@ <option value="2">Mouse</option> </param> <param name="substitution" type="select" label="Substitution Model"> + <option value="1">Smith DS et al. 1996</option> <option value="0">Uniform substitution</option> - <option value="1">Smith DS et al. 1996</option> <option value="5">FiveS</option> </param> <param name="mutability" type="select" label="Mutability Model"> + <option value="1">Tri-nucleotide (Shapiro GS et al. 2002)</option> <option value="0">Uniform mutability</option> - <option value="1">Tri-nucleotide (Shapiro GS et al. 2002)</option> <option value="5">FiveS</option> </param> <param name="clonal" type="select" label="Sequences are clonal"> @@ -33,13 +47,12 @@ <option value="1">Try and fix Indels</option> </param> <param name="boundaries" type="select" label="FWR/CDR3 Boundaries"> + <option value="1:26:38:55:65:104:-">IMGT® No CDR3</option> <option value="1:26:38:55:65:104:116">IMGT®</option> - <option value="1:26:38:55:65:104:-">IMGT® No CDR3</option> </param> </inputs> <outputs> - <data format="tabular" name="out_file" label = "${tool.name} on ${on_string}: result"/> - <data format="data" name="out_file2" label = "${tool.name} on ${on_string}: RData"/> + <data format="pdf" name="out_file" label = "Baseline on ${on_string}"/> </outputs> <help> Gur Yaari; Mohamed Uduman; Steven H. Kleinstein. Quantifying selection in high-throughput Immunoglobulin sequencing data sets. Nucleic Acids Res. 2012 May 27.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/comparePDFs.r Wed Aug 13 07:32:38 2014 -0400 @@ -0,0 +1,225 @@ +options("warn"=-1) + +#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r +# Compute p-value of two distributions +compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){ +#print(c(length(dens1),length(dens2))) +if(length(dens1)>1 & length(dens2)>1 ){ + dens1<-dens1/sum(dens1) + dens2<-dens2/sum(dens2) + cum2 <- cumsum(dens2)-dens2/2 + tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i]))) + #print(tmp) + if(tmp>0.5)tmp<-tmp-1 + return( tmp ) + } + else { + return(NA) + } + #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N) +} + + +require("grid") +arg <- commandArgs(TRUE) +#arg <- c("300143","4","5") +arg[!arg=="clonal"] +input <- arg[1] +output <- arg[2] +rowIDs <- as.numeric( sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } ) ) + +numbSeqs = length(rowIDs) + +if ( is.na(rowIDs[1]) | numbSeqs>10 ) { + stop( paste("Error: Please select between one and 10 seqeunces to compare.") ) +} + +#load( paste("output/",sessionID,".RData",sep="") ) +load( input ) +#input + +xMarks = seq(-20,20,length.out=4001) + +plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){ + yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1 + + if(length(xlim==2)){ + xMin=xlim[1] + xMax=xlim[2] + } else { + xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1] + xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1] + xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])] + xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])] + + xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1] + xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1] + xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])] + xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])] + + xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE) + xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE) + } + + sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x + grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex)) + x <- sigma + pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc")) + #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex)) + pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05))) + grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc") + grid.rect() + grid.xaxis(gp = gpar(cex=cex/1.1)) + yticks = pretty(c(-yMax,yMax),8) + yticks = yticks[yticks>(-yMax) & yticks<(yMax)] + grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1)) + if(length(listPDFs[pdf1][[1]][["CDR"]])>1){ + ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2)) + } + if(length(listPDFs[pdf1][[1]][["FWR"]])>1){ + yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2)) + } + + if(length(listPDFs[pdf2][[1]][["CDR"]])>1){ + ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd=2,lty=2)) + } + if(length(listPDFs[pdf2][[1]][["FWR"]])>1){ + yfwr2<-approx(xMarks,listPDFs[pdf2][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y + grid.lines(unit(x,"native"), unit(-yfwr2,"native"),gp=gpar(col=4,lwd=2,lty=2)) + } + + grid.lines(unit(c(0,1),"npc"), unit(c(0.5,0.5),"npc"),gp=gpar(col=1)) + grid.lines(unit(c(0,0),"native"), unit(c(0,1),"npc"),gp=gpar(col=1,lwd=1,lty=3)) + + grid.text("Density", x = unit(-2.5, "lines"), rot = 90,gp = gpar(cex=cex)) + grid.text( expression(paste("Selection Strength (", Sigma, ")", sep="")) , y = unit(-2.5, "lines"),gp = gpar(cex=cex)) + + if(pdf1==pdf2 & length(listPDFs[pdf2][[1]][["FWR"]])>1 & length(listPDFs[pdf2][[1]][["CDR"]])>1 ){ + pCDRFWR = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf1]][["FWR"]]) + pval = formatC(as.numeric(pCDRFWR),digits=3) + grid.text( substitute(expression(paste(P[CDR/FWR], "=", x, sep="")),list(x=pval))[[2]] , x = unit(0.02, "npc"),y = unit(0.98, "npc"),just=c("left", "top"),gp = gpar(cex=cex*1.2)) + } + grid.text(paste("CDR"), x = unit(0.98, "npc"),y = unit(0.98, "npc"),just=c("right", "top"),gp = gpar(cex=cex*1.5)) + grid.text(paste("FWR"), x = unit(0.98, "npc"),y = unit(0.02, "npc"),just=c("right", "bottom"),gp = gpar(cex=cex*1.5)) + popViewport(2) +} +#plot_grid_s(1) + + +p2col<-function(p=0.01){ + breaks=c(-.51,-0.1,-.05,-0.01,-0.005,0,0.005,0.01,0.05,0.1,0.51) + i<-findInterval(p,breaks) + cols = c( rgb(0.8,1,0.8), rgb(0.6,1,0.6), rgb(0.4,1,0.4), rgb(0.2,1,0.2) , rgb(0,1,0), + rgb(1,0,0), rgb(1,.2,.2), rgb(1,.4,.4), rgb(1,.6,.6) , rgb(1,.8,.8) ) + return(cols[i]) +} + + +plot_pvals<-function(pdf1,pdf2,cex=1,upper=TRUE){ + if(upper){ + pCDR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["CDR"]], dens2=listPDFs[[pdf2]][["FWR"]]) + pFWR1FWR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens1=listPDFs[[pdf1]][["FWR"]], dens2=listPDFs[[pdf2]][["FWR"]]) + pFWR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["FWR"]]) + pCDR1CDR2 = compareTwoDistsFaster(sigma_S=xMarks, N=10000, dens2=listPDFs[[pdf2]][["CDR"]], dens1=listPDFs[[pdf1]][["CDR"]]) + grid.polygon(c(0.5,0.5,1,1),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1FWR2),fill=p2col(pFWR1FWR2)),default.units="npc") + grid.polygon(c(0.5,0.5,1,1),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1FWR2),fill=p2col(pCDR1FWR2)),default.units="npc") + grid.polygon(c(0.5,0.5,0,0),c(1,0.5,0.5,1),gp=gpar(col=p2col(pCDR1CDR2),fill=p2col(pCDR1CDR2)),default.units="npc") + grid.polygon(c(0.5,0.5,0,0),c(0,0.5,0.5,0),gp=gpar(col=p2col(pFWR1CDR2),fill=p2col(pFWR1CDR2)),default.units="npc") + + grid.lines(c(0,1),0.5,gp=gpar(lty=2,col=gray(0.925))) + grid.lines(0.5,c(0,1),gp=gpar(lty=2,col=gray(0.925))) + + grid.text(formatC(as.numeric(pFWR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pCDR1FWR2),digits=3), x = unit(0.75, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex)) + + + # grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex)) + # grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex)) + } + else{ + } +} + + +################################################################################## +################## The whole OCD's matrix ######################################## +################################################################################## + +#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) +pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) + +pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null"))))) + +for( seqOne in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1)) + if(seqOne>2){ + grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc") + grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc") + + grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center") + grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center") + } + grid.rect(gp = gpar(col=grey(0.9))) + grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center") + popViewport(1) +} + +for( seqOne in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1)) + if(seqOne<=numbSeqs){ + grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc") + grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc") + grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc") + grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270) + grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270) + } + grid.rect(gp = gpar(col=grey(0.9))) + grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center") + popViewport(1) +} + +for( seqOne in 1:numbSeqs+1){ + for(seqTwo in 1:numbSeqs+1){ + pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne)) + if(seqTwo>seqOne){ + plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2) + grid.rect() + } + popViewport(1) + } +} + + +xMin=0 +xMax=0.01 +for(pdf1 in rowIDs){ + xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1] + xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1] + xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])] + xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])] + xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE) + xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE) +} + + + +for(i in 1:numbSeqs+1){ + for(j in (i-1):numbSeqs){ + pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1)) + grid.rect() + plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1) + popViewport(1) + } +} + +dev.off() + +cat("Success", paste(rowIDs,collapse="_"),sep=":") +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script_imgt.py Wed Aug 13 07:32:38 2014 -0400 @@ -0,0 +1,75 @@ +import xlrd +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") +parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") + +args = parser.parse_args() + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + + +vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, +# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", +# r"(IGKV[0-3]D?-[0-9]{1,2})", +# r"(IGLV[0-9]-[0-9]{1,2})", +# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", +# r"(TRGV[234589])", +# r"(TRDV[1-3])"] + +#vPattern = re.compile(r"|".join(vPattern)) +vPattern = re.compile("|".join(vPattern)) + +def filterGene(s, pattern): + if type(s) is not str: + return None + res = pattern.search(s) + if res: + return res.group(0) + return None + + + +currentSeq = "" +currentId = "" +with open(args.input, 'r') as i: + with open(args.output, 'a') as o: + o.write(">>>" + args.id + "\n") + outputdic = dict() + for line in i.readlines()[1:]: + linesplt = line.split("\t") + ref = filterGene(linesplt[1], vPattern) + if not ref or not linesplt[2].rstrip(): + continue + if ref in outputdic: + outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + else: + outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script_xlsx.py Wed Aug 13 07:32:38 2014 -0400 @@ -0,0 +1,58 @@ +import xlrd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +gene_column = 6 +id_column = 7 +seq_column = 8 +LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] + + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + +currentSeq = "" +currentId = "" +with xlrd.open_workbook(args.input, 'r') as wb: + with open(args.output, 'a') as o: + for sheet in wb.sheets(): + if sheet.cell(1,gene_column).value.find("IGHV") < 0: + print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name + continue + o.write(">>>" + sheet.name + "\n") + outputdic = dict() + for rowindex in range(1, sheet.nrows): + ref = sheet.cell(rowindex, gene_column).value.replace(">", "") + if ref in outputdic: + outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + else: + outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k
--- a/wrapper.sh Tue Jul 15 09:00:24 2014 -0400 +++ b/wrapper.sh Wed Aug 13 07:32:38 2014 -0400 @@ -8,26 +8,62 @@ clonal=$5 fixIndels=$6 region=$7 -input=$8 -outDir=$9 -#mkdir $outDir -outID=${10} -outFile=${11} -outFile2=${12} +inputs=$8 +inputs=($inputs) +IDs=$9 +IDs=($IDs) +ref=${10} +output=${11} +outID="result" + +echo "testID = $testID" +echo "species = $species" +echo "substitutionModel = $substitutionModel" +echo "mutabilityModel = $mutabilityModel" +echo "clonal = $clonal" +echo "fixIndels = $fixIndels" +echo "region = $region" +echo "inputs = ${inputs[@]}" +echo "IDs = ${IDs[@]}" +echo "ref = $ref" +echo "output = $output" +echo "outID = $outID" + +fasta="$PWD/baseline.fasta" + + +count=0 +for current in ${inputs[@]} +do + f=$(file $current) + zipType="Zip archive" + if [[ "$f" == *"$zipType"* ]] + then + id=${IDs[$count]} + echo "id=$id" + unzip $current -d $PWD/$id/ >> $PWD/unziplog.log + filename="$PWD/gappednt_${id}.txt" + cat $PWD/$id/*/2_* | cut -f2,4,7 > $filename + python $dir/script_imgt.py --input $filename --ref $ref --output $fasta --id $id + else + python $dir/script_xlsx.py --input $current --ref $ref --output $fasta + fi + count=$((count+1)) +done +workdir="$PWD" +cd $dir +Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1 + +echo "$workdir/${outID}.txt" + +rows=`tail -n +2 $workdir/${outID}.txt | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '` +rows=($rows) +unset rows[${#rows[@]}-1] cd $dir -Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $input $dir/ $outID 2>&1 +Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1 -mv $dir/${outID}.txt $outFile -mv $dir/${outID}.RData $outFile2 -#tail -n+2 $outDir/${outID}.txt > $outDir/tmp.txt -#echo "<html><table border='1'><thead>" >> $outFile -#echo "<tr><th>Type</th><th>ID</th><th>Observed_CDR_R</th><th>Observed_CDR_S</th><th>Observed_FWR_R</th><th>Observed_FWR_S</th><th>Expected_CDR_R</th><th>Expected_CDR_S</th><th>Expected_FWR_R</th><th>Expected_FWR_S</th><th>Focused_Sigma_CDR</th><th>Focused_CIlower_CDR</th><th>Focused_CIupper_CDR</th><th>Focused_Sigma_FWR</th><th>Focused_CIlower_FWR</th><th>Focused_CIupper_FWR</th><th>Focused_P_CDR</th><th>Focused_P_FWR</th></tr></thead><tbody>" >> $outFile -#while read Type ID Observed_CDR_R Observed_CDR_S Observed_FWR_R Observed_FWR_S Expected_CDR_R Expected_CDR_S Expected_FWR_R Expected_FWR_S Focused_Sigma_CDR Focused_CIlower_CDR Focused_CIupper_CDR Focused_Sigma_FWR Focused_CIlower_FWR Focused_CIupper_FWR Focused_P_CDR Focused_P_FWR -#do -# echo "<tr><td>$Type</td><td>$ID</td><td>$Observed_CDR_R</td><td>$Observed_CDR_S</td><td>$Observed_FWR_R</td><td>$Observed_FWR_S</td><td>$Expected_CDR_R</td><td>$Expected_CDR_S</td><td>$Expected_FWR_R</td><td>$Expected_FWR_S</td><td>$Focused_Sigma_CDR</td><td>$Focused_CIlower_CDR</td><td>$Focused_CIupper_CDR</td><td>$Focused_Sigma_FWR</td><td>$Focused_CIlower_FWR</td><td>$Focused_CIupper_FWR</td><td>$Focused_P_CDR</td><td>$Focused_P_FWR</td></tr>" >> $outFile -#done < $outDir/tmp.txt -#echo "</tbody></table></html>" >> $outFile -#rm $outDir/tmp.txt + +