changeset 9:8d83319a0f3d draft

Uploaded
author davidvanzessen
date Tue, 10 Dec 2013 05:53:08 -0500
parents 00d432c66fb8
children a5c224bb0be5
files RScript.r combined.sh combined.xml combined_imgt.xml igblastmerge.py imgtconvert.py imgtconvert.sh tool_dependencies.xml
diffstat 8 files changed, 421 insertions(+), 150 deletions(-)
--- a/RScript.r	Mon Nov 25 09:21:55 2013 -0500
+++ b/RScript.r	Tue Dec 10 05:53:08 2013 -0500
@@ -1,4 +1,4 @@
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+#options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
 
 args <- commandArgs(trailingOnly = TRUE)
 
@@ -30,11 +30,6 @@
 
 test = test[test$Sample != "",]
 
-if("Replicate" %in% colnames(test))
-{
-	test$SRID = do.call(paste, c(test[c("Sample", "Replicate")], sep = "-"))
-}
-
 test$Top.V.Gene = gsub("[*]([0-9]+)", "", test$Top.V.Gene)
 test$Top.D.Gene = gsub("[*]([0-9]+)", "", test$Top.D.Gene)
 test$Top.J.Gene = gsub("[*]([0-9]+)", "", test$Top.J.Gene)
@@ -43,6 +38,9 @@
 test$VDJCDR3 = do.call(paste, c(test[unlist(strsplit(clonalType, ","))], sep = ":"))
 
 PROD = test[test$VDJ.Frame != "In-frame with stop codon" & test$VDJ.Frame != "Out-of-frame" & test$CDR3.Found.How != "NOT_FOUND" , ]
+if("Functionality" %in% colnames(test)) {
+	PROD = test[test$Functionality == "productive" | test$Functionality == "productive (see comment)", ]
+}
 
 NONPROD = test[test$VDJ.Frame == "In-frame with stop codon" | test$VDJ.Frame == "Out-of-frame" | test$CDR3.Found.How == "NOT_FOUND" , ]
 
@@ -74,7 +72,7 @@
 PRODFJ = merge(PRODFJ, Total, by.x='Sample', by.y='Sample', all.x=TRUE)
 PRODFJ = ddply(PRODFJ, c("Sample", "Top.J.Gene"), summarise, relFreq= (Length*100 / Total))
 
-V = c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV3-71\t5\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV3-52\t14\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV3-47\t19\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-22\t36\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV3-19\t39\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53")
+V = c("v.name\tchr.orderV\nIGHV7-81\t1\nIGHV3-74\t2\nIGHV3-73\t3\nIGHV3-72\t4\nIGHV2-70\t6\nIGHV1-69\t7\nIGHV3-66\t8\nIGHV3-64\t9\nIGHV4-61\t10\nIGHV4-59\t11\nIGHV1-58\t12\nIGHV3-53\t13\nIGHV5-a\t15\nIGHV5-51\t16\nIGHV3-49\t17\nIGHV3-48\t18\nIGHV1-46\t20\nIGHV1-45\t21\nIGHV3-43\t22\nIGHV4-39\t23\nIGHV3-35\t24\nIGHV4-34\t25\nIGHV3-33\t26\nIGHV4-31\t27\nIGHV4-30-4\t28\nIGHV4-30-2\t29\nIGHV3-30-3\t30\nIGHV3-30\t31\nIGHV4-28\t32\nIGHV2-26\t33\nIGHV1-24\t34\nIGHV3-23\t35\nIGHV3-21\t37\nIGHV3-20\t38\nIGHV1-18\t40\nIGHV3-15\t41\nIGHV3-13\t42\nIGHV3-11\t43\nIGHV3-9\t44\nIGHV1-8\t45\nIGHV3-7\t46\nIGHV2-5\t47\nIGHV7-4-1\t48\nIGHV4-4\t49\nIGHV4-b\t50\nIGHV1-3\t51\nIGHV1-2\t52\nIGHV6-1\t53")
 tcV = textConnection(V)
 Vchain = read.table(tcV, sep="\t", header=TRUE)
 PRODFV = merge(PRODFV, Vchain, by.x='Top.V.Gene', by.y='v.name', all.x=TRUE)
@@ -95,6 +93,8 @@
 
 setwd(outDir)
 
+write.table(PRODF, "allUnique.tsv", sep="\t",quote=F,row.names=T,col.names=T)
+
 pV = ggplot(PRODFV)
 pV = pV + geom_bar( aes( x=factor(reorder(Top.V.Gene, chr.orderV)), y=relFreq, fill=Sample), stat='identity', position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
 pV = pV + xlab("Summary of V gene") + ylab("Frequency") + ggtitle("Relative frequency of V gene usage")
@@ -131,7 +131,7 @@
 	img = ggplot() + 
 	geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
-	scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
 	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("D genes") + 
 	ylab("V Genes")
@@ -166,7 +166,7 @@
 	img = ggplot() + 
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
-	scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
 	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("V Genes")
@@ -198,7 +198,7 @@
 	img = ggplot() + 
 	geom_tile(data=dat, aes(x=factor(reorder(Top.J.Gene, chr.orderJ)), y=factor(reorder(Top.D.Gene, chr.orderD)), fill=relLength)) + 
 	theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
-	scale_fill_gradient(low="gold", high="blue", na.value="white", limits=c(0,1)) + 
+	scale_fill_gradient(low="gold", high="blue", na.value="white") + 
 	ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 
 	xlab("J genes") + 
 	ylab("D Genes")
@@ -229,3 +229,88 @@
 un = paste(un, sep="\n")
 writeLines(un, sampleFile)
 close(sampleFile)
+
+
+if("Replicate" %in% colnames(test))
+{
+	clonalityFrame = PROD
+	clonalityFrame$ReplicateConcat = do.call(paste, c(clonalityFrame[c("VDJCDR3", "Sample", "Replicate")], sep = ":"))
+	clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$ReplicateConcat), ]
+	write.table(clonalityFrame, "clonalityComplete.tsv", sep="\t",quote=F,row.names=T,col.names=T)
+
+	ClonalitySampleReplicatePrint <- function(dat){
+	    write.table(dat, paste("clonality_", unique(dat$Sample) , "_", unique(dat$Replicate), ".tsv", sep=""), sep="\t",quote=F,row.names=T,col.names=T)
+	}
+
+    clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,c("Sample", "Replicate")])
+    lapply(clonalityFrameSplit, FUN=ClonalitySampleReplicatePrint)
+
+    ClonalitySamplePrint <- function(dat){
+	    write.table(dat, paste("clonality_", unique(dat$Sample) , ".tsv", sep=""), sep="\t",quote=F,row.names=T,col.names=T)
+	}
+
+    clonalityFrameSplit = split(clonalityFrame, f=clonalityFrame[,"Sample"])
+    lapply(clonalityFrameSplit, FUN=ClonalitySamplePrint)
+
+	clonalFreq = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "VDJCDR3")])
+	clonalFreqCount = data.frame(data.table(clonalFreq)[, list(Count=.N), by=c("Sample", "Type")])
+	clonalFreqCount$realCount = clonalFreqCount$Type * clonalFreqCount$Count
+	clonalSum = data.frame(data.table(clonalFreqCount)[, list(Reads=sum(realCount)), by=c("Sample")])
+	clonalFreqCount = merge(clonalFreqCount, clonalSum, by.x="Sample", by.y="Sample")
+
+	ct = c('Type\tWeight\n2\t1\n3\t3\n4\t6\n5\t10\n6\t15')
+	tcct = textConnection(ct)
+	CT  = read.table(tcct, sep="\t", header=TRUE)
+	close(tcct)
+	clonalFreqCount = merge(clonalFreqCount, CT, by.x="Type", by.y="Type", all.x=T)
+	clonalFreqCount$WeightedCount = clonalFreqCount$Count * clonalFreqCount$Weight
+
+	ReplicateReads = data.frame(data.table(clonalityFrame)[, list(Type=.N), by=c("Sample", "Replicate", "VDJCDR3")])
+	ReplicateReads = data.frame(data.table(ReplicateReads)[, list(Reads=.N), by=c("Sample", "Replicate")])
+	ReplicateReads$squared = ReplicateReads$Reads * ReplicateReads$Reads
+
+	ReplicatePrint <- function(dat){
+		write.table(dat[-1], paste("ReplicateReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
+	}
+
+	ReplicateSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
+	lapply(ReplicateSplit, FUN=ReplicatePrint)
+
+	ReplicateReads = data.frame(data.table(ReplicateReads)[, list(ReadsSum=sum(Reads), ReadsSquaredSum=sum(squared)), by=c("Sample")])
+	clonalFreqCount = merge(clonalFreqCount, ReplicateReads, by.x="Sample", by.y="Sample", all.x=T)
+
+
+	ReplicateSumPrint <- function(dat){
+		write.table(dat[-1], paste("ReplicateSumReads_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
+	}
+
+	ReplicateSumSplit = split(ReplicateReads, f=ReplicateReads[,"Sample"])
+	lapply(ReplicateSumSplit, FUN=ReplicateSumPrint)
+
+	clonalFreqCountSum = data.frame(data.table(clonalFreqCount)[, list(Numerator=sum(WeightedCount, na.rm=T)), by=c("Sample")])
+	clonalFreqCount = merge(clonalFreqCount, clonalFreqCountSum, by.x="Sample", by.y="Sample", all.x=T)
+
+	clonalFreqCount$Denominator = (((clonalFreqCount$ReadsSum * clonalFreqCount$ReadsSum) - clonalFreqCount$ReadsSquaredSum) / 2)
+	clonalFreqCount$Result = (clonalFreqCount$Numerator + 1) / (clonalFreqCount$Denominator + 1)
+
+	ClonalityScorePrint <- function(dat){
+		write.table(dat$Result, paste("ClonalityScore_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
+	}
+
+	clonalityScore = clonalFreqCount[c("Sample", "Result")]
+	clonalityScore = unique(clonalityScore)
+
+	clonalityScoreSplit = split(clonalityScore, f=clonalityScore[,"Sample"])
+	lapply(clonalityScoreSplit, FUN=ClonalityScorePrint)
+
+	clonalityOverview = clonalFreqCount[c("Sample", "Type", "Count", "Weight", "WeightedCount")]
+
+
+
+	ClonalityOverviewPrint <- function(dat){
+		write.table(dat[-1], paste("ClonalityOverView_", unique(dat[1])[1,1] , ".csv", sep=""), sep=",",quote=F,na="-",row.names=F,col.names=F)
+	}
+
+	clonalityOverviewSplit = split(clonalityOverview, f=clonalityOverview$Sample)
+	lapply(clonalityOverviewSplit, FUN=ClonalityOverviewPrint)
+}
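
Reading the new block: a clone that turns up in n replicates contributes n*(n-1)/2 coincidences (exactly what the Type/Weight table 2->1, 3->3, 4->6, 5->10, 6->15 encodes), and the per-sample score works out to (observed coincidences + 1) / (possible cross-replicate pairs + 1), where the possible pairs are ((sum of reads)^2 - sum of squared reads) / 2. A minimal sketch of that calculation in Python, assuming a mapping from replicate id to the set of clone keys seen in it (clonality_score and replicate_clones are illustrative names, not part of the changeset):

    def clonality_score(replicate_clones):
        # Count in how many replicates each clone (VDJCDR3 key) occurs.
        occurrences = {}
        for clones in replicate_clones.values():
            for clone in clones:
                occurrences[clone] = occurrences.get(clone, 0) + 1
        # A clone seen in n replicates yields n*(n-1)/2 pairwise coincidences,
        # which is what the weights 1, 3, 6, 10, 15 for Types 2..6 encode.
        numerator = sum(n * (n - 1) // 2 for n in occurrences.values())
        # All possible cross-replicate pairs: ((sum reads)^2 - sum reads^2) / 2.
        reads = [len(clones) for clones in replicate_clones.values()]
        denominator = (sum(reads) ** 2 - sum(r * r for r in reads)) / 2.0
        return (numerator + 1) / (denominator + 1)

    # Two replicates sharing one clone: 1 coincidence out of 2 possible pairs.
    print(clonality_score({"1": set(["IGHV3-23:CARDY", "IGHV1-2:CARGG"]),
                           "2": set(["IGHV3-23:CARDY"])}))
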
--- a/combined.sh	Mon Nov 25 09:21:55 2013 -0500
+++ b/combined.sh	Tue Dec 10 05:53:08 2013 -0500
@@ -1,55 +1,42 @@
 #!/bin/bash
 
-#export IGDATA=/home/david/Galaxy/galaxy-dist/toolsheddependencies/igBlastn/1.0.0/davidvanzessen/igblast_human/1c64c977624e/ncbi-igblast-1.0.0/;
+export IGDATA=/home/david/tmp/ncbi-igblast-1.0.0;
 
 clonalType=${@:(-3):1}
 html=${@:(-2):1}
 imageDir=${@:(-1):1}
+dataCount=`expr $# - 3`
+inputData=${@:(1):dataCount}
 dir="$(cd "$(dirname "$0")" && pwd)"
-fileCount=`expr $# - 3`
 array=("$@")
 echo "<html><h3>Progress</h3><table><tr><td>info</td></tr>" > $html
 echo "<tr><td>-----------------------------------</td></tr>" >> $html
-limit=`expr $fileCount / 2`
 
-function blastAndParse {
-	echo "<tr><td>Starting blast of $2</td></tr>" >> $html
-	fileName=$(basename $1)
-	$IGDATA/bin/igblastn -germline_db_V $IGDATA/database/human_gl_V -germline_db_J $IGDATA/database/human_gl_J -germline_db_D $IGDATA/database/human_gl_D -domain_system imgt -query $1 -auxiliary_data $IGDATA/optional_file/human_gl.aux -show_translation -outfmt 3 > $PWD/$fileName
-	echo "<tr><td>Finished blast of $2</td></tr>" >> $html
-	
-	echo "<tr><td>Starting parse of $2</td></tr>" >> $html
-	parsedFileName="${fileName%.*}"
-	parsedFileName="$parsedFileName.parsed"
-	perl $dir/igparse.pl $PWD/$fileName 0 | grep -v "D:" | cut -f2- > $parsedFileName
-	echo "<tr><td>Finished parse of $2</td></tr>" >> $html
-}
-
-for ((i=0;i<$fileCount;i=$((i+2))))
-do
-	next=$((i+1))
-	blastAndParse ${array[$i]} ${array[$next]} &
+id=${inputData[0]}
+forwardSlash="/"
+mergerInput=()
+count=0
+for current in $inputData; do
+    if [[ "$current" != *"$forwardSlash"* ]]; then
+        id=$current
+        count=0
+        mergerInput+=($id)
+        continue
+    fi
+    fileName=$(basename $current)
+    convertedFileName="${fileName%.*}"
+    convertedFileName="$PWD/$convertedFileName.converted"
+	bash $dir/imgtconvert.sh $current $id $count $convertedFileName
+	mergerInput+=($convertedFileName)
+	count=$((count+1))
 done
-wait
 
 
 
 echo "<tr><td>-----------------------------------</td></tr>" >> $html
 echo "<tr><td>merging</td></tr>" >> $html
 
-count=0
-for ((i=0;i<$fileCount;i=$((i+2))))
-do
-	id=$((i+1))
-	place=$((count+limit))
-	fn=$(basename ${array[$i]})
-	fn="${fn%.*}"
-	mergeInputs[$count]="$PWD/$fn.parsed"
-	mergeIDs[$place]=${array[$id]}
-	count=$((count+1))
-done
-
-python $dir/igblastmerge.py --input ${mergeInputs[*]} --id ${mergeIDs[*]} --output $PWD/merged.txt
+python $dir/igblastmerge.py ${mergerInput[*]}  --output $PWD/merged.txt
 
 echo "<tr><td>done</td></tr>" >> $html
 echo "<tr><td>-----------------------------------</td></tr>" >> $html
@@ -68,10 +55,43 @@
 
 samples=`cat $outputDir/samples.txt`
 count=1
-echo "<table border='1'><caption><h3>$clonalType</h3></caption>" >> $outputFile
+echo "<table border='1'><caption><a href='allUnique.tsv'><h3>$clonalType</h3></a></caption>" >> $outputFile
+hasReplicateColumn="$(if head -n 1 $inputFile | grep -q 'Replicate'; then echo 'Yes'; else echo 'No'; fi)"
 for sample in $samples; do
-	echo "<tr><td colspan='3' height='100'></td>" >> $outputFile
+	clonalityScore="$(cat $outputDir/ClonalityScore_$sample.csv)"
+	echo "<tr><td colspan='3' height='100'></td></tr>" >> $outputFile
 	echo "<tr><td colspan='3'><h1>$sample</h1></td></tr>" >> $outputFile
+
+	echo "$hasReplicateColumn"
+	#if it's a 'new' merged file with replicate info
+	if [[ "$hasReplicateColumn" == "Yes" ]] ; then
+		echo "<tr><td colspan='3'><a href='clonality_$sample.tsv'><h2>Clonality Score: $clonalityScore</h2></a></td></tr>" >> $outputFile
+
+		#replicate,reads,squared
+		echo "<tr><td colspan='3'><table border='1'><tr><th>Replicate ID</th><th>Number of Reads</th><th>Reads Squared</th></tr>" >> $outputFile
+		while IFS=, read replicate reads squared
+		do
+
+			echo "<tr><td><a href='clonality_${sample}_$replicate.tsv'>$replicate</a></td><td>$reads</td><td>$squared</td></tr>" >> $outputFile
+		done < $outputDir/ReplicateReads_$sample.csv
+
+		#sum of reads and reads squared
+		while IFS=, read readsSum squaredSum
+			do
+				echo "<tr><td>Sum</td><td>$readsSum</td><td>$squaredSum</td></tr>" >> $outputFile
+		done < $outputDir/ReplicateSumReads_$sample.csv
+
+		echo "</table></td></tr>" >> $outputFile
+
+		#overview
+		echo "<tr><td colspan='3'><table border='1'><tr><th>Coincidence Type</th><th>Raw Coincidence Freq</th><th>Coincidence Weight</th><th>Coincidences, Weighted</th></tr>" >> $outputFile
+		while IFS=, read type count weight weightedCount
+		do
+			echo "<tr><td>$type</td><td>$count</td><td>$weight</td><td>$weightedCount</td></tr>" >> $outputFile
+		done < $outputDir/ClonalityOverView_$sample.csv
+		echo "</table></td></tr>" >> $outputFile
+	fi
+
 	echo "<tr><td><h2>V-D Heatmap:</h2></td><td><h2>V-J Heatmap:</h2></td><td><h2>D-J Heatmap:</h2></td></tr><tr>" >> $outputFile
 	mv "$outputDir/HeatmapVD_$sample.png" "$outputDir/VD_$sample.png"
 	echo "<td><img src='VD_$sample.png'/></td>" >> $outputFile
@@ -83,4 +103,4 @@
 done
 echo "</table>" >> $outputFile
 
-echo "</html>" >> $2
+echo "</html>" >> $outputFile
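
The rewritten input handling above drops the fasta/ID pairs of the old igBLAST flow: the script now receives one flat positional list in which a bare token (one without a "/") starts a new patient ID and every path that follows it is a sample archive for that patient, which is the order combined_imgt.xml below emits. A short sketch of that grouping rule, in Python for brevity (group_inputs and the paths are illustrative, not part of the changeset):

    def group_inputs(tokens):
        # Tokens without a "/" are patient IDs; tokens containing one are sample paths.
        groups, current_id = {}, None
        for token in tokens:
            if "/" not in token:
                current_id = token
                groups[current_id] = []
            else:
                groups[current_id].append(token)
        return groups

    # e.g. the argument order produced by combined_imgt.xml:
    print(group_inputs(["patientA", "/tmp/sample1.zip", "/tmp/sample2.zip",
                        "patientB", "/tmp/sample3.zip"]))
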
--- a/combined.xml	Mon Nov 25 09:21:55 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,34 +0,0 @@
-<tool id="immunerepertoirecombined" name="Immunerepertoire combined" version="1.0">
-	<description>pipeline</description>
-	<command interpreter="bash">
-		combined.sh 
-		#for $i, $f in enumerate($files)
-			${f.file}
-			${f.id}
-		#end for
-		"$clonaltype_select" $out_file $out_file.files_path
-	</command>
-	<inputs>
-		<repeat name="files" title="File" min="1" default="1">
-			<param name="file" format="fasta" type="data" label="Data to Process" />
-			<param name="id" type="text" label="ID" />
-		</repeat>
-		<param name="clonaltype_select" type="select" label="Clonal Type Definition">
-			<option value="Top.V.Gene,CDR3.Seq">Top.V.Gene, CDR3.Seq</option>
-			<option value="Top.V.Gene,CDR3.Seq.DNA">Top.V.Gene, CDR3.Seq.DNA</option>
-			<option value="Top.V.Gene,Top.J.Gene,CDR3.Seq">Top.V.Gene, Top.J.Gene, CDR3.Seq</option>
-			<option value="Top.V.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.J.Gene, CDR3.Seq.DNA</option>
-			<option value="Top.V.Gene,Top.D.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.D.Gene, Top.J.Gene, CDR3.Seq.DNA</option>
-		</param>
-	</inputs>
-	<outputs>
-		<data format="html" name="out_file" />
-	</outputs>
-	<help>
-		The entire Immune Repertoire pipeline as a single tool, input several FASTA files, give them an ID and it will BLAST, parse, merge and plot them.
-	</help>
-	<requirements>
-		<requirement type="package" version="1.0.0">igBlastn</requirement>
-	</requirements>
-</tool>
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/combined_imgt.xml	Tue Dec 10 05:53:08 2013 -0500
@@ -0,0 +1,35 @@
+<tool id="immunerepertoirecombined_imgt" name="Immunerepertoire combined" version="1.0">
+	<description>pipeline for IMGT</description>
+	<command interpreter="bash">
+		combined.sh
+		#for $i, $f in enumerate($patients)
+			${f.id}
+            #for $j, $g in enumerate($f.samples)
+            	${g.sample}
+            #end for
+		#end for
+		"$clonaltype_select" $out_file $out_file.files_path
+	</command>
+	<inputs>
+		<repeat name="patients" title="Patients" min="1" default="1">
+            <repeat name="samples" title="Sample" min="1" default="1">
+                <param name="sample" type="data" label="Sample to Process" />
+            </repeat>
+			<param name="id" type="text" label="ID" />
+		</repeat>
+		<param name="clonaltype_select" type="select" label="Clonal Type Definition">
+			<option value="Top.V.Gene,CDR3.Seq">Top.V.Gene, CDR3.Seq</option>
+			<option value="Top.V.Gene,CDR3.Seq.DNA">Top.V.Gene, CDR3.Seq.DNA</option>
+			<option value="Top.V.Gene,Top.J.Gene,CDR3.Seq">Top.V.Gene, Top.J.Gene, CDR3.Seq</option>
+			<option value="Top.V.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.J.Gene, CDR3.Seq.DNA</option>
+			<option value="Top.V.Gene,Top.D.Gene,Top.J.Gene,CDR3.Seq.DNA">Top.V.Gene, Top.D.Gene, Top.J.Gene, CDR3.Seq.DNA</option>
+		</param>
+	</inputs>
+	<outputs>
+		<data format="html" name="out_file" />
+	</outputs>
+	<help>
+		The entire Immune Repertoire pipeline as a single tool: provide one or more IMGT zip files per patient, give each patient an ID, and the tool will parse, merge and plot them.
+	</help>
+</tool>
+
--- a/igblastmerge.py	Mon Nov 25 09:21:55 2013 -0500
+++ b/igblastmerge.py	Tue Dec 10 05:53:08 2013 -0500
@@ -1,4 +1,3 @@
-import argparse
 import sys
 # error
 def stop_err( msg ):
@@ -7,35 +6,45 @@
 
 # main
 def main():
-	parser = argparse.ArgumentParser() #docs.python.org/dev/library/argparse.html
-	parser.add_argument("--input", help="Input file(s)", nargs="+")
-	parser.add_argument("--id", help="Input file(s) id's", nargs="+")
-	parser.add_argument("--output", help="Output file")
-	
-	args = parser.parse_args()
-	try:
-		o = open(args.output, 'w')
-		i = open(args.input[-1], 'r')
-		separator = "\t"
-		newline = "\n"
-		header = "Sample"
-		line = i.readline()
-		o.write(line[:line.rfind(newline)] + separator + header + newline) #write the header
-		i.close()
-		
-		for cf,i in zip(args.input,args.id):
-			f = open(cf, 'r')
-			line = f.readline()
-			line = f.readline() #skip header
-			while line:
-				o.write(line[:line.rfind(newline)] + separator + i + newline)
-				line = f.readline()
-			f.close()
-		o.close()
+    args = sys.argv[1:-2]
     
-	except Exception, ex:
-		stop_err('Error running new_column.py\n' + str(ex))
-	sys.exit(0)
+    try:
+        o = open(sys.argv[-1], 'w')
+        i = open(args[1], 'r')
+        separator = "\t"
+        newline = "\n"
+        line = i.readline()
+        #write the header
+        o.write(line[:line.rfind(newline)] + separator + "Sample" + separator + "Replicate" + newline)
+        i.close()
+
+        current = 1
+        sampleID = args[0]
+        count = 1
+
+        while True:
+            f = open(args[current], 'r')
+            line = f.readline()
+            line = f.readline()
+            while line:
+                o.write(line[:line.rfind(newline)] + separator + sampleID + separator + str(count) + newline)
+                line = f.readline()
+            f.close()
+
+            if current >= (len(args) - 1):
+                break
+            if args[current + 1].find("/") == -1:
+                sampleID = args[current + 1]
+                current += 1
+                count = 1
+            else:
+                count += 1
+            current += 1
+        o.close()
+
+    except Exception, ex:
+        stop_err('Error running new_column.py\n' + str(ex))
+    sys.exit(0)
 
 if __name__ == "__main__":
 	print sys.argv
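
With argparse removed, the merger reads sys.argv directly: sys.argv[-1] is taken as the output path and sys.argv[1:-2] as the ID/file list, so the call has to end with "--output <file>" as it does in combined.sh, and each data row gains a Sample and a Replicate column. A hypothetical invocation mirroring that call (the .converted file names are only placeholders):

    import subprocess

    # Bare tokens are sample IDs, tokens containing a "/" are converted data files;
    # the trailing "--output <file>" pair supplies the merged output path.
    subprocess.call(["python", "igblastmerge.py",
                     "patientA", "./sample1.converted", "./sample2.converted",
                     "patientB", "./sample3.converted",
                     "--output", "merged.txt"])
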
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgtconvert.py	Tue Dec 10 05:53:08 2013 -0500
@@ -0,0 +1,139 @@
+import pandas as pd
+import re
+import argparse
+import os
+import sys
+
+def stop_err( msg, ret=1 ):
+    sys.stderr.write( msg )
+    sys.exit( ret )
+
+#docs.python.org/dev/library/argparse.html
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Input folder with files")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+old_summary_columns = [u'Sequence ID', u'JUNCTION frame', u'V-GENE and allele', u'D-GENE and allele', u'J-GENE and allele', u'CDR1-IMGT length', u'CDR2-IMGT length', u'CDR3-IMGT length', u'Orientation']
+old_sequence_columns = [u'CDR1-IMGT', u'CDR2-IMGT', u'CDR3-IMGT']
+old_junction_columns = [u'JUNCTION']
+
+added_summary_columns = [u'V-REGION identity %', u'V-REGION identity nt', u'D-REGION reading frame', u'AA JUNCTION', u'Functionality comment', u'Sequence']
+added_sequence_columns = [u'FR1-IMGT', u'FR2-IMGT', u'FR3-IMGT', u'CDR3-IMGT', u'JUNCTION', u'J-REGION', u'FR4-IMGT']
+added_junction_columns = [u"P3'V-nt nb", u'N1-REGION-nt nb', u"P5'D-nt nb", u"P3'D-nt nb", u'N2-REGION-nt nb', u"P5'J-nt nb", u"3'V-REGION trimmed-nt nb", u"5'D-REGION trimmed-nt nb", u"3'D-REGION trimmed-nt nb", u"5'J-REGION trimmed-nt nb"]
+
+inputFolder = args.input
+
+dirContents = os.listdir(inputFolder)
+if len(dirContents) == 1:
+    inputFolder = os.path.join(inputFolder, dirContents[0])
+    if os.path.isdir(inputFolder):
+        print "is dir"
+        dirContents = os.listdir(inputFolder)
+files = sorted([os.path.join(inputFolder, f) for f in dirContents])
+
+if len(files) % 3 != 0:
+    stop_err("Files in zip not a multiple of 3; it should contain all of the 1_, 5_ and 6_ files for each sample")
+
+
+
+triplets = []
+step = len(files) / 3
+for i in range(0, step):
+    triplets.append((files[i], files[i + step], files[i + step + step]))
+
+outFile = args.output
+
+fSummary = pd.read_csv(triplets[0][0], sep="\t")
+fSequence = pd.read_csv(triplets[0][1], sep="\t")
+fJunction = pd.read_csv(triplets[0][2], sep="\t")
+tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]]
+
+tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"]
+tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"]
+
+tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"]
+tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"]
+
+tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"]
+tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"]
+
+tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"]
+tmp["CDR3 Length DNA"] = '1'
+tmp["Strand"] = fSummary["Orientation"]
+tmp["CDR3 Found How"] = 'a'
+
+for col in added_summary_columns:
+    tmp[col] = fSummary[col]
+
+for col in added_sequence_columns:
+    tmp[col] = fSequence[col]
+
+for col in added_junction_columns:
+    tmp[col] = fJunction[col]
+
+outFrame = tmp
+
+for triple in triplets[1:]:
+    fSummary = pd.read_csv(triple[0], sep="\t")
+    fSequence = pd.read_csv(triple[1], sep="\t")
+    fJunction = pd.read_csv(triple[2], sep="\t")
+
+    tmp = fSummary[["Sequence ID", "JUNCTION frame", "V-GENE and allele", "D-GENE and allele", "J-GENE and allele"]]
+
+    tmp["CDR1 Seq"] = fSequence["CDR1-IMGT"]
+    tmp["CDR1 Length"] = fSummary["CDR1-IMGT length"]
+
+    tmp["CDR2 Seq"] = fSequence["CDR2-IMGT"]
+    tmp["CDR2 Length"] = fSummary["CDR2-IMGT length"]
+
+    tmp["CDR3 Seq"] = fSequence["CDR3-IMGT"]
+    tmp["CDR3 Length"] = fSummary["CDR3-IMGT length"]
+
+    tmp["CDR3 Seq DNA"] = fJunction["JUNCTION"]
+    tmp["CDR3 Length DNA"] = '1'
+    tmp["Strand"] = fSummary["Orientation"]
+    tmp["CDR3 Found How"] = 'a'
+
+    for col in added_summary_columns:
+        tmp[col] = fSummary[col]
+
+    for col in added_sequence_columns:
+        tmp[col] = fSequence[col]
+
+    for col in added_junction_columns:
+        tmp[col] = fJunction[col]
+
+    outFrame = outFrame.append(tmp)
+
+outFrame.columns = [u'ID', u'VDJ Frame', u'Top V Gene', u'Top D Gene', u'Top J Gene', u'CDR1 Seq', u'CDR1 Length', u'CDR2 Seq', u'CDR2 Length', u'CDR3 Seq', u'CDR3 Length', u'CDR3 Seq DNA', u'CDR3 Length DNA', u'Strand', u'CDR3 Found How', 'V-REGION identity %', 'V-REGION identity nt', 'D-REGION reading frame', 'AA JUNCTION', 'Functionality comment', 'Sequence', 'FR1-IMGT', 'FR2-IMGT', 'FR3-IMGT', 'CDR3-IMGT', 'JUNCTION', 'J-REGION', 'FR4-IMGT', 'P3V-nt nb', 'N1-REGION-nt nb', 'P5D-nt nb', 'P3D-nt nb', 'N2-REGION-nt nb', 'P5J-nt nb', '3V-REGION trimmed-nt nb', '5D-REGION trimmed-nt nb', '3D-REGION trimmed-nt nb', '5J-REGION trimmed-nt nb']
+
+vPattern = re.compile(r"IGHV[1-9]-[0-9ab]+-?[1-9]?")
+dPattern = re.compile(r"IGHD[1-9]-[0-9ab]+")
+jPattern = re.compile(r"IGHJ[1-9]")
+
+def filterGenes(s, pattern):
+    if type(s) is not str:
+        return "NA"
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return "NA"
+
+
+outFrame["Top V Gene"] = outFrame["Top V Gene"].apply(lambda x: filterGenes(x, vPattern))
+outFrame["Top D Gene"] = outFrame["Top D Gene"].apply(lambda x: filterGenes(x, dPattern))
+outFrame["Top J Gene"] = outFrame["Top J Gene"].apply(lambda x: filterGenes(x, jPattern))
+
+
+
+tmp = outFrame["VDJ Frame"]
+tmp = tmp.replace("in-frame", "In-frame")
+tmp = tmp.replace("null", "Out-of-frame")
+tmp = tmp.replace("out-of-frame", "Out-of-frame")
+outFrame["VDJ Frame"] = tmp
+outFrame["CDR3 Length"] = outFrame["CDR3 Seq DNA"].map(str).map(len)
+safeLength = lambda x: len(x) if type(x) == str else 0
+outFrame = outFrame[(outFrame["CDR3 Seq DNA"].map(safeLength) > 0) & (outFrame["Top V Gene"] != "NA") & (outFrame["Top D Gene"] != "NA") & (outFrame["Top J Gene"] != "NA")] #keep only rows with a CDR3 DNA sequence and identified V, D and J genes
+outFrame.to_csv(outFile, sep="\t", index=False)
\ No newline at end of file
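
The three regular expressions near the end reduce IMGT's full gene-and-allele strings to the bare gene names RScript.r works with; any value they fail to match becomes "NA" and is dropped by the final filter. A quick illustration, using a made-up string in the usual IMGT format (the input value is only an example):

    import re

    vPattern = re.compile(r"IGHV[1-9]-[0-9ab]+-?[1-9]?")
    print(vPattern.search("Homsap IGHV3-23*01 F").group(0))  # prints IGHV3-23
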
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/imgtconvert.sh	Tue Dec 10 05:53:08 2013 -0500
@@ -0,0 +1,57 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+mkdir $PWD/$2_$3
+
+
+# detect the archive type reported by 'file' and extract $1 into $PWD/$2_$3
+f=$(file $1)
+zip7Type="7-zip archive"
+tarType="tar archive"
+bzip2Type="bzip2 compressed"
+gzipType="gzip compressed"
+zipType="Zip archive"
+rarType="RAR archive"
+
+if [[ "$f" == *"$zip7Type"* ]]; then
+	echo "7-zip"
+	echo "Trying: 7za e $1 -o$PWD/$2_$3/"
+	7za e $1 -o$PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$tarType"* ]]
+then
+	echo "tar archive"
+	echo "Trying: tar xvf $1 -C $PWD/$2_$3/"
+	tar xvf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$bzip2Type"* ]]
+then
+	echo "bzip2 compressed data"
+	echo "Trying: tar jxf $1 -C $PWD/$2_$3/"
+	tar jxf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$gzipType"* ]]
+then
+	echo "gzip compressed data"
+	echo "Trying: tar xvzf $1 -C $PWD/$2_$3/"
+	tar xvzf $1 -C $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$zipType"* ]]
+then
+	echo "Zip archive"
+	echo "Trying: unzip $1 -d $PWD/$2_$3/"
+	unzip $1 -d $PWD/$2_$3/
+fi
+
+if [[ "$f" == *"$rarType"* ]]
+then
+	echo "RAR archive"
+	echo "Trying: unrar e $1 $PWD/$2_$3/"
+	unrar e $1 $PWD/$2_$3/
+fi
+find $PWD/$2_$3/ -type f |  grep -v "1_Summary_\|5_AA-sequences_\|6_Junction_" | xargs rm -f
+python $dir/imgtconvert.py --input $PWD/$2_$3 --output $4
+
--- a/tool_dependencies.xml	Mon Nov 25 09:21:55 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-	<set_environment version="1.0">            
-    </set_environment>
-    
-	<package name="igBlastn" version="1.0.0"> 
-        <install version="1.0">
-            <actions>
-            	<action type="download_by_url" target_filename="ncbi-igblast-1.0.0-x64-linux.tar.gz">
-            		ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.0.0/ncbi-igblast-1.0.0-x64-linux.tar.gz
-            	</action>
-            	
-            	
-            	<action type="move_directory_files">
-		    		<source_directory>ncbi-igblast-1.0.0</source_directory>
-			   		<destination_directory>$INSTALL_DIR/ncbi-igblast-1.0.0</destination_directory>
-	       		</action>
-                <action type="set_environment">
-                	<environment_variable name="IGDATA" action="set_to">$INSTALL_DIR/ncbi-igblast-1.0.0</environment_variable>
-                </action>
-                <action type="set_environment">
-                	<environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/ncbi-igblast-1.0.0/bin</environment_variable>
-                </action>
-
-                <action type="make_directory">$INSTALL_DIR/ncbi-igblast-1.0.0/internal_data</action>
-                <action type="make_directory">$INSTALL_DIR/ncbi-igblast-1.0.0/internal_data/human</action>
-                                
-                <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data"</action>
-                
-                <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/database"</action>
-                
-                <action type="shell_command">wget -r -np -nH --cut-dirs=4 -R index.html -P $INSTALL_DIR/ncbi-igblast-1.0.0 "ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file"</action>
-            	
-            </actions>
-        </install>
-        <readme>
-		Downloads igBlast with the latest database provided by NCBI, output may change depending on the database available at ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/
-        </readme>
-    </package>
-</tool_dependency>