Mercurial > repos > davidvanzessen > baseline_fasta_generator

--- a/baseline_generator.xml	Wed Jul 23 06:00:58 2014 -0400
+++ b/baseline_generator.xml	Wed Jul 23 07:51:10 2014 -0400
@@ -1,7 +1,7 @@
 <tool id="baseline_fasta_generator" name="Baseline generator" version="1.0">
 	<description>Generate baseline fasta file</description>
-	<command interpreter="python">
-		script.py --input $in_file --ref $reference --output $out_file
+	<command interpreter="bash">
+		wrapper.sh $in_file $reference $out_file
 	</command>
 	<inputs>
 		<param name="in_file" type="data" label="Input excel file" />
--- a/script.py	Wed Jul 23 06:00:58 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-import xlrd
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-
-args = parser.parse_args()
-
-gene_column = 6
-id_column = 7
-seq_column = 8
-LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
-
-
-refdic = dict()
-with open(args.ref, 'r') as ref:
-	currentSeq = ""
-	currentId = ""
-	for line in ref.readlines():
-		if line[0] is ">":
-			if currentSeq is not "" and currentId is not "":
-				refdic[currentId[1:]] = currentSeq
-			currentId = line.rstrip()
-			currentSeq = ""
-		else:
-			currentSeq += line.rstrip()
-	refdic[currentId[1:]] = currentSeq
-
-currentSeq = ""
-currentId = ""
-with xlrd.open_workbook(args.input, 'r') as wb:
-	with open(args.output, 'w') as o:
-		for sheet in wb.sheets():
-			if sheet.cell(1,gene_column).value.find("IGHV") < 0:
-				print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
-				continue
-			o.write(">>>" + sheet.name + "\n")
-			outputdic = dict()
-			for rowindex in range(1, sheet.nrows):
-				ref = sheet.cell(rowindex, gene_column).value[2:]
-				if ref in outputdic:
-					outputdic[ref] += [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)]
-				else:
-					outputdic[ref] = [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)]
-			#print outputdic
-
-			for k in outputdic.keys():
-				if k in refdic:
-					o.write(">>" + k + "\n")
-					o.write(refdic[k] + "\n")
-					for seq in outputdic[k]:
-						#print seq
-						o.write(">" + seq[0] + "\n")
-						o.write(seq[1] + "\n")
-				else:
-					print k + " not in reference, skipping " + k
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script_imgt.py	Wed Jul 23 07:51:10 2014 -0400
@@ -0,0 +1,74 @@
+import xlrd
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+	currentSeq = ""
+	currentId = ""
+	for line in ref.readlines():
+		if line[0] is ">":
+			if currentSeq is not "" and currentId is not "":
+				refdic[currentId[1:]] = currentSeq
+			currentId = line.rstrip()
+			currentSeq = ""
+		else:
+			currentSeq += line.rstrip()
+	refdic[currentId[1:]] = currentSeq
+
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+#						r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+#						r"(IGKV[0-3]D?-[0-9]{1,2})",
+#						r"(IGLV[0-9]-[0-9]{1,2})",
+#						r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+#						r"(TRGV[234589])",
+#						r"(TRDV[1-3])"]
+
+#vPattern = re.compile(r"|".join(vPattern))
+vPattern = re.compile("|".join(vPattern))
+
+def filterGene(s, pattern):
+    if type(s) is not str:
+        return None
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return None
+
+
+
+currentSeq = ""
+currentId = ""
+with open(args.input, 'r') as i:
+	with open(args.output, 'w') as o:
+		o.write(">>>IMGT\n")
+		outputdic = dict()
+		for line in i.readlines()[1:]:
+			linesplt = line.split("\t")
+			ref = filterGene(linesplt[1], vPattern)
+			if not ref:
+				continue
+			if ref in outputdic:
+				outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+			else:
+				outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+		#print outputdic
+
+		for k in outputdic.keys():
+			if k in refdic:
+				o.write(">>" + k + "\n")
+				o.write(refdic[k] + "\n")
+				for seq in outputdic[k]:
+					#print seq
+					o.write(">" + seq[0] + "\n")
+					o.write(seq[1] + "\n")
+			else:
+				print k + " not in reference, skipping " + k
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script_xlsx.py	Wed Jul 23 07:51:10 2014 -0400
@@ -0,0 +1,58 @@
+import xlrd
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+gene_column = 6
+id_column = 7
+seq_column = 8
+LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
+
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+	currentSeq = ""
+	currentId = ""
+	for line in ref.readlines():
+		if line[0] is ">":
+			if currentSeq is not "" and currentId is not "":
+				refdic[currentId[1:]] = currentSeq
+			currentId = line.rstrip()
+			currentSeq = ""
+		else:
+			currentSeq += line.rstrip()
+	refdic[currentId[1:]] = currentSeq
+
+currentSeq = ""
+currentId = ""
+with xlrd.open_workbook(args.input, 'r') as wb:
+	with open(args.output, 'w') as o:
+		for sheet in wb.sheets():
+			if sheet.cell(1,gene_column).value.find("IGHV") < 0:
+				print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
+				continue
+			o.write(">>>" + sheet.name + "\n")
+			outputdic = dict()
+			for rowindex in range(1, sheet.nrows):
+				ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
+				if ref in outputdic:
+					outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+				else:
+					outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+			#print outputdic
+
+			for k in outputdic.keys():
+				if k in refdic:
+					o.write(">>" + k + "\n")
+					o.write(refdic[k] + "\n")
+					for seq in outputdic[k]:
+						#print seq
+						o.write(">" + seq[0] + "\n")
+						o.write(seq[1] + "\n")
+				else:
+					print k + " not in reference, skipping " + k