Mercurial > repos > davidvanzessen > baseline_fasta_generator
changeset 3:be17e1e7dc2b draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 23 Jul 2014 07:51:10 -0400 |
parents | fac36e2248fd |
children | aa4b95abef11 |
files | baseline_generator.xml script.py script_imgt.py script_xlsx.py wrapper.sh |
diffstat | 4 files changed, 134 insertions(+), 60 deletions(-) [+] |
line wrap: on
line diff
--- a/baseline_generator.xml Wed Jul 23 06:00:58 2014 -0400 +++ b/baseline_generator.xml Wed Jul 23 07:51:10 2014 -0400 @@ -1,7 +1,7 @@ <tool id="baseline_fasta_generator" name="Baseline generator" version="1.0"> <description>Generate baseline fasta file</description> - <command interpreter="python"> - script.py --input $in_file --ref $reference --output $out_file + <command interpreter="bash"> + wrapper.sh $in_file $reference $out_file </command> <inputs> <param name="in_file" type="data" label="Input excel file" />
--- a/script.py Wed Jul 23 06:00:58 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -import xlrd -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") -parser.add_argument("--ref", help="Reference file") -parser.add_argument("--output", help="Output file") - -args = parser.parse_args() - -gene_column = 6 -id_column = 7 -seq_column = 8 -LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] - - -refdic = dict() -with open(args.ref, 'r') as ref: - currentSeq = "" - currentId = "" - for line in ref.readlines(): - if line[0] is ">": - if currentSeq is not "" and currentId is not "": - refdic[currentId[1:]] = currentSeq - currentId = line.rstrip() - currentSeq = "" - else: - currentSeq += line.rstrip() - refdic[currentId[1:]] = currentSeq - -currentSeq = "" -currentId = "" -with xlrd.open_workbook(args.input, 'r') as wb: - with open(args.output, 'w') as o: - for sheet in wb.sheets(): - if sheet.cell(1,gene_column).value.find("IGHV") < 0: - print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name - continue - o.write(">>>" + sheet.name + "\n") - outputdic = dict() - for rowindex in range(1, sheet.nrows): - ref = sheet.cell(rowindex, gene_column).value[2:] - if ref in outputdic: - outputdic[ref] += [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)] - else: - outputdic[ref] = [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)] - #print outputdic - - for k in outputdic.keys(): - if k in refdic: - o.write(">>" + k + "\n") - o.write(refdic[k] + "\n") - for seq in outputdic[k]: - #print seq - o.write(">" + seq[0] + "\n") - o.write(seq[1] + "\n") - else: - print k + " not in reference, skipping " + k \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script_imgt.py Wed Jul 23 07:51:10 2014 -0400 @@ -0,0 +1,74 @@ +import xlrd +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + + +vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, +# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", +# r"(IGKV[0-3]D?-[0-9]{1,2})", +# r"(IGLV[0-9]-[0-9]{1,2})", +# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", +# r"(TRGV[234589])", +# r"(TRDV[1-3])"] + +#vPattern = re.compile(r"|".join(vPattern)) +vPattern = re.compile("|".join(vPattern)) + +def filterGene(s, pattern): + if type(s) is not str: + return None + res = pattern.search(s) + if res: + return res.group(0) + return None + + + +currentSeq = "" +currentId = "" +with open(args.input, 'r') as i: + with open(args.output, 'w') as o: + o.write(">>>IMGT\n") + outputdic = dict() + for line in i.readlines()[1:]: + linesplt = line.split("\t") + ref = filterGene(linesplt[1], vPattern) + if not ref: + continue + if ref in outputdic: + outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + else: + outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script_xlsx.py Wed Jul 23 07:51:10 2014 -0400 @@ -0,0 +1,58 @@ +import xlrd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +gene_column = 6 +id_column = 7 +seq_column = 8 +LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] + + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + +currentSeq = "" +currentId = "" +with xlrd.open_workbook(args.input, 'r') as wb: + with open(args.output, 'w') as o: + for sheet in wb.sheets(): + if sheet.cell(1,gene_column).value.find("IGHV") < 0: + print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name + continue + o.write(">>>" + sheet.name + "\n") + outputdic = dict() + for rowindex in range(1, sheet.nrows): + ref = sheet.cell(rowindex, gene_column).value.replace(">", "") + if ref in outputdic: + outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + else: + outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k