Mercurial > repos > davidvanzessen > baseline_fasta_generator
changeset 0:91221003f5ee draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 15 Jul 2014 08:51:22 -0400 |
parents | |
children | 85f18c2f74dd |
files | baseline_generator.xml script.py |
diffstat | 2 files changed, 76 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline_generator.xml Tue Jul 15 08:51:22 2014 -0400 @@ -0,0 +1,18 @@ +<tool id="baseline_fasta_generator" name="Baseline generator" version="1.0"> + <description>Generate baseline fasta file</description> + <command interpreter="python"> + script.py --input $in_file --ref $reference --output $out_file + </command> + <inputs> + <param name="in_file" type="data" label="Input fasta file" /> + <param name="reference" type="data" format="fasta" label="Reference fasta file" /> + </inputs> + <outputs> + <data format="fasta" name="out_file" label = "Baseline generator on $in_file.name with $reference.name"/> + </outputs> + <help> + Gur Yaari; Mohamed Uduman; Steven H. Kleinstein. Quantifying selection in high-throughput Immunoglobulin sequencing data sets. Nucleic Acids Res. 2012 May 27. + + Mohamed Uduman; Gur Yaari; Uri Hershberg; Mark J. Shlomchik; Steven H. Kleinstein. Detecting selection in immunoglobulin sequences. Nucleic Acids Res. 2011 Jul;39(Web Server issue):W499-504. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script.py Tue Jul 15 08:51:22 2014 -0400 @@ -0,0 +1,58 @@ +import xlrd +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") + +args = parser.parse_args() + +gene_column = 6 +id_column = 7 +seq_column = 8 +LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"] + + +refdic = dict() +with open(args.ref, 'r') as ref: + currentSeq = "" + currentId = "" + for line in ref.readlines(): + if line[0] is ">": + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + +currentSeq = "" +currentId = "" +with xlrd.open_workbook(args.input, 'r') as wb: + with open(args.output, 'w') as o: + for sheet in wb.sheets(): + if sheet.cell(1,gene_column).value.find("IGHV") < 0: + print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name + continue + o.write(">>>" + sheet.name + "\n") + outputdic = dict() + for rowindex in range(1, sheet.nrows): + ref = sheet.cell(rowindex, gene_column).value[2:] + if ref in outputdic: + outputdic[ref] += [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)] + else: + outputdic[ref] = [(sheet.cell(rowindex, id_column).value[1:], sheet.cell(rowindex, seq_column).value)] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k \ No newline at end of file