Mercurial > repos > rhpvorderman > shm_csr
comparison baseline/script_imgt.py @ 0:64d74ba01a7c draft
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
| author | rhpvorderman |
|---|---|
| date | Wed, 27 Oct 2021 12:34:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:64d74ba01a7c |
|---|---|
| 1 #import xlrd #avoid dep | |
| 2 import argparse | |
| 3 import re | |
| 4 | |
| 5 parser = argparse.ArgumentParser() | |
| 6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") | |
| 7 parser.add_argument("--ref", help="Reference file") | |
| 8 parser.add_argument("--output", help="Output file") | |
| 9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") | |
| 10 | |
| 11 args = parser.parse_args() | |
| 12 | |
| 13 print("script_imgt.py") | |
| 14 print("input:", args.input) | |
| 15 print("ref:", args.ref) | |
| 16 print("output:", args.output) | |
| 17 print("id:", args.id) | |
| 18 | |
| 19 refdic = dict() | |
| 20 with open(args.ref, 'rU') as ref: | |
| 21 currentSeq = "" | |
| 22 currentId = "" | |
| 23 for line in ref: | |
| 24 if line.startswith(">"): | |
| 25 if currentSeq is not "" and currentId is not "": | |
| 26 refdic[currentId[1:]] = currentSeq | |
| 27 currentId = line.rstrip() | |
| 28 currentSeq = "" | |
| 29 else: | |
| 30 currentSeq += line.rstrip() | |
| 31 refdic[currentId[1:]] = currentSeq | |
| 32 | |
| 33 print("Have", str(len(refdic)), "reference sequences") | |
| 34 | |
| 35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, | |
| 36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", | |
| 37 # r"(IGKV[0-3]D?-[0-9]{1,2})", | |
| 38 # r"(IGLV[0-9]-[0-9]{1,2})", | |
| 39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", | |
| 40 # r"(TRGV[234589])", | |
| 41 # r"(TRDV[1-3])"] | |
| 42 | |
| 43 #vPattern = re.compile(r"|".join(vPattern)) | |
| 44 vPattern = re.compile("|".join(vPattern)) | |
| 45 | |
| 46 def filterGene(s, pattern): | |
| 47 if type(s) is not str: | |
| 48 return None | |
| 49 res = pattern.search(s) | |
| 50 if res: | |
| 51 return res.group(0) | |
| 52 return None | |
| 53 | |
| 54 | |
| 55 | |
| 56 currentSeq = "" | |
| 57 currentId = "" | |
| 58 first=True | |
| 59 with open(args.input, 'r') as i: | |
| 60 with open(args.output, 'a') as o: | |
| 61 o.write(">>>" + args.id + "\n") | |
| 62 outputdic = dict() | |
| 63 for line in i: | |
| 64 if first: | |
| 65 first = False | |
| 66 continue | |
| 67 linesplt = line.split("\t") | |
| 68 ref = filterGene(linesplt[1], vPattern) | |
| 69 if not ref or not linesplt[2].rstrip(): | |
| 70 continue | |
| 71 if ref in outputdic: | |
| 72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
| 73 else: | |
| 74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
| 75 #print outputdic | |
| 76 | |
| 77 for k in list(outputdic.keys()): | |
| 78 if k in refdic: | |
| 79 o.write(">>" + k + "\n") | |
| 80 o.write(refdic[k] + "\n") | |
| 81 for seq in outputdic[k]: | |
| 82 #print seq | |
| 83 o.write(">" + seq[0] + "\n") | |
| 84 o.write(seq[1] + "\n") | |
| 85 else: | |
| 86 print(k + " not in reference, skipping " + k) |
