annotate baseline/script_imgt.py @ 4:c8f02bce10d0 draft

"planemo upload commit 05326cc2233eb0fa8beaf4198bd7c1519f2a0bf9"
author rhpvorderman
date Mon, 15 Nov 2021 14:04:10 +0000
parents 64d74ba01a7c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
1 #import xlrd #avoid dep
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
2 import argparse
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
3 import re
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
4
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
5 parser = argparse.ArgumentParser()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
7 parser.add_argument("--ref", help="Reference file")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
8 parser.add_argument("--output", help="Output file")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
10
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
11 args = parser.parse_args()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
12
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
13 print("script_imgt.py")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
14 print("input:", args.input)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
15 print("ref:", args.ref)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
16 print("output:", args.output)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
17 print("id:", args.id)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
18
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
19 refdic = dict()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
20 with open(args.ref, 'rU') as ref:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
21 currentSeq = ""
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
22 currentId = ""
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
23 for line in ref:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
24 if line.startswith(">"):
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
25 if currentSeq is not "" and currentId is not "":
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
26 refdic[currentId[1:]] = currentSeq
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
27 currentId = line.rstrip()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
28 currentSeq = ""
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
29 else:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
30 currentSeq += line.rstrip()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
31 refdic[currentId[1:]] = currentSeq
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
32
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
33 print("Have", str(len(refdic)), "reference sequences")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
34
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
37 # r"(IGKV[0-3]D?-[0-9]{1,2})",
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
38 # r"(IGLV[0-9]-[0-9]{1,2})",
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
40 # r"(TRGV[234589])",
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
41 # r"(TRDV[1-3])"]
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
42
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
43 #vPattern = re.compile(r"|".join(vPattern))
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
44 vPattern = re.compile("|".join(vPattern))
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
45
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
46 def filterGene(s, pattern):
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
47 if type(s) is not str:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
48 return None
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
49 res = pattern.search(s)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
50 if res:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
51 return res.group(0)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
52 return None
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
53
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
54
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
55
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
56 currentSeq = ""
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
57 currentId = ""
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
58 first=True
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
59 with open(args.input, 'r') as i:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
60 with open(args.output, 'a') as o:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
61 o.write(">>>" + args.id + "\n")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
62 outputdic = dict()
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
63 for line in i:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
64 if first:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
65 first = False
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
66 continue
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
67 linesplt = line.split("\t")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
68 ref = filterGene(linesplt[1], vPattern)
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
69 if not ref or not linesplt[2].rstrip():
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
70 continue
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
71 if ref in outputdic:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
73 else:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
75 #print outputdic
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
76
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
77 for k in list(outputdic.keys()):
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
78 if k in refdic:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
79 o.write(">>" + k + "\n")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
80 o.write(refdic[k] + "\n")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
81 for seq in outputdic[k]:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
82 #print seq
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
83 o.write(">" + seq[0] + "\n")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
84 o.write(seq[1] + "\n")
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
85 else:
64d74ba01a7c "planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff changeset
86 print(k + " not in reference, skipping " + k)