Mercurial > repos > rhpvorderman > shm_csr
annotate baseline/script_imgt.py @ 4:c8f02bce10d0 draft
"planemo upload commit 05326cc2233eb0fa8beaf4198bd7c1519f2a0bf9"
| author | rhpvorderman |
|---|---|
| date | Mon, 15 Nov 2021 14:04:10 +0000 |
| parents | 64d74ba01a7c |
| children |
| rev | line source |
|---|---|
|
0
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
1 #import xlrd #avoid dep |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
2 import argparse |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
3 import re |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
4 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
5 parser = argparse.ArgumentParser() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
7 parser.add_argument("--ref", help="Reference file") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
8 parser.add_argument("--output", help="Output file") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
10 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
11 args = parser.parse_args() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
12 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
13 print("script_imgt.py") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
14 print("input:", args.input) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
15 print("ref:", args.ref) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
16 print("output:", args.output) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
17 print("id:", args.id) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
18 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
19 refdic = dict() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
20 with open(args.ref, 'rU') as ref: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
21 currentSeq = "" |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
22 currentId = "" |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
23 for line in ref: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
24 if line.startswith(">"): |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
25 if currentSeq is not "" and currentId is not "": |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
26 refdic[currentId[1:]] = currentSeq |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
27 currentId = line.rstrip() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
28 currentSeq = "" |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
29 else: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
30 currentSeq += line.rstrip() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
31 refdic[currentId[1:]] = currentSeq |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
32 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
33 print("Have", str(len(refdic)), "reference sequences") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
34 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
35 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
36 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
37 # r"(IGKV[0-3]D?-[0-9]{1,2})", |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
38 # r"(IGLV[0-9]-[0-9]{1,2})", |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
39 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
40 # r"(TRGV[234589])", |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
41 # r"(TRDV[1-3])"] |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
42 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
43 #vPattern = re.compile(r"|".join(vPattern)) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
44 vPattern = re.compile("|".join(vPattern)) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
45 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
46 def filterGene(s, pattern): |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
47 if type(s) is not str: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
48 return None |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
49 res = pattern.search(s) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
50 if res: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
51 return res.group(0) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
52 return None |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
53 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
54 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
55 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
56 currentSeq = "" |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
57 currentId = "" |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
58 first=True |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
59 with open(args.input, 'r') as i: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
60 with open(args.output, 'a') as o: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
61 o.write(">>>" + args.id + "\n") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
62 outputdic = dict() |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
63 for line in i: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
64 if first: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
65 first = False |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
66 continue |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
67 linesplt = line.split("\t") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
68 ref = filterGene(linesplt[1], vPattern) |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
69 if not ref or not linesplt[2].rstrip(): |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
70 continue |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
71 if ref in outputdic: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
72 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
73 else: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
74 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
75 #print outputdic |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
76 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
77 for k in list(outputdic.keys()): |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
78 if k in refdic: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
79 o.write(">>" + k + "\n") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
80 o.write(refdic[k] + "\n") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
81 for seq in outputdic[k]: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
82 #print seq |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
83 o.write(">" + seq[0] + "\n") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
84 o.write(seq[1] + "\n") |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
85 else: |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
86 print(k + " not in reference, skipping " + k) |
