Mercurial > repos > rhpvorderman > shm_csr
annotate baseline/script_xlsx.py @ 6:ea9d5fc4c001 draft default tip
"planemo upload commit 9ada186a78831ca2618ec817a23a77de6adf1a5d"
| author | rhpvorderman |
|---|---|
| date | Wed, 22 Dec 2021 11:29:16 +0000 |
| parents | 64d74ba01a7c |
| children |
| rev | line source |
|---|---|
|
0
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
1 import xlrd |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
2 import argparse |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
3 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
# Command-line interface. All three options are optional as far as argparse
# is concerned (none is marked required), so a missing one arrives as None.
parser = argparse.ArgumentParser()
for option, description in (
    ("--input", "Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence"),
    ("--ref", "Reference file"),
    ("--output", "Output file"),
):
    parser.add_argument(option, help=description)

# Parsed once at import time; the rest of the script reads args.input,
# args.ref and args.output.
args = parser.parse_args()
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
10 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
# Zero-based column indexes used when reading cells from each sheet.
# NOTE(review): the --input help text says the sequence is in column J, but
# index 8 corresponds to column I (LETTERS[8]) -- confirm which one is right.
gene_column, id_column, seq_column = 6, 7, 8

# Spreadsheet-style column letters, indexed by zero-based column number
# (LETTERS[0] == "A"); used to name a column in messages.
LETTERS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
15 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
16 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
def _read_reference(path):
    """Parse the FASTA-style file at *path* into a ``{id: sequence}`` dict.

    A line starting with ">" begins a new record; the record id is the rest
    of that line with trailing whitespace removed. Every following line up
    to the next ">" line is stripped of trailing whitespace and appended to
    the record's sequence. Lines that appear before the first ">" line are
    ignored.

    A record followed by another header is only stored when its sequence is
    non-empty. The last record of the file is stored even when its sequence
    is empty (this mirrors the historical behaviour of the script). A file
    without any ">" line yields an empty dict.
    """
    sequences = dict()
    current_id = ""  # full header line, including the leading ">"
    parts = []       # sequence fragments collected for current_id
    with open(path, 'r') as handle:
        # Iterate the file lazily instead of loading it with readlines().
        for line in handle:
            if line.startswith(">"):
                sequence = "".join(parts)
                # Compare by value/truthiness; identity checks against
                # string literals only work by accident of interning.
                if sequence and current_id:
                    sequences[current_id[1:]] = sequence
                current_id = line.rstrip()
                parts = []
            else:
                parts.append(line.rstrip())
    # Flush the final record, but never create an entry keyed on "" for a
    # file that contained no header at all.
    if current_id:
        sequences[current_id[1:]] = "".join(parts)
    return sequences


# Reference sequences keyed by the header text that follows ">".
refdic = _read_reference(args.ref)
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
30 |
|
64d74ba01a7c
"planemo upload commit 78d1fae87dbcf490e49a9f99e7a06de7328e16d4"
rhpvorderman
parents:
diff
changeset
|
# Convert every usable sheet of the input workbook into grouped text output:
#   >>>sheet name
#   >>gene            (only genes present in refdic)
#   reference sequence
#   >sequence id
#   sequence
#
# open_workbook() is called with the filename only: xlrd has no file-mode
# argument, its second positional parameter is `logfile`, so passing 'r'
# there would break as soon as xlrd tried to write a message to it.
# The output file is opened in append mode, so repeated runs add to it.
with xlrd.open_workbook(args.input) as wb, open(args.output, 'a') as o:
    last_column = max(gene_column, id_column, seq_column)
    for sheet in wb.sheets():
        # Row index 1 and all three data columns must exist, otherwise
        # sheet.cell() below would raise IndexError.
        if sheet.nrows < 2 or sheet.ncols <= last_column:
            print("Sheet " + sheet.name + " has too few rows or columns, skipping sheet " + sheet.name)
            continue
        # The gene cell of the row at index 1 decides whether the sheet has
        # the expected layout; str() keeps a non-text cell from raising.
        if "IGHV" not in str(sheet.cell(1, gene_column).value):
            print("Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name)
            continue
        o.write(">>>" + sheet.name + "\n")

        # Group (sequence id, sequence) pairs by gene name, with any ">"
        # characters removed from the gene name and the id.
        outputdic = dict()
        for rowindex in range(1, sheet.nrows):
            gene = sheet.cell(rowindex, gene_column).value.replace(">", "")
            record = (
                sheet.cell(rowindex, id_column).value.replace(">", ""),
                sheet.cell(rowindex, seq_column).value,
            )
            outputdic.setdefault(gene, []).append(record)

        for gene, records in outputdic.items():
            if gene not in refdic:
                print(gene + " not in reference, skipping " + gene)
                continue
            o.write(">>" + gene + "\n")
            o.write(refdic[gene] + "\n")
            for seq_id, sequence in records:
                o.write(">" + seq_id + "\n")
                o.write(sequence + "\n")
