comparison script_imgt.py @ 2:94fada165724 draft

Uploaded
author davidvanzessen
date Wed, 13 Aug 2014 07:32:38 -0400
parents
children d8de51314d3f
comparison
equal deleted inserted replaced
1:e8dd8474aecb 2:94fada165724
1 import xlrd
2 import argparse
3 import re
4
5 parser = argparse.ArgumentParser()
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
7 parser.add_argument("--ref", help="Reference file")
8 parser.add_argument("--output", help="Output file")
9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
10
11 args = parser.parse_args()
12
13 refdic = dict()
14 with open(args.ref, 'r') as ref:
15 currentSeq = ""
16 currentId = ""
17 for line in ref.readlines():
18 if line[0] is ">":
19 if currentSeq is not "" and currentId is not "":
20 refdic[currentId[1:]] = currentSeq
21 currentId = line.rstrip()
22 currentSeq = ""
23 else:
24 currentSeq += line.rstrip()
25 refdic[currentId[1:]] = currentSeq
26
27
28 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
29 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
30 # r"(IGKV[0-3]D?-[0-9]{1,2})",
31 # r"(IGLV[0-9]-[0-9]{1,2})",
32 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
33 # r"(TRGV[234589])",
34 # r"(TRDV[1-3])"]
35
36 #vPattern = re.compile(r"|".join(vPattern))
37 vPattern = re.compile("|".join(vPattern))
38
39 def filterGene(s, pattern):
40 if type(s) is not str:
41 return None
42 res = pattern.search(s)
43 if res:
44 return res.group(0)
45 return None
46
47
48
49 currentSeq = ""
50 currentId = ""
51 with open(args.input, 'r') as i:
52 with open(args.output, 'a') as o:
53 o.write(">>>" + args.id + "\n")
54 outputdic = dict()
55 for line in i.readlines()[1:]:
56 linesplt = line.split("\t")
57 ref = filterGene(linesplt[1], vPattern)
58 if not ref or not linesplt[2].rstrip():
59 continue
60 if ref in outputdic:
61 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
62 else:
63 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
64 #print outputdic
65
66 for k in outputdic.keys():
67 if k in refdic:
68 o.write(">>" + k + "\n")
69 o.write(refdic[k] + "\n")
70 for seq in outputdic[k]:
71 #print seq
72 o.write(">" + seq[0] + "\n")
73 o.write(seq[1] + "\n")
74 else:
75 print k + " not in reference, skipping " + k