comparison script_imgt.py @ 3:be17e1e7dc2b draft

Uploaded
author davidvanzessen
date Wed, 23 Jul 2014 07:51:10 -0400
parents
children aa4b95abef11
comparison
equal deleted inserted replaced
2:fac36e2248fd 3:be17e1e7dc2b
1 import xlrd
2 import argparse
3 import re
4
5 parser = argparse.ArgumentParser()
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
7 parser.add_argument("--ref", help="Reference file")
8 parser.add_argument("--output", help="Output file")
9
10 args = parser.parse_args()
11
12 refdic = dict()
13 with open(args.ref, 'r') as ref:
14 currentSeq = ""
15 currentId = ""
16 for line in ref.readlines():
17 if line[0] is ">":
18 if currentSeq is not "" and currentId is not "":
19 refdic[currentId[1:]] = currentSeq
20 currentId = line.rstrip()
21 currentSeq = ""
22 else:
23 currentSeq += line.rstrip()
24 refdic[currentId[1:]] = currentSeq
25
26
27 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
28 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
29 # r"(IGKV[0-3]D?-[0-9]{1,2})",
30 # r"(IGLV[0-9]-[0-9]{1,2})",
31 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
32 # r"(TRGV[234589])",
33 # r"(TRDV[1-3])"]
34
35 #vPattern = re.compile(r"|".join(vPattern))
36 vPattern = re.compile("|".join(vPattern))
37
38 def filterGene(s, pattern):
39 if type(s) is not str:
40 return None
41 res = pattern.search(s)
42 if res:
43 return res.group(0)
44 return None
45
46
47
48 currentSeq = ""
49 currentId = ""
50 with open(args.input, 'r') as i:
51 with open(args.output, 'w') as o:
52 o.write(">>>IMGT\n")
53 outputdic = dict()
54 for line in i.readlines()[1:]:
55 linesplt = line.split("\t")
56 ref = filterGene(linesplt[1], vPattern)
57 if not ref:
58 continue
59 if ref in outputdic:
60 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
61 else:
62 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
63 #print outputdic
64
65 for k in outputdic.keys():
66 if k in refdic:
67 o.write(">>" + k + "\n")
68 o.write(refdic[k] + "\n")
69 for seq in outputdic[k]:
70 #print seq
71 o.write(">" + seq[0] + "\n")
72 o.write(seq[1] + "\n")
73 else:
74 print k + " not in reference, skipping " + k