Mercurial > repos > davidvanzessen > baseline_fasta_generator
comparison script_imgt.py @ 3:be17e1e7dc2b draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 23 Jul 2014 07:51:10 -0400 |
parents | |
children | aa4b95abef11 |
comparison
equal
deleted
inserted
replaced
2:fac36e2248fd | 3:be17e1e7dc2b |
---|---|
1 import xlrd | |
2 import argparse | |
3 import re | |
4 | |
5 parser = argparse.ArgumentParser() | |
6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") | |
7 parser.add_argument("--ref", help="Reference file") | |
8 parser.add_argument("--output", help="Output file") | |
9 | |
10 args = parser.parse_args() | |
11 | |
12 refdic = dict() | |
13 with open(args.ref, 'r') as ref: | |
14 currentSeq = "" | |
15 currentId = "" | |
16 for line in ref.readlines(): | |
17 if line[0] is ">": | |
18 if currentSeq is not "" and currentId is not "": | |
19 refdic[currentId[1:]] = currentSeq | |
20 currentId = line.rstrip() | |
21 currentSeq = "" | |
22 else: | |
23 currentSeq += line.rstrip() | |
24 refdic[currentId[1:]] = currentSeq | |
25 | |
26 | |
27 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, | |
28 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", | |
29 # r"(IGKV[0-3]D?-[0-9]{1,2})", | |
30 # r"(IGLV[0-9]-[0-9]{1,2})", | |
31 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", | |
32 # r"(TRGV[234589])", | |
33 # r"(TRDV[1-3])"] | |
34 | |
35 #vPattern = re.compile(r"|".join(vPattern)) | |
36 vPattern = re.compile("|".join(vPattern)) | |
37 | |
38 def filterGene(s, pattern): | |
39 if type(s) is not str: | |
40 return None | |
41 res = pattern.search(s) | |
42 if res: | |
43 return res.group(0) | |
44 return None | |
45 | |
46 | |
47 | |
48 currentSeq = "" | |
49 currentId = "" | |
50 with open(args.input, 'r') as i: | |
51 with open(args.output, 'w') as o: | |
52 o.write(">>>IMGT\n") | |
53 outputdic = dict() | |
54 for line in i.readlines()[1:]: | |
55 linesplt = line.split("\t") | |
56 ref = filterGene(linesplt[1], vPattern) | |
57 if not ref: | |
58 continue | |
59 if ref in outputdic: | |
60 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
61 else: | |
62 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] | |
63 #print outputdic | |
64 | |
65 for k in outputdic.keys(): | |
66 if k in refdic: | |
67 o.write(">>" + k + "\n") | |
68 o.write(refdic[k] + "\n") | |
69 for seq in outputdic[k]: | |
70 #print seq | |
71 o.write(">" + seq[0] + "\n") | |
72 o.write(seq[1] + "\n") | |
73 else: | |
74 print k + " not in reference, skipping " + k |