annotate igblast_post.py @ 63:7997fea1f16a draft default tip

Uploaded
author davidvanzessen
date Thu, 06 Nov 2014 03:31:15 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
63
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
1 import urllib2
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
2 import urllib
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
3 import httplib
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
4 import re
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
5 import sys
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
6 import time
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
7 import httplib
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
8
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
9 infile = "d:/wd/igblast_post/Stanford_S22.fasta"
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
10 infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta"
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
11 infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta"
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
12 outfile = "d:/wd/igblast_post/out.txt"
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
13
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
14 #needed or else response.read() fails on large results
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
15 httplib.HTTPConnection._http_vsn = 10
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
16 httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
17
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
18 formregex = re.compile("http:\/\/www\.ncbi\.nlm\.nih\.gov\/igblast\/igblast.cgi\?.*ctg_time=(\d+)&job_key=(.{28})")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
19
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
20 seq = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
21 seq_dic = dict()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
22
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
23 current_id = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
24 current_seq = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
25
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
26 with open(infile) as i:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
27 for line in i:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
28 seq += line
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
29 if line.startswith(">"):
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
30 new_id = line.replace(">", "").rstrip()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
31 if current_id != new_id:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
32 if current_seq != "":
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
33 seq_dic[current_id] = current_seq
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
34 current_id = new_id
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
35 current_seq = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
36 else:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
37 current_seq += line.rstrip()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
38
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
39 seqcount = seq.count(">")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
40 print "%i sequences" % (seqcount)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
41 url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi"
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
42 headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
43 values = {"queryseq": seq,
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
44 "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
45 "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
46 "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
47 "num_alignments_V": "1",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
48 "num_alignments_D": "1",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
49 "num_alignments_J": "1",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
50 "outfmt": "7",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
51 "domain": "imgt",
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
52 "program": "blastn"}
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
53
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
54 ctg_time = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
55 job_key = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
56 check_url = ""
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
57 regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT","CDR2-IMGT","FR3-IMGT","CDR3-IMGT","FR4-IMGT"]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
58 region_loc = {region: i*len(regions) for i,region in enumerate(regions)}
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
59 VDJ_loc = {loci: i*11 for i,loci in enumerate("VDJ")}
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
60 data = urllib.urlencode(values)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
61 req = urllib2.Request(url, data, headers)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
62 response = urllib2.urlopen(req)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
63 html = response.read()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
64 re_match = formregex.search(html)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
65 with open(outfile, 'w') as o:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
66 if re_match:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
67 ctg_time = re_match.group(1)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
68 job_key = re_match.group(2)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
69 o.write("NCBI igblast is processing...")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
70 o.write("ctg_time: %s" % (ctg_time))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
71 o.write("job_key: %s" % (job_key))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
72 check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
73 print "URL: %s" % check_url
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
74 o.write("URL: %s" % check_url)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
75 total_time = 0
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
76 while re_match:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
77 time.sleep(10)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
78 total_time += 10
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
79 o.write("Waited %i seconds, checking progress..." % (total_time))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
80 req = urllib2.Request(check_url)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
81 response = urllib2.urlopen(req)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
82 html = response.read()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
83 if html.find("Job failed.") != -1:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
84 sys.exit("Job Failed")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
85 if html.find("Job is canceled.") != -1:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
86 sys.exit("Job canceled")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
87 re_match = formregex.search(html)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
88 else:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
89 o.write("NCBI is done analysing, parsing result...")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
90 else:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
91 test = re.compile("Formatting Results: Job id = (.{28})")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
92 s = test.search(html)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
93 o.write("http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
94 print "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=%s" % s.group(1)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
95
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
96 html = html.split("\n")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
97
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
98 queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
99 o.write("%i results" % len(queries))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
100 retry_count = 1
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
101 while seqcount != len(queries) and retry_count <= 3:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
102 #sys.stderr.write("Error, %i sequences in input and %i in result, happens sometimes, rerun?" % (seqcount, len(queries)))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
103 o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)" % (seqcount, len(queries), retry_count))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
104 req = urllib2.Request(check_url)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
105 response = urllib2.urlopen(req)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
106 html = response.read()
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
107 html = html.split("\n")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
108 queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
109 retry_count += 1
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
110 if seqcount == len(queries):
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
111 o.write("Success, continuing with analysis")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
112
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
113
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
114
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
115
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
116
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
117 header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
118 alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
119 for region in regions:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
120 header += ["%s %s" % (region, x) for x in alignments_summ_headers]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
121 header += ["%s seq" % region for region in regions]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
122 hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
123 for g in ["V","D","J"]:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
124 header += ["%s %s" % (g, x) for x in hit_table_headers]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
125
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
126 with open(outfile, 'w') as o:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
127 o.write("\t".join(header) + "\n")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
128 for i in range(len(queries)-1):
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
129 frm = queries[i]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
130 to = 0
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
131 if len(queries) == i+1:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
132 to = len(html)
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
133 else:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
134 to = queries[i+1]-1
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
135 IDLine = queries[i]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
136
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
137 ID = html[IDLine].replace("# Query: ", "")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
138
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
139 if html[IDLine + 3] == "# 0 hits found":
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
140 print "No match on", ID
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
141 continue
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
142 result_row = [ID]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
143 currentLine = IDLine + 5 #V-(D)-J rearrangement summary
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
144 if html[currentLine-1].find("Top D gene") == -1: #sometimes ncbi leaves out the D gene, not even a N/A...
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
145 split_row = html[currentLine].split("\t")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
146 result_row += [split_row[0]]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
147 result_row += ["N/A"]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
148 result_row += split_row[1:]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
149 else:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
150 result_row += html[currentLine].split("\t")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
151 currentLine += 3 #V-(D)-J junction details
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
152 result_row += html[currentLine].split("\t")[:-1] #[:-1] because the igblast tabular output $#@* sucks
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
153 currentLine += 2 #Alignment summary
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
154 al_summ = ["N/A"] * (7 * len(regions))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
155 al_seqs = {region: "N/A" for region in regions}
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
156 if html[currentLine].startswith("# Alignment summary"): #Alignment summary might not be included, add N/A if not, otherwise parse it
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
157 for region in regions:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
158 currentLine += 1
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
159 if html[currentLine].startswith("# Hit table"):
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
160 break
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
161 splitrow = html[currentLine].split("\t")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
162 current_region = splitrow[0].replace(" (germline)", "")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
163 if current_region in regions:
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
164 al_summ[region_loc[current_region]:region_loc[current_region] + len(regions)] = splitrow[1:]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
165 al_seqs[current_region] = seq_dic[ID][int(splitrow[1])-1:int(splitrow[2])-1]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
166 result_row += al_summ
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
167 result_row += [al_seqs[region] for region in regions]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
168 VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys()))
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
169 if html[currentLine].startswith("# Hit table"):
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
170 currentLine += 3
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
171 while html[currentLine] != "":
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
172 splitrow = html[currentLine].split("\t")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
173 loci = splitrow[0]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
174 VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:]
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
175 currentLine +=1
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
176 result_row += VDJ_info
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
177
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
178 o.write("\t".join(result_row) + "\n")
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
179
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
180
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
181 #print V, D, J, frame, strand
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
182
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
183
7997fea1f16a Uploaded
davidvanzessen
parents:
diff changeset
184