Mercurial > repos > davidvanzessen > igblast_human
comparison igblast_post.py @ 63:7997fea1f16a draft default tip
Uploaded
| author | davidvanzessen |
|---|---|
| date | Thu, 06 Nov 2014 03:31:15 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 62:a700a552f031 | 63:7997fea1f16a |
|---|---|
| 1 import urllib2 | |
| 2 import urllib | |
| 3 import httplib | |
| 4 import re | |
| 5 import sys | |
| 6 import time | |
| 7 import httplib | |
| 8 | |
| 9 infile = "d:/wd/igblast_post/Stanford_S22.fasta" | |
| 10 infile = "d:/wd/igblast_post/Stanford_S22_Small.fasta" | |
| 11 infile = "d:/wd/igblast_post/Stanford_S22_Smallest.fasta" | |
| 12 outfile = "d:/wd/igblast_post/out.txt" | |
| 13 | |
| 14 #needed or else response.read() fails on large results | |
| 15 httplib.HTTPConnection._http_vsn = 10 | |
| 16 httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' | |
| 17 | |
| 18 formregex = re.compile("http:\/\/www\.ncbi\.nlm\.nih\.gov\/igblast\/igblast.cgi\?.*ctg_time=(\d+)&job_key=(.{28})") | |
| 19 | |
| 20 seq = "" | |
| 21 seq_dic = dict() | |
| 22 | |
| 23 current_id = "" | |
| 24 current_seq = "" | |
| 25 | |
| 26 with open(infile) as i: | |
| 27 for line in i: | |
| 28 seq += line | |
| 29 if line.startswith(">"): | |
| 30 new_id = line.replace(">", "").rstrip() | |
| 31 if current_id != new_id: | |
| 32 if current_seq != "": | |
| 33 seq_dic[current_id] = current_seq | |
| 34 current_id = new_id | |
| 35 current_seq = "" | |
| 36 else: | |
| 37 current_seq += line.rstrip() | |
| 38 | |
| 39 seqcount = seq.count(">") | |
| 40 print "%i sequences" % (seqcount) | |
| 41 url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi" | |
| 42 headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' } | |
| 43 values = {"queryseq": seq, | |
| 44 "germline_db_V": "IG_DB/imgt.Homo_sapiens.V.f.orf.p", | |
| 45 "germline_db_D": "IG_DB/imgt.Homo_sapiens.D.f.orf", | |
| 46 "germline_db_J": "IG_DB/imgt.Homo_sapiens.J.f.orf", | |
| 47 "num_alignments_V": "1", | |
| 48 "num_alignments_D": "1", | |
| 49 "num_alignments_J": "1", | |
| 50 "outfmt": "7", | |
| 51 "domain": "imgt", | |
| 52 "program": "blastn"} | |
| 53 | |
| 54 ctg_time = "" | |
| 55 job_key = "" | |
| 56 check_url = "" | |
| 57 regions = ["FR1-IMGT", "CDR1-IMGT", "FR2-IMGT","CDR2-IMGT","FR3-IMGT","CDR3-IMGT","FR4-IMGT"] | |
| 58 region_loc = {region: i*len(regions) for i,region in enumerate(regions)} | |
| 59 VDJ_loc = {loci: i*11 for i,loci in enumerate("VDJ")} | |
| 60 data = urllib.urlencode(values) | |
| 61 req = urllib2.Request(url, data, headers) | |
| 62 response = urllib2.urlopen(req) | |
| 63 html = response.read() | |
| 64 re_match = formregex.search(html) | |
| 65 with open(outfile, 'w') as o: | |
| 66 if re_match: | |
| 67 ctg_time = re_match.group(1) | |
| 68 job_key = re_match.group(2) | |
| 69 o.write("NCBI igblast is processing...") | |
| 70 o.write("ctg_time: %s" % (ctg_time)) | |
| 71 o.write("job_key: %s" % (job_key)) | |
| 72 check_url = "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?ctg_time=%s&job_key=%s" % (ctg_time, job_key) | |
| 73 print "URL: %s" % check_url | |
| 74 o.write("URL: %s" % check_url) | |
| 75 total_time = 0 | |
| 76 while re_match: | |
| 77 time.sleep(10) | |
| 78 total_time += 10 | |
| 79 o.write("Waited %i seconds, checking progress..." % (total_time)) | |
| 80 req = urllib2.Request(check_url) | |
| 81 response = urllib2.urlopen(req) | |
| 82 html = response.read() | |
| 83 if html.find("Job failed.") != -1: | |
| 84 sys.exit("Job Failed") | |
| 85 if html.find("Job is canceled.") != -1: | |
| 86 sys.exit("Job canceled") | |
| 87 re_match = formregex.search(html) | |
| 88 else: | |
| 89 o.write("NCBI is done analysing, parsing result...") | |
| 90 else: | |
| 91 test = re.compile("Formatting Results: Job id = (.{28})") | |
| 92 s = test.search(html) | |
| 93 o.write("http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=" + s.group(1)) | |
| 94 print "http://www.ncbi.nlm.nih.gov/igblast/igblast.cgi?job_key=%s" % s.group(1) | |
| 95 | |
| 96 html = html.split("\n") | |
| 97 | |
| 98 queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1] | |
| 99 o.write("%i results" % len(queries)) | |
| 100 retry_count = 1 | |
| 101 while seqcount != len(queries) and retry_count <= 3: | |
| 102 #sys.stderr.write("Error, %i sequences in input and %i in result, happens sometimes, rerun?" % (seqcount, len(queries))) | |
| 103 o.write("Error, %i sequences in input and %i in result, requesting results again. (%i)" % (seqcount, len(queries), retry_count)) | |
| 104 req = urllib2.Request(check_url) | |
| 105 response = urllib2.urlopen(req) | |
| 106 html = response.read() | |
| 107 html = html.split("\n") | |
| 108 queries = [i for i,x in enumerate(html) if x.find("# Query:") != -1] | |
| 109 retry_count += 1 | |
| 110 if seqcount == len(queries): | |
| 111 o.write("Success, continuing with analysis") | |
| 112 | |
| 113 | |
| 114 | |
| 115 | |
| 116 | |
| 117 header = "ID,Top V Gene,Top D Gene,Top J Gene,Chain type,stop codon,VDJ Frame,Productive,Strand,V end,V-D junction,D region,D-J junction,J start".split(",") | |
| 118 alignments_summ_headers = "from,to,length,matches,mismatches,gaps,percent identity".split(",") | |
| 119 for region in regions: | |
| 120 header += ["%s %s" % (region, x) for x in alignments_summ_headers] | |
| 121 header += ["%s seq" % region for region in regions] | |
| 122 hit_table_headers = "% identity,alignment length,mismatches,gap opens,gaps,q. start,q. end,s.start,s. end,evalue,bit score".split(",") | |
| 123 for g in ["V","D","J"]: | |
| 124 header += ["%s %s" % (g, x) for x in hit_table_headers] | |
| 125 | |
| 126 with open(outfile, 'w') as o: | |
| 127 o.write("\t".join(header) + "\n") | |
| 128 for i in range(len(queries)-1): | |
| 129 frm = queries[i] | |
| 130 to = 0 | |
| 131 if len(queries) == i+1: | |
| 132 to = len(html) | |
| 133 else: | |
| 134 to = queries[i+1]-1 | |
| 135 IDLine = queries[i] | |
| 136 | |
| 137 ID = html[IDLine].replace("# Query: ", "") | |
| 138 | |
| 139 if html[IDLine + 3] == "# 0 hits found": | |
| 140 print "No match on", ID | |
| 141 continue | |
| 142 result_row = [ID] | |
| 143 currentLine = IDLine + 5 #V-(D)-J rearrangement summary | |
| 144 if html[currentLine-1].find("Top D gene") == -1: #sometimes ncbi leaves out the D gene, not even a N/A... | |
| 145 split_row = html[currentLine].split("\t") | |
| 146 result_row += [split_row[0]] | |
| 147 result_row += ["N/A"] | |
| 148 result_row += split_row[1:] | |
| 149 else: | |
| 150 result_row += html[currentLine].split("\t") | |
| 151 currentLine += 3 #V-(D)-J junction details | |
| 152 result_row += html[currentLine].split("\t")[:-1] #[:-1] because the igblast tabular output $#@* sucks | |
| 153 currentLine += 2 #Alignment summary | |
| 154 al_summ = ["N/A"] * (7 * len(regions)) | |
| 155 al_seqs = {region: "N/A" for region in regions} | |
| 156 if html[currentLine].startswith("# Alignment summary"): #Alignment summary might not be included, add N/A if not, otherwise parse it | |
| 157 for region in regions: | |
| 158 currentLine += 1 | |
| 159 if html[currentLine].startswith("# Hit table"): | |
| 160 break | |
| 161 splitrow = html[currentLine].split("\t") | |
| 162 current_region = splitrow[0].replace(" (germline)", "") | |
| 163 if current_region in regions: | |
| 164 al_summ[region_loc[current_region]:region_loc[current_region] + len(regions)] = splitrow[1:] | |
| 165 al_seqs[current_region] = seq_dic[ID][int(splitrow[1])-1:int(splitrow[2])-1] | |
| 166 result_row += al_summ | |
| 167 result_row += [al_seqs[region] for region in regions] | |
| 168 VDJ_info = ["N/A"] * (11 * len(VDJ_loc.keys())) | |
| 169 if html[currentLine].startswith("# Hit table"): | |
| 170 currentLine += 3 | |
| 171 while html[currentLine] != "": | |
| 172 splitrow = html[currentLine].split("\t") | |
| 173 loci = splitrow[0] | |
| 174 VDJ_info[VDJ_loc[loci]:VDJ_loc[loci] + 11] = splitrow[3:] | |
| 175 currentLine +=1 | |
| 176 result_row += VDJ_info | |
| 177 | |
| 178 o.write("\t".join(result_row) + "\n") | |
| 179 | |
| 180 | |
| 181 #print V, D, J, frame, strand | |
| 182 | |
| 183 | |
| 184 |
