diff OTUtable_addblast.py @ 1:4afa63644ac3 draft default tip

Uploaded
author saskia-hiltemann
date Mon, 09 Nov 2015 09:50:15 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/OTUtable_addblast.py	Mon Nov 09 09:50:15 2015 -0500
@@ -0,0 +1,71 @@
+import requests
+import time
+import sys
+
+baseurl="http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"
+
+OTUfile=sys.argv[0]
+BLASTfile=sys.argv[1]
+fastafile=sys.argv[2]
+
+    
+def make_url(seq):
+    return baseurl+"?DATABASE=nr&PERC_IDENT=97&EXCLUDE_SEQ_UNCULT=on&HITLIST_SIZE=10&FILTER=L&FILTER=m&FILTER=R&EXPECT=10&FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY="+seq.lower()
+
+def make_RIDlink(RID):
+    return "<a target=\"_blank\" href=\""+baseurl+"?CMD=Get&RID="+RID+"\">view results</a>"
+
+def make_rerun_link(seq):
+    return "<a target=\"_blank\" href=\""+baseurl+"?DATABASE=nr&HITLIST_SIZE=10&EXCLUDE_SEQ_UNCULT=true&FILTER=L&EXPECT=10&FORMAT_TYPE=HTML&PROGRAM=blastn&CLIENT=web&SERVICE=megablast&PAGE=Nucleotides&CMD=Put&QUERY="+seq.lower()+"\">resubmit query</a>"
+    
+
+    
+### for each fasta sequence create blast search
+sequences = [line.rstrip('\n').replace('-','') for line in open(fastafile) if '>' not in line]
+urls = [make_url(seq) for seq in sequences]
+
+RIDs = []
+for url in urls:
+    r=requests.get(url)    
+    RID = r.text[r.text.find("RID"):r.text.find("RTOE")]
+    RID = RID[6:-3].lstrip().rstrip()
+    RIDs.append(RID)
+    print "Submitted request, RID: "+ RID
+    time.sleep(3) # be nice to the server
+
+
+
+### Get top hits from local BLAST results file, add to OTUtable file
+blastf = open(BLASTfile, "r")
+otuf = open(OTUfile, "r")
+outfile = open("newtable.tsv","w+")
+
+linenum=0    
+for line in otuf:
+   if linenum == 0:
+       outfile.write( line.rstrip()+"\tBLAST Top Hit\n" )
+   else: 
+       outfile.write( line.rstrip() +"\t"+ blastf.readline().strip().split("\t")[-1]+"\n" )
+   linenum +=1
+
+blastf.close()
+otuf.close()
+outfile.close()
+
+
+
+### Add RID link and rerun link to table
+otuf = open("newtable.tsv","r")
+outfile = open("newtable2.tsv","w+")
+
+print len(sequences)
+print len (RIDs)
+linenum=-1    
+for line in otuf:
+   if linenum == -1:
+       outfile.write( line.rstrip()+"\tBLAST result\tBLAST resubmit\n" )
+   else: 
+       outfile.write( line.rstrip() +"\t"+ make_RIDlink(RIDs[linenum]) + "\t" + make_rerun_link(sequences[linenum])+"\n" )
+   linenum +=1
+
+