Mercurial > repos > mbernt > longorf

diff getLongestORF.py @ 0:c0f423210af0 draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 82ae2fa7e7a4a51f7583c6a95bdafc5f843c7c3b
author: mbernt
date: Mon, 07 Aug 2023 13:52:15 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getLongestORF.py	Mon Aug 07 13:52:15 2023 +0000
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+#example:
+#>STRG.1.1(-)_1 [10 - 69]
+#GGNHHTLGGKKTFSYTHPPC
+#>STRG.1.1(-)_2 [3 - 80]
+#FLRGEPPHIGGKKDIFLHPPTLLKGR
+
+#output1: fasta file with all longest ORFs per transcript
+#output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs
+
+import sys,re;
+
+def findlongestOrf(transcriptDict,old_seqID):
+	#write for previous seqID
+	prevTranscript = transcriptDict[old_seqID];
+	i_max = 0;
+	transcript = old_seqID.split("(")[0]
+
+	#find longest orf in transcript
+	for i in range(0,len(prevTranscript)):
+		if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+			i_max = i;
+
+	for i in range(0,len(prevTranscript)):
+		prevORFstart = prevTranscript[i][0];
+		prevORFend = prevTranscript[i][1];
+		prevORFlength = prevTranscript[i][2];
+		header = prevTranscript[i][3];
+		strand = re.search('\(([+-]+)\)',header).group(1);
+		
+		output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand);
+		if (prevORFend - prevORFstart > 0):
+			output+="\tnormal";
+		else:
+			output+="\treverse_sense";
+		if(i == i_max):
+			output += "\ty\n";
+		else:
+			output += "\tn\n";
+
+		OUTPUT_ORF_SUMMARY.write(output);
+
+	transcriptDict.pop(old_seqID, None);
+	return None;
+
+#-----------------------------------------------------------------------------------------------------
+
+INPUT = open(sys.argv[1],"r");
+OUTPUT_FASTA = open(sys.argv[2],"w");
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w");
+
+seqID = "";
+old_seqID = "";
+lengthDict = {};
+seqDict = {};
+headerDict = {};
+transcriptDict = {};
+
+skip = False;
+
+OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n");
+
+for line in INPUT:
+	line = line.strip();
+	if(re.match(">",line)): #header
+		header = line.split(">")[1].split(" ")[0]
+		seqID = "_".join(line.split(">")[1].split("_")[:-1])
+		ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1));
+		ORFend = int (re.search('-\ (\d+)\]',line).group(1));
+		length = abs(ORFend - ORFstart);
+
+		if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+			findlongestOrf(transcriptDict,old_seqID);
+			
+		if seqID not in transcriptDict:
+			transcriptDict[seqID] = [];
+
+		transcriptDict[seqID].append([ORFstart,ORFend,length,header]);
+
+		if(seqID not in lengthDict and old_seqID != ""): #new transcript
+			#write FASTA
+			OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n");
+			#delete old dict entry
+			headerDict.pop(old_seqID, None);
+			seqDict.pop(old_seqID, None);
+			lengthDict.pop(old_seqID, None);
+		#if several longest sequences exist with the same length, the dictionary saves the last occuring.
+		if(seqID not in lengthDict or length >= lengthDict[seqID]):
+			headerDict[seqID] = line;
+			lengthDict[seqID] = length;
+			seqDict[seqID] = "";
+			skip = False;
+		else:
+			skip = True;
+			next;
+		old_seqID = seqID;
+	elif(skip):
+		next;
+	else:
+		seqDict[seqID] += line;
+
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]);
+findlongestOrf(transcriptDict,old_seqID);
+
+INPUT.close();
+OUTPUT_FASTA.close();
+OUTPUT_ORF_SUMMARY.close();
\ No newline at end of file
author	mbernt
date	Mon, 07 Aug 2023 13:52:15 +0000
parents
children