Mercurial > repos > mbernt > longorf
comparison getLongestORF.py @ 0:c0f423210af0 draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 82ae2fa7e7a4a51f7583c6a95bdafc5f843c7c3b
| author | mbernt |
|---|---|
| date | Mon, 07 Aug 2023 13:52:15 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c0f423210af0 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 #example: | |
| 4 #>STRG.1.1(-)_1 [10 - 69] | |
| 5 #GGNHHTLGGKKTFSYTHPPC | |
| 6 #>STRG.1.1(-)_2 [3 - 80] | |
| 7 #FLRGEPPHIGGKKDIFLHPPTLLKGR | |
| 8 | |
| 9 #output1: fasta file with all longest ORFs per transcript | |
| 10 #output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs | |
| 11 | |
| 12 import sys,re; | |
| 13 | |
| 14 def findlongestOrf(transcriptDict,old_seqID): | |
| 15 #write for previous seqID | |
| 16 prevTranscript = transcriptDict[old_seqID]; | |
| 17 i_max = 0; | |
| 18 transcript = old_seqID.split("(")[0] | |
| 19 | |
| 20 #find longest orf in transcript | |
| 21 for i in range(0,len(prevTranscript)): | |
| 22 if(prevTranscript[i][2] >= prevTranscript[i_max][2]): | |
| 23 i_max = i; | |
| 24 | |
| 25 for i in range(0,len(prevTranscript)): | |
| 26 prevORFstart = prevTranscript[i][0]; | |
| 27 prevORFend = prevTranscript[i][1]; | |
| 28 prevORFlength = prevTranscript[i][2]; | |
| 29 header = prevTranscript[i][3]; | |
| 30 strand = re.search('\(([+-]+)\)',header).group(1); | |
| 31 | |
| 32 output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand); | |
| 33 if (prevORFend - prevORFstart > 0): | |
| 34 output+="\tnormal"; | |
| 35 else: | |
| 36 output+="\treverse_sense"; | |
| 37 if(i == i_max): | |
| 38 output += "\ty\n"; | |
| 39 else: | |
| 40 output += "\tn\n"; | |
| 41 | |
| 42 OUTPUT_ORF_SUMMARY.write(output); | |
| 43 | |
| 44 transcriptDict.pop(old_seqID, None); | |
| 45 return None; | |
| 46 | |
| 47 #----------------------------------------------------------------------------------------------------- | |
| 48 | |
| 49 INPUT = open(sys.argv[1],"r"); | |
| 50 OUTPUT_FASTA = open(sys.argv[2],"w"); | |
| 51 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w"); | |
| 52 | |
| 53 seqID = ""; | |
| 54 old_seqID = ""; | |
| 55 lengthDict = {}; | |
| 56 seqDict = {}; | |
| 57 headerDict = {}; | |
| 58 transcriptDict = {}; | |
| 59 | |
| 60 skip = False; | |
| 61 | |
| 62 OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"); | |
| 63 | |
| 64 for line in INPUT: | |
| 65 line = line.strip(); | |
| 66 if(re.match(">",line)): #header | |
| 67 header = line.split(">")[1].split(" ")[0] | |
| 68 seqID = "_".join(line.split(">")[1].split("_")[:-1]) | |
| 69 ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1)); | |
| 70 ORFend = int (re.search('-\ (\d+)\]',line).group(1)); | |
| 71 length = abs(ORFend - ORFstart); | |
| 72 | |
| 73 if(seqID not in transcriptDict and old_seqID != ""): #new transcript | |
| 74 findlongestOrf(transcriptDict,old_seqID); | |
| 75 | |
| 76 if seqID not in transcriptDict: | |
| 77 transcriptDict[seqID] = []; | |
| 78 | |
| 79 transcriptDict[seqID].append([ORFstart,ORFend,length,header]); | |
| 80 | |
| 81 if(seqID not in lengthDict and old_seqID != ""): #new transcript | |
| 82 #write FASTA | |
| 83 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n"); | |
| 84 #delete old dict entry | |
| 85 headerDict.pop(old_seqID, None); | |
| 86 seqDict.pop(old_seqID, None); | |
| 87 lengthDict.pop(old_seqID, None); | |
| 88 #if several longest sequences exist with the same length, the dictionary saves the last occuring. | |
| 89 if(seqID not in lengthDict or length >= lengthDict[seqID]): | |
| 90 headerDict[seqID] = line; | |
| 91 lengthDict[seqID] = length; | |
| 92 seqDict[seqID] = ""; | |
| 93 skip = False; | |
| 94 else: | |
| 95 skip = True; | |
| 96 next; | |
| 97 old_seqID = seqID; | |
| 98 elif(skip): | |
| 99 next; | |
| 100 else: | |
| 101 seqDict[seqID] += line; | |
| 102 | |
| 103 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]); | |
| 104 findlongestOrf(transcriptDict,old_seqID); | |
| 105 | |
| 106 INPUT.close(); | |
| 107 OUTPUT_FASTA.close(); | |
| 108 OUTPUT_ORF_SUMMARY.close(); |
