Mercurial > repos > mbernt > longorf
changeset 0:c0f423210af0 draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 82ae2fa7e7a4a51f7583c6a95bdafc5f843c7c3b
| author | mbernt |
|---|---|
| date | Mon, 07 Aug 2023 13:52:15 +0000 |
| parents | |
| children | |
| files | getLongestORF.py longORF.xml test-data/test_input.fasta test-data/test_output.fasta test-data/test_output.tab |
| diffstat | 5 files changed, 295 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""getLongestORF.py - report the longest ORF of every transcript.

Input is a protein FASTA whose headers look like::

    >STRG.1.1(-)_1 [10 - 69]
    GGNHHTLGGKKTFSYTHPPC
    >STRG.1.1(-)_2 [3 - 80]
    FLRGEPPHIGGKKDIFLHPPTLLKGR

The first header token is ``<transcript id>_<ORF number>``; the ORF
coordinates follow in square brackets.  All ORFs of one transcript are
expected to be listed consecutively.

Outputs:
    1. FASTA file with the longest ORF of each transcript.
    2. Tab-separated table listing seqID, transcript, start, end, length,
       strand, sense and a "longest" flag (y/n) for every ORF.

Usage: getLongestORF.py INPUT_FASTA OUTPUT_FASTA OUTPUT_TABLE
"""

import re
import sys

# Header row of the ORF summary table.
SUMMARY_HEADER = (
    "seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"
)

# Raw strings: the previous non-raw patterns relied on invalid escape
# sequences such as '\(' and '\d'.
_STRAND_RE = re.compile(r"\(([+-]+)\)")
_ORF_START_RE = re.compile(r" \[(\d+) -")
_ORF_END_RE = re.compile(r"- (\d+)\]")

# Legacy module-level handle.  findlongestOrf() falls back to it when no
# handle is passed, so code using the two-argument form must assign it first.
OUTPUT_ORF_SUMMARY = None


def findlongestOrf(transcriptDict, old_seqID, summary_out=None):
    """Write the summary rows of one transcript and drop it from the dict.

    Args:
        transcriptDict: maps a transcript id to a list of
            ``[start, end, length, header]`` entries, one per ORF.
        old_seqID: key of the transcript to report.
        summary_out: writable text handle for the table rows; defaults to
            the module-level ``OUTPUT_ORF_SUMMARY``.

    Returns:
        None.

    Raises:
        KeyError: if ``old_seqID`` is not in ``transcriptDict``.
        ValueError: if an ORF header carries no ``(+)``/``(-)`` strand tag.
    """
    if summary_out is None:
        summary_out = OUTPUT_ORF_SUMMARY

    orfs = transcriptDict[old_seqID]
    transcript = old_seqID.split("(")[0]

    # '>=' means that among equally long ORFs the last one listed wins,
    # which is the same tie-break used when choosing the FASTA record.
    longest = 0
    for idx, orf in enumerate(orfs):
        if orf[2] >= orfs[longest][2]:
            longest = idx

    for idx, (start, end, length, header) in enumerate(orfs):
        strand_match = _STRAND_RE.search(header)
        if strand_match is None:
            raise ValueError("no strand '(+)' or '(-)' in ORF id: " + header)
        row = [
            str(header),
            str(transcript),
            str(start),
            str(end),
            str(length),
            strand_match.group(1),
            # start == end is reported as reverse_sense, as before.
            "normal" if end - start > 0 else "reverse_sense",
            "y" if idx == longest else "n",
        ]
        summary_out.write("\t".join(row) + "\n")

    transcriptDict.pop(old_seqID, None)
    return None


def _parse_header(line):
    """Split a FASTA header line into (orf id, transcript id, start, end)."""
    header = line[1:].split(" ")[0]
    # Take the transcript id from the id token only; using the whole line
    # breaks as soon as the trailing description contains an underscore.
    seq_id = header.rpartition("_")[0]
    start = _ORF_START_RE.search(line)
    end = _ORF_END_RE.search(line)
    if start is None or end is None:
        raise ValueError("cannot parse ORF coordinates from header: " + line)
    return header, seq_id, int(start.group(1)), int(end.group(1))


def extract_longest_orfs(fasta_in, fasta_out, summary_out):
    """Stream ORF records and write the longest-ORF FASTA and the table.

    Args:
        fasta_in: iterable of input lines (an open text file).
        fasta_out: writable handle receiving one record per transcript.
        summary_out: writable handle receiving the summary table.

    Raises:
        ValueError: on headers without coordinates or strand tag.
    """
    summary_out.write(SUMMARY_HEADER)

    transcripts = {}
    current_id = None      # transcript being collected; None before first header
    best_header = None     # full header line of the longest ORF seen so far
    best_length = 0
    best_seq = []          # sequence lines of that ORF
    keep = False           # whether following sequence lines belong to best_seq

    for raw in fasta_in:
        line = raw.strip()
        if line.startswith(">"):
            header, seq_id, start, end = _parse_header(line)
            length = abs(end - start)

            if current_id is not None and seq_id != current_id:
                # A new transcript starts: flush the finished one.
                findlongestOrf(transcripts, current_id, summary_out)
                fasta_out.write(best_header + "\n" + "".join(best_seq) + "\n")
                best_header = None

            transcripts.setdefault(seq_id, []).append(
                [start, end, length, header]
            )

            if best_header is None or length >= best_length:
                best_header, best_length, best_seq = line, length, []
                keep = True
            else:
                keep = False
            current_id = seq_id
        elif keep:
            best_seq.append(line)

    if current_id is not None:
        # The final record is written without a trailing newline; this keeps
        # the output identical to what the script has always produced.
        fasta_out.write(best_header + "\n" + "".join(best_seq))
        findlongestOrf(transcripts, current_id, summary_out)


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(
            "usage: getLongestORF.py INPUT_FASTA OUTPUT_FASTA OUTPUT_TABLE\n"
        )
        return 2
    in_path, fasta_path, table_path = args[:3]
    with open(in_path, "r") as fasta_in, \
            open(fasta_path, "w") as fasta_out, \
            open(table_path, "w") as summary_out:
        extract_longest_orfs(fasta_in, fasta_out, summary_out)
    return 0


if __name__ == "__main__":
    sys.exit(main())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/longORF.xml Mon Aug 07 13:52:15 2023 +0000 @@ -0,0 +1,35 @@ +<tool id="longORF" name="Obtain longest ORFs" version="0.3.0"> + <description> in six-frame translations</description> + <command><![CDATA[ + python $__tool_directory__/getLongestORF.py $input $output_longestORF $output_ORFs + ]]> + </command> + <inputs> + <param name="input" format="fasta" type="data" label="sequences"/> + </inputs> + <outputs> + <data name="output_longestORF" format="fasta"/> + <data name="output_ORFs" format="tabular"/> + </outputs> + + <tests> + <test> + <param name="input" value="test_input.fasta"/> + <output name="output_longestORF" file="test_output.fasta"/> + <output name="output_ORFs" file="test_output.tab"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +This tool identifies the longest Open Reading Frames within the six-frame translations of a set of sequences. + +**Input** + +It takes an amino acid fasta file with all open reading frames (+ and - strand) listed by the corresponding transcript. The tool is designed to process the output of the Galaxy tool "getorf" from the EMBOSS package. + +**Output** + +For each transcript, the respective longest ORF is identified and listed in fasta format. Furthermore, a table with information about seqID, transcript, start, end, length, strand, sense, longest for all ORFs is given.]]> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input.fasta Mon Aug 07 13:52:15 2023 +0000 @@ -0,0 +1,100 @@ +>STRG.4.1(-)_1 [3 - 77] +PNHCLRGHESPETRQSPLSGKRIPS +>STRG.4.1(-)_2 [59 - 88] +WKEDPLIVCP +>STRG.4.1(-)_3 [92 - 127] +WNRERYFQGFGL +>STRG.4.1(-)_4 [131 - 268] +LPTQKQKDRWGTHTLERFGFTVPTMPAVISLFTETNPSSQITSTQD +>STRG.4.1(-)_5 [81 - 332] +SAPDGIGKDTSRGSDYNCQPRNKRTAGGLTPWRDLGSQCPQCQRLFHYLRRRIPPVRSLQ +LKTKFWRHPDHLGTHRLLGVPAEN +>STRG.4.1(-)_6 [272 - 379] +VLETPRPSGHAQATWGSCGELSSQMQMESAMMPSTW +>STRG.4.1(-)_7 [366 - 437] +CPLRGRGTGRSHGAAGHAVPPNRH +>STRG.4.1(-)_8 [465 - 518] +KRYGMCCVYLDKFVGGCG +>STRG.4.1(-)_9 [383 - 565] +GHWTKPWSCGACGTTQSTLRPRSSEPIKALRNVLCLPGQICWWLWLSWMGRNKKPWTWFP +W +>STRG.4.1(-)_10 [599 - 664] +SSEWWSWWTSASSPSTPVGRSS +>STRG.4.1(-)_11 [522 - 665] +AGWVGTRSLGPGSLGDQRGPGGALPDRRSGGRGGHRRHPHQLPWGEAA +>STRG.4.1(-)_12 [1 - 666] +DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGE +IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGER +HDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQ +EALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR +>STRG.4.1(-)_13 [665 - 594] (REVERSE SENSE) +RCFSPRELMGMTPMSTTTTTPTIR +>STRG.4.1(-)_14 [631 - 491] (REVERSE SENSE) +RRCPPRPPLRRSGSAPPGPRWSPREPGPRLLVPTHPAQPQPPTNLSR +>STRG.4.1(-)_15 [590 - 375] (REVERSE SENSE) +CSSRTTLVTKGTRSKASCSDPSSSTTTTNKFVQVNTAHSVTLLWALMTEVSMSIGWYRMP +RSSMASSSAPTT +>STRG.4.1(-)_16 [448 - 341] (REVERSE SENSE) +PRSQCRLGGTACPAAPWLRPVPLPRRGHHGALHLHL +>STRG.4.1(-)_17 [371 - 312] (REVERSE SENSE) +RASWRSPFASVSSVLRRNPK +>STRG.4.1(-)_18 [666 - 271] (REVERSE SENSE) +ALLLPTGVDGDDADVHHDHHSDDQVVLLQDHVGHQGNQVQGFLFRPIQLNHNHQQICPGK +HSTFRNAFMGSDDRGLNVDWVVPHAPQLHGFVQCPYHVEGIMALSICICELSSPQEPQVA +CACPDGLGVSKT +>STRG.4.1(-)_19 [337 - 260] (REVERSE SENSE) +AQFSAGTPSSLCVPRWSGCLQNLVLS +>STRG.4.1(-)_20 [308 - 255] (REVERSE SENSE) +PVRAQMVWVSPKLSLELK +>STRG.4.1(-)_21 [256 - 224] (REVERSE SENSE) +SDLTGGIRLRK +>STRG.4.1(-)_22 [246 - 169] (REVERSE SENSE) +LEGFVSVNSEITAGIVGTVNPNLSKV 
+>STRG.4.1(-)_23 [217 - 128] (REVERSE SENSE) +NNRWHCGHCEPKSLQGVSPPAVLLFLGWQL +>STRG.4.1(-)_24 [188 - 78] (REVERSE SENSE) +TQISPRCESPSGPFVSGLAIIIRTPGSIFPDSIRGRL +>STRG.4.1(-)_25 [165 - 58] (REVERSE SENSE) +VPQRSFCFWVGNYNPNPWKYLSRFHQGQTMRGSSFH +>STRG.4.1(-)_26 [74 - 18] (REVERSE SENSE) +GDPLSTKRTLSCLRALMST +>STRG.4.1(-)_27 [124 - 2] (REVERSE SENSE) +SEPLEVSFPIPSGADYEGILFPLSGLCRVSGLSCPRRQWLG +>STRG.4.1(-)_28 [54 - 1] (REVERSE SENSE) +ADSVVSQGSHVHVDSGWV +>STRG.6.1(-)_1 [1 - 63] +NWDASWRKDVSRSHQCLLPFH +>STRG.6.1(-)_2 [24 - 182] +RCLTQPPVPSAVPLSCSVNFTPLEKWPSAWTLTVDWDLSSGASAVCILGTSPS +>STRG.6.1(-)_3 [94 - 195] +RSGHLPGPLLWTGICPLVPLQCVFWAPVHPDPAL +>STRG.6.1(-)_4 [186 - 233] +SRPLSWKPTPPCGFLP +>STRG.6.1(-)_5 [2 - 250] +TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSI +LIPPSELETNPALWLLAVSQYKV +>STRG.6.1(-)_6 [199 - 252] +AGNQPRLVASCRESVQSP +>STRG.6.1(-)_7 [237 - 121] (REVERSE SENSE) +LTARSHKAGLVSSSEGGIRMDWCPEYTLQRHQRTNPSPQ +>STRG.6.1(-)_8 [251 - 93] (REVERSE SENSE) +GLCTDSRQEATRRGWFPAQRAGSGWTGAQNTHCRGTRGQIPVHSKGPGRWPLL +>STRG.6.1(-)_9 [117 - 85] (REVERSE SENSE) +GSRQMATSLEG +>STRG.6.1(-)_10 [81 - 34] (REVERSE SENSE) +SSHCSLMERQKALVAA +>STRG.6.1(-)_11 [250 - 14] (REVERSE SENSE) +DFVLTHGKKPQGGVGFQLRGRDQDGLVPRIHTAEAPEDKSQSTVRVQADGHFSRGVKFTL +QLNGTAEGTGGCVRHLYAS +>STRG.6.1(-)_12 [62 - 3] (REVERSE SENSE) +WNGRRHWWLRETSLRQLASQ +>STRG.6.1(-)_13 [30 - 1] (REVERSE SENSE) +DIFTPASIPV +>STRG.8.1(-)_1 [18 - 56] +RTSKKPNGRDPTV +>STRG.8.1(-)_2 [60 - 95] +RLAKAAVVCHRV +>STRG.8.1(-)_3 [99 - 137] +TSLQTAPRLVPTH +>STRG.8.1(-)_4 [2 - 205] +VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.fasta Mon Aug 07 13:52:15 2023 +0000 @@ -0,0 +1,6 @@ +>STRG.4.1(-)_12 [1 - 666] +DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGEIWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQEALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR +>STRG.6.1(-)_5 [2 - 250] +TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSILIPPSELETNPALWLLAVSQYKV +>STRG.8.1(-)_4 [2 - 205] +VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.tab Mon Aug 07 13:52:15 2023 +0000 @@ -0,0 +1,46 @@ +seqID transcript orf_start orf_end length strand sense longest +STRG.4.1(-)_1 STRG.4.1 3 77 74 - normal n +STRG.4.1(-)_2 STRG.4.1 59 88 29 - normal n +STRG.4.1(-)_3 STRG.4.1 92 127 35 - normal n +STRG.4.1(-)_4 STRG.4.1 131 268 137 - normal n +STRG.4.1(-)_5 STRG.4.1 81 332 251 - normal n +STRG.4.1(-)_6 STRG.4.1 272 379 107 - normal n +STRG.4.1(-)_7 STRG.4.1 366 437 71 - normal n +STRG.4.1(-)_8 STRG.4.1 465 518 53 - normal n +STRG.4.1(-)_9 STRG.4.1 383 565 182 - normal n +STRG.4.1(-)_10 STRG.4.1 599 664 65 - normal n +STRG.4.1(-)_11 STRG.4.1 522 665 143 - normal n +STRG.4.1(-)_12 STRG.4.1 1 666 665 - normal y +STRG.4.1(-)_13 STRG.4.1 665 594 71 - reverse_sense n +STRG.4.1(-)_14 STRG.4.1 631 491 140 - reverse_sense n +STRG.4.1(-)_15 STRG.4.1 590 375 215 - reverse_sense n +STRG.4.1(-)_16 STRG.4.1 448 341 107 - reverse_sense n +STRG.4.1(-)_17 STRG.4.1 371 312 59 - reverse_sense n +STRG.4.1(-)_18 STRG.4.1 666 271 395 - reverse_sense n +STRG.4.1(-)_19 STRG.4.1 337 260 77 - reverse_sense n +STRG.4.1(-)_20 STRG.4.1 308 255 53 - reverse_sense n +STRG.4.1(-)_21 STRG.4.1 256 224 32 - reverse_sense n +STRG.4.1(-)_22 STRG.4.1 246 169 77 - reverse_sense n +STRG.4.1(-)_23 STRG.4.1 217 128 89 - reverse_sense n +STRG.4.1(-)_24 STRG.4.1 188 78 110 - reverse_sense n +STRG.4.1(-)_25 STRG.4.1 165 58 107 - reverse_sense n +STRG.4.1(-)_26 STRG.4.1 74 18 56 - reverse_sense n +STRG.4.1(-)_27 STRG.4.1 124 2 122 - reverse_sense n +STRG.4.1(-)_28 STRG.4.1 54 1 53 - reverse_sense n +STRG.6.1(-)_1 STRG.6.1 1 63 62 - normal n +STRG.6.1(-)_2 STRG.6.1 24 182 158 - normal n +STRG.6.1(-)_3 STRG.6.1 94 195 101 - normal n +STRG.6.1(-)_4 STRG.6.1 186 233 47 - normal n +STRG.6.1(-)_5 STRG.6.1 2 250 248 - normal y +STRG.6.1(-)_6 STRG.6.1 199 252 53 - normal n +STRG.6.1(-)_7 STRG.6.1 237 121 116 - reverse_sense n +STRG.6.1(-)_8 STRG.6.1 251 93 158 - reverse_sense n +STRG.6.1(-)_9 
STRG.6.1 117 85 32 - reverse_sense n +STRG.6.1(-)_10 STRG.6.1 81 34 47 - reverse_sense n +STRG.6.1(-)_11 STRG.6.1 250 14 236 - reverse_sense n +STRG.6.1(-)_12 STRG.6.1 62 3 59 - reverse_sense n +STRG.6.1(-)_13 STRG.6.1 30 1 29 - reverse_sense n +STRG.8.1(-)_1 STRG.8.1 18 56 38 - normal n +STRG.8.1(-)_2 STRG.8.1 60 95 35 - normal n +STRG.8.1(-)_3 STRG.8.1 99 137 38 - normal n +STRG.8.1(-)_4 STRG.8.1 2 205 203 - normal y
