Mercurial > repos > matthias > longorf
changeset 0:e09750baa9ac draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559-dirty
author | matthias |
---|---|
date | Wed, 20 Jun 2018 10:55:21 -0400 |
parents | |
children | |
files | getLongestORF.py longORF.xml test-data/test_input.fasta test-data/test_output.fasta test-data/test_output.tab |
diffstat | 5 files changed, 340 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python

"""
usage: getLongestORF.py input.fas output.fas output.tab


input.fas: an amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
output.fas: fasta file with the longest ORF per transcript
output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs

example:

>253936-254394(+)_1 [28 - 63]
LTNYCQMVHNIL
>253936-254394(+)_2 [18 - 77]
HKLIDKLLPNGAQYFVKSTQ
>253936-254394(+)_3 [32 - 148]
QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
>253936-254394(+)_4 [117 - 152]
LGIIVPSLLLCN
>248351-252461(+)_1 [14 - 85]
VLARKYPRCLSPSKKSPCQLRQRS
>248351-252461(+)_2 [21 - 161]
PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
>248351-252461(+)_3 [89 - 202]
VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
>248351-252461(+)_4 [206 - 259]
SRTTSGRQSVLSEKLWRR
>248351-252461(+)_5 [263 - 313]
CLSPLWVPCCSRHSCHG
"""

import re
import sys

# ORF coordinates appear in the header as " [start - end]"; for reverse
# strand ORFs start is larger than end.
_START_RE = re.compile(r" \[(\d+) -")
_END_RE = re.compile(r"- (\d+)\]")

_SUMMARY_HEADER = "seqID\tstart\tend\tlength\torientation\tlongest\n"


def findlongestOrf(transcriptDict, old_seqID, out=None):
    """Write one summary row per ORF of a transcript, then forget the transcript.

    Args:
        transcriptDict: dict mapping a transcript ID to a list of
            ``[start, end, length]`` entries, one per ORF, in input order.
        old_seqID: ID of the transcript to report; its entry is removed from
            ``transcriptDict`` afterwards.
        out: writable text stream that receives the tab-separated rows
            (seqID, start, end, length, orientation, longest).  Defaults to
            ``sys.stdout``.

    Returns:
        None

    Raises:
        KeyError: if ``old_seqID`` is not a key of ``transcriptDict``.
    """
    if out is None:
        out = sys.stdout
    orfs = transcriptDict[old_seqID]

    # ">=" makes the LAST ORF win when several share the maximal length,
    # which is the same tie-break used when choosing the FASTA record.
    i_max = 0
    for i, orf in enumerate(orfs):
        if orf[2] >= orfs[i_max][2]:
            i_max = i

    rows = []
    for i, orf in enumerate(orfs):
        orf_start, orf_end, orf_length = orf[0], orf[1], orf[2]
        # Orientation must come from this ORF's own coordinates, not from
        # whichever header happened to be parsed last.
        orientation = "Forward" if orf_end - orf_start > 0 else "Reverse"
        longest = "y" if i == i_max else "n"
        rows.append("\t".join([str(old_seqID), str(orf_start), str(orf_end),
                               str(orf_length), orientation, longest]) + "\n")
    out.writelines(rows)

    transcriptDict.pop(old_seqID, None)
    return None


def _parse_header(line):
    """Return (seqID, start, end) parsed from a stripped '>' header line."""
    # The transcript ID is everything before the last "_<n>" ORF counter.
    seqID = "_".join(line.split(">")[1].split("_")[:-1])
    start_match = _START_RE.search(line)
    end_match = _END_RE.search(line)
    if start_match is None or end_match is None:
        raise ValueError("cannot find '[start - end]' in header: %r" % line)
    return seqID, int(start_match.group(1)), int(end_match.group(1))


def _process(fasta_in, fasta_out, summary_out):
    """Stream ORFs from fasta_in; write longest-ORF FASTA and the ORF table."""
    summary_out.write(_SUMMARY_HEADER)

    transcriptDict = {}
    current_id = None   # transcript currently being collected
    best_header = None  # header line of the longest ORF seen so far
    best_length = None
    best_seq = []       # sequence lines belonging to best_header
    skip = False        # True while reading the sequence of a shorter ORF

    for line in fasta_in:
        line = line.strip()
        if line.startswith(">"):
            seqID, start, end = _parse_header(line)
            length = abs(end - start)
            if seqID != current_id:
                if current_id is not None:
                    # A new transcript starts: flush the previous one.
                    findlongestOrf(transcriptDict, current_id, summary_out)
                    fasta_out.write(best_header + "\n" + "".join(best_seq) + "\n")
                current_id = seqID
                transcriptDict[seqID] = []
                best_length = None
            transcriptDict[seqID].append([start, end, length])
            # If several longest ORFs share the same length, the last one
            # occurring in the input is kept.
            if best_length is None or length >= best_length:
                best_header = line
                best_length = length
                best_seq = []
                skip = False
            else:
                skip = True
        elif not skip and current_id is not None:
            # Sequence line of the current best ORF (lines that precede the
            # first header are ignored).
            best_seq.append(line)

    if current_id is not None:
        # The final record is written without a trailing newline, as before.
        fasta_out.write(best_header + "\n" + "".join(best_seq))
        findlongestOrf(transcriptDict, current_id, summary_out)


def main(argv=None):
    """Run the tool; argv is [prog, input, output.fas, output.tab]."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.stderr.write("usage: getLongestORF.py input.fas output.fas output.tab\n")
        return 2
    with open(argv[1], "r") as fasta_in, \
            open(argv[2], "w") as fasta_out, \
            open(argv[3], "w") as summary_out:
        _process(fasta_in, fasta_out, summary_out)
    return 0


if __name__ == "__main__":
    sys.exit(main())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/longORF.xml Wed Jun 20 10:55:21 2018 -0400 @@ -0,0 +1,35 @@ +<tool id="longORF" name="Obtain longest ORFs" version="0.1.0"> + <description> in six-frame translations</description> + <command><![CDATA[ + python $__tool_directory__/getLongestORF.py $input $output_longestORF $output_ORFs + ]]> + </command> + <inputs> + <param name="input" format="fasta" type="data" label="sequences"/> + </inputs> + <outputs> + <data name="output_longestORF" format="fasta"/> + <data name="output_ORFs" format="tabular"/> + </outputs> + + <tests> + <test> + <param name="input" value="test_input.fasta"/> + <output name="output_longestORF" file="test_output.fasta"/> + <output name="output_ORFs" file="test_output.tab"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +This tool identifies the longest Open Reading Frames within the six-frame translations of a set of sequences. + +**Input** + +It takes an amino acid fasta file with all open reading frames (+ and - strand) listed by the corresponding transcript. The tool is designed to process the output of the Galaxy tool "getorf" from the EMBOSS package. + +**Output** + +For each transcript, the respective longest ORF is identified and listed in fasta format. Furthermore, a table with information about seqID, start, end, length, orientation, longest for all ORFs is given.]]> + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input.fasta Wed Jun 20 10:55:21 2018 -0400 @@ -0,0 +1,127 @@ +>14520830-14521117(-)_1 [2 - 37] +KPLENISASREF +>14520830-14521117(-)_2 [3 - 47] +SPWRIFQPAENFDLQ +>14520830-14521117(-)_3 [41 - 94] +LAVGFGLIFLRSGWMPCL +>14520830-14521117(-)_4 [63 - 152] +FSYDLGGCLACDSCSSYSPNEGQCPARKLE +>14520830-14521117(-)_5 [146 - 175] +VGMMDLCSET +>14520830-14521117(-)_6 [156 - 200] +WTCVQRLNRTNKQNK +>14520830-14521117(-)_7 [1 - 240] +KAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSAPHVSWNDGPVFRDLT +EPTSKTSENRKKEEDTGINS +>14520830-14521117(-)_8 [179 - 325] +QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRGTKPLENISASREF +>14520830-14521117(-)_9 [204 - 335] +EQEKGRGHWNQFLRVALLGFSSIPSFVGQSPWRIFQPAENFDLQ +>14520830-14521117(-)_10 [329 - 382] +LAVGFGLIFLRSGWMPCL +>14520830-14521117(-)_11 [351 - 440] +FSYDLGGCLACDSCSSYSPNEGQCPARKLE +>14520830-14521117(-)_12 [434 - 463] +VGMMDLCSET +>14520830-14521117(-)_13 [444 - 488] +WTCVQRLNRTNKQNK +>14520830-14521117(-)_14 [244 - 528] +ELHYLVFLPFQVSWDKAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSA +PHVSWNDGPVFRDLTEPTSKTSENRKKEEDTGINS +>14520830-14521117(-)_15 [467 - 574] +QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRG +>14520830-14521117(-)_16 [492 - 575] +EQEKGRGHWNQFLRVALLGFSSIPSFVG +>14520830-14521117(-)_17 [532 - 576] +ELHYLVFLPFQVSWD +>14520830-14521117(-)_18 [575 - 543] (REVERSE SENSE) +SHETWNGRKTK +>14520830-14521117(-)_19 [574 - 524] (REVERSE SENSE) +PTKLGMEEKPSSATLKN +>14520830-14521117(-)_20 [576 - 466] (REVERSE SENSE) +VPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC +>14520830-14521117(-)_21 [520 - 458] (REVERSE SENSE) +FQCPLPFSCSHLFCLLVLLSL +>14520830-14521117(-)_22 [454 - 401] (REVERSE SENSE) +TQVHHSNLRAGHCPSFGE +>14520830-14521117(-)_23 [397 - 359] (REVERSE SENSE) +ELQLSQARHPPRS +>14520830-14521117(-)_24 [355 - 311] (REVERSE SENSE) +ENQTKSYCKSKFSAG +>14520830-14521117(-)_25 [539 - 255] (REVERSE SENSE) +CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI +VGKSNQILLQVKILCWLKYSPGALSHETWNGRKTK 
+>14520830-14521117(-)_26 [307 - 236] (REVERSE SENSE) +NILQGLCPTKLGMEEKPSSATLKN +>14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) +VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVP +RNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC +>14520830-14521117(-)_28 [232 - 170] (REVERSE SENSE) +FQCPLPFSCSHLFCLLVLLSL +>14520830-14521117(-)_29 [166 - 113] (REVERSE SENSE) +TQVHHSNLRAGHCPSFGE +>14520830-14521117(-)_30 [109 - 71] (REVERSE SENSE) +ELQLSQARHPPRS +>14520830-14521117(-)_31 [67 - 23] (REVERSE SENSE) +ENQTKSYCKSKFSAG +>14520830-14521117(-)_32 [251 - 3] (REVERSE SENSE) +CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI +VGKSNQILLQVKILCWLKYSPGA +>14520830-14521117(-)_33 [174 - 1] (REVERSE SENSE) +VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGF +>103089310-103089560(-)_1 [2 - 37] +GTSEKFLKILLS +>103089310-103089560(-)_2 [24 - 92] +RFYYHRYLFWFCVSVLSADGPKL +>103089310-103089560(-)_3 [13 - 117] +KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK +>103089310-103089560(-)_4 [138 - 167] +TACIWLHCGL +>103089310-103089560(-)_5 [180 - 260] +NHPYVSVSGYTRKRKESQSGTKSGRYV +>103089310-103089560(-)_6 [41 - 271] +ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK +ERVSEWNKEWEVRLKSF +>103089310-103089560(-)_7 [127 - 288] +NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRVGGTSEKFLKILLS +>103089310-103089560(-)_8 [275 - 343] +RFYYHRYLFWFCVSVLSADGPKL +>103089310-103089560(-)_9 [264 - 368] +KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK +>103089310-103089560(-)_10 [389 - 418] +TACIWLHCGL +>103089310-103089560(-)_11 [378 - 500] +NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRV +>103089310-103089560(-)_12 [292 - 501] +ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK +ERVSEWNKEW +>103089310-103089560(-)_13 [431 - 502] +NHPYVSVSGYTRKRKESQSGTKSG +>103089310-103089560(-)_14 [500 - 447] (REVERSE SENSE) +HSLFHSETLSFSSCIQIH +>103089310-103089560(-)_15 [480 - 436] (REVERSE SENSE) +DSFLFLVYPDTLTYG +>103089310-103089560(-)_16 [502 - 383] (REVERSE SENSE) +PTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA 
+>103089310-103089560(-)_17 [426 - 361] (REVERSE SENSE) +KCHKPQWSHIHAVQPDFTLIFF +>103089310-103089560(-)_18 [357 - 289] (REVERSE SENSE) +YRLHYNFGPSAESTDTQNQNKYL +>103089310-103089560(-)_19 [379 - 233] (REVERSE SENSE) +FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTYLPLFVPL +>103089310-103089560(-)_20 [279 - 196] (REVERSE SENSE) +NLQKLFRRTSHSLFHSETLSFSSCIQIH +>103089310-103089560(-)_21 [229 - 185] (REVERSE SENSE) +DSFLFLVYPDTLTYG +>103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) +HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF +SDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA +>103089310-103089560(-)_23 [175 - 110] (REVERSE SENSE) +KCHKPQWSHIHAVQPDFTLIFF +>103089310-103089560(-)_24 [106 - 38] (REVERSE SENSE) +YRLHYNFGPSAESTDTQNQNKYL +>103089310-103089560(-)_25 [128 - 3] (REVERSE SENSE) +FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTY +>103089310-103089560(-)_26 [192 - 1] (REVERSE SENSE) +HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF +SDVP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.fasta Wed Jun 20 10:55:21 2018 -0400 @@ -0,0 +1,4 @@ +>14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) +VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC +>103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) +HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNFSDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_output.tab Wed Jun 20 10:55:21 2018 -0400 @@ -0,0 +1,60 @@ +seqID start end length orientation longest +14520830-14521117(-) 2 37 35 Forward n +14520830-14521117(-) 3 47 44 Forward n +14520830-14521117(-) 41 94 53 Forward n +14520830-14521117(-) 63 152 89 Forward n +14520830-14521117(-) 146 175 29 Forward n +14520830-14521117(-) 156 200 44 Forward n +14520830-14521117(-) 1 240 239 Forward n +14520830-14521117(-) 179 325 146 Forward n +14520830-14521117(-) 204 335 131 Forward n +14520830-14521117(-) 329 382 53 Forward n +14520830-14521117(-) 351 440 89 Forward n +14520830-14521117(-) 434 463 29 Forward n +14520830-14521117(-) 444 488 44 Forward n +14520830-14521117(-) 244 528 284 Forward n +14520830-14521117(-) 467 574 107 Forward n +14520830-14521117(-) 492 575 83 Forward n +14520830-14521117(-) 532 576 44 Forward n +14520830-14521117(-) 575 543 32 Forward n +14520830-14521117(-) 574 524 50 Forward n +14520830-14521117(-) 576 466 110 Forward n +14520830-14521117(-) 520 458 62 Forward n +14520830-14521117(-) 454 401 53 Forward n +14520830-14521117(-) 397 359 38 Forward n +14520830-14521117(-) 355 311 44 Forward n +14520830-14521117(-) 539 255 284 Forward n +14520830-14521117(-) 307 236 71 Forward n +14520830-14521117(-) 462 178 284 Forward y +14520830-14521117(-) 232 170 62 Forward n +14520830-14521117(-) 166 113 53 Forward n +14520830-14521117(-) 109 71 38 Forward n +14520830-14521117(-) 67 23 44 Forward n +14520830-14521117(-) 251 3 248 Forward n +14520830-14521117(-) 174 1 173 Forward n +103089310-103089560(-) 2 37 35 Reverse n +103089310-103089560(-) 24 92 68 Reverse n +103089310-103089560(-) 13 117 104 Reverse n +103089310-103089560(-) 138 167 29 Reverse n +103089310-103089560(-) 180 260 80 Reverse n +103089310-103089560(-) 41 271 230 Reverse n +103089310-103089560(-) 127 288 161 Reverse n +103089310-103089560(-) 275 343 68 Reverse n +103089310-103089560(-) 264 368 104 Reverse n 
+103089310-103089560(-) 389 418 29 Reverse n +103089310-103089560(-) 378 500 122 Reverse n +103089310-103089560(-) 292 501 209 Reverse n +103089310-103089560(-) 431 502 71 Reverse n +103089310-103089560(-) 500 447 53 Reverse n +103089310-103089560(-) 480 436 44 Reverse n +103089310-103089560(-) 502 383 119 Reverse n +103089310-103089560(-) 426 361 65 Reverse n +103089310-103089560(-) 357 289 68 Reverse n +103089310-103089560(-) 379 233 146 Reverse n +103089310-103089560(-) 279 196 83 Reverse n +103089310-103089560(-) 229 185 44 Reverse n +103089310-103089560(-) 443 132 311 Reverse y +103089310-103089560(-) 175 110 65 Reverse n +103089310-103089560(-) 106 38 68 Reverse n +103089310-103089560(-) 128 3 125 Reverse n +103089310-103089560(-) 192 1 191 Reverse n