# HG changeset patch # User peterjc # Date 1370453818 14400 # Node ID dfd7c3ff3447b11b78ae5b04009db30dd2507067 # Parent 83ad539ccb67a95f01c10f810922a02fa317720e Restore original folder structure diff -r 83ad539ccb67 -r dfd7c3ff3447 README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Wed Jun 05 13:36:58 2013 -0400 @@ -0,0 +1,22 @@ +#Created 07/01/2011 - Konrad Paszkiewicz, Exeter Sequencing Service, University of Exeter + +The attached is a crude wrapper script for Interproscan. Typically this is useful when one wants to produce an annotation which is not based on sequence +similarity. E.g. after a de novo transcriptome assembly, each transcript could be translated and run through this tool. + +Prerequisites: + +1. A working installation of Interproscan on your Galaxy server/cluster. + +Limitations: + +Currently it is set up to work with PFAM only due to the heavy computational demands Interproscan makes. + +Input formats: + +The standard interproscan input is either genomic or protein sequences. In the case of genomic sequences Interproscan will run an ORF +prediction tool. However this tends to lose the ORF information (e.g. start/end co-ordinates) from the header. As such the requirement here is to input ORF +sequences (e.g. from EMBOSS getorf) and to then replace any spaces in the FASTA header with underscores. This workaround generally preserves the relevant +positional information. + + + diff -r 83ad539ccb67 -r dfd7c3ff3447 interproscan.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan.py Wed Jun 05 13:36:58 2013 -0400 @@ -0,0 +1,78 @@ +#!/usr/bin/env python + +""" +Classes encapsulating decypher tool. 
+James E Johnson - University of Minnesota +""" +import pkg_resources; +import logging, os, string, sys, tempfile, glob, shutil, types, urllib +import shlex, subprocess +from optparse import OptionParser, OptionGroup +from stat import * + + +log = logging.getLogger( __name__ ) + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + s = 'interproscan.py: argv = %s\n' % (sys.argv) + # print >> sys.stderr, s # so will appear as blurb for file + argcnt = len(sys.argv) + working_dir = sys.argv[1] + input = sys.argv[2] + format = sys.argv[3] + output = sys.argv[4] + #Convert all spaces in ORF header to underscores + cmdline = 'sed \'s/ /_/\' %s > temp.fa' % (input) + #print >> sys.stderr, cmdline + try: + proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) + returncode = proc.wait() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + if returncode != 0: + raise Exception, stderr + except Exception, e: + stop_err( 'Error running sed ' + str( e ) ) + + cmdline = 'iprscan -cli -nocrc -i temp.fa -o temp.iprscan -goterms -seqtype p -altjobs -format %s -appl hmmpfam > /dev/null' % (format) + #print >> sys.stderr, cmdline # so will appear as blurb for file + try: + proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) + returncode = proc.wait() + # get stderr, allowing for case where it's very large + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += proc.stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + if returncode != 0: + raise Exception, stderr + except Exception, e: + stop_err( 'Error running iprscan ' + str( e ) ) + + out = open(output,'w') + #outpe_path = 
os.path.join(working_dir,'') + for line in open('temp.iprscan'): + out.write( "%s" % (line) ) + out.close() + +if __name__ == "__main__": __main__() diff -r 83ad539ccb67 -r dfd7c3ff3447 interproscan.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/interproscan.xml Wed Jun 05 13:36:58 2013 -0400 @@ -0,0 +1,78 @@ + + Interproscan functional predictions of ORFs + + interproscan.py + '$__app__.config.new_file_path' + '$input' + '$format' + '$output' + + + + + + + + + + + + + +**Interproscan** + +Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches of profile and other functional databases. These include SCOP, CATH, PFAM and SUPERFAMILY. Currently due to resource limitations, only the PFAM database is searched however. + +**Input** +A FASTA file containing ORF predictions is required. This file must NOT contain any spaces in the FASTA headers - any spaces will be converted to underscores by this tool before submission to Interproscan. + +**Output** + +The output will consist of a file in Interproscan raw format, a tabular file in galaxy with 14 columns. +This can be used to upload the data into a relational database or to concatenate different runs. 
+ +====== ============================================================================================================================= =========================================== +Column Example Description +------ ----------------------------------------------------------------------------------------------------------------------------- ------------------------------------------- + c1 NF00181542 Identifier of the input sequence + c2 0A5FDCE74AB7C3AD crc64 checksum of the protein sequence + c3 272 Length of sequence (in amino acids) + c4 HMMPIR Analysis method launched + c5 PIRSF001424 Database members entry for match + c6 Prephenate dehydratase Description from the database + c7 1 Start of the domain match + c8 270 End of the domain match + c9 6.5e-141 e-value (reported by the database method) + c10 T Status of match (T for true, ? for unknown) + c11 06-Aug-2005 Date of the run + c12 IPR008237 InterPro entry (if iprlookup requested) + c13 Prephenate dehydratase with ACT region Description of the InterPro entry + c14 Molecular Function:prephenate dehydratase activity (GO:0004664), Biological Process:L-phenylalanine biosynthesis (GO:0009094) GO (gene ontology) description +====== ============================================================================================================================= =========================================== + +**Database updates** + +Typically these take place 2-3 times a year. + +**References** + +Zdobnov EM, Apweiler R (2001) +InterProScan an integration platform for the signature-recognition methods in InterPro. +Bioinformatics 17, 847-848. +http://dx.doi.org/10.1093/bioinformatics/17.9.847 + +Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005) +InterProScan: protein domains identifier. +Nucleic Acids Research 33 (Web Server issue), W116-W120. 
+http://dx.doi.org/10.1093/nar/gki442 + +Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009) +InterPro: the integrative protein signature database. +Nucleic Acids Research 37 (Database Issue), D224-228. +http://dx.doi.org/10.1093/nar/gkn785 + +This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at +http://toolshed.g2.bx.psu.edu/view/konradpaszkiewicz/interproscan + + + diff -r 83ad539ccb67 -r dfd7c3ff3447 interproscan/README.txt --- a/interproscan/README.txt Wed Jun 05 13:35:22 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -#Created 07/01/2011 - Konrad Paszkiewicz, Exeter Sequencing Service, University of Exeter - -The attached is a crude wrapper script for Interproscan. Typically this is useful when one wants to produce an annotation which is not based on sequence -similarity. E.g after a denovo transcriptome assembly, each transcript could be translated and run through this tool. - -Prerequisites: - -1. A working installation of Interproscan on your Galaxy server/cluster. - -Limitations: - -Currently it is setup to work with PFAM only due to the heavy computational demands Interproscan makes. - -Input formats: - -The standard interproscan input is either genomic or protein sequences. In the case of genomic sequences Interproscan will of run an ORF -prediction tool. However this tends to lose the ORF information (e.g. start/end co-ordinates) from the header. As such the requirement here is to input ORF -sequences (e.g. from EMBOSS getorf) and to then replace any spaces in the FASTA header with underscores. 
This workaround generally preserves the relevant -positional information. - - - diff -r 83ad539ccb67 -r dfd7c3ff3447 interproscan/interproscan.py --- a/interproscan/interproscan.py Wed Jun 05 13:35:22 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -""" -Classes encapsulating decypher tool. -James E Johnson - University of Minnesota -""" -import pkg_resources; -import logging, os, string, sys, tempfile, glob, shutil, types, urllib -import shlex, subprocess -from optparse import OptionParser, OptionGroup -from stat import * - - -log = logging.getLogger( __name__ ) - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - s = 'interproscan.py: argv = %s\n' % (sys.argv) - # print >> sys.stderr, s # so will appear as blurb for file - argcnt = len(sys.argv) - working_dir = sys.argv[1] - input = sys.argv[2] - format = sys.argv[3] - output = sys.argv[4] - #Convert all spaces in ORF header to underscores - cmdline = 'sed \'s/ /_/\' %s > temp.fa' % (input) - #print >> sys.stderr, cmdline - try: - proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - # get stderr, allowing for case where it's very large - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running sed ' + str( e ) ) - - cmdline = 'iprscan -cli -nocrc -i temp.fa -o temp.iprscan -goterms -seqtype p -altjobs -format %s -appl hmmpfam > /dev/null' % (format) - #print >> sys.stderr, cmdline # so will appear as blurb for file - try: - proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - # get stderr, allowing for case where it's very large - stderr 
= '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running iprscan ' + str( e ) ) - - out = open(output,'w') - #outpe_path = os.path.join(working_dir,'') - for line in open('temp.iprscan'): - out.write( "%s" % (line) ) - out.close() - -if __name__ == "__main__": __main__() diff -r 83ad539ccb67 -r dfd7c3ff3447 interproscan/interproscan.xml --- a/interproscan/interproscan.xml Wed Jun 05 13:35:22 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - Interproscan functional predictions of ORFs - - interproscan.py - '$__app__.config.new_file_path' - '$input' - '$format' - '$output' - - - - - - - - - - - - - -**Interproscan** - -Interproscan is a batch tool to query the Interpro database. It provides annotations based on multiple searches of profile and other functional databases. These include SCOP, CATH, PFAM and SUPERFAMILY. Currently due to resource limitations, only the PFAM database is searched however. - -**Input** -A FASTA file containing ORF predictions is required. This file must NOT contain any spaces in the FASTA headers - any spaces will be convereted to underscores by this tool before submission to Interproscan. - -**Output** - -The output will consist of a file in Interproscan raw format, a tabular file in galaxy with 14 columns. -This can be use to upload the data into a relational database or concatenation of different runs. 
- -====== ============================================================================================================================= =========================================== -Column Example Description ------- ----------------------------------------------------------------------------------------------------------------------------- ------------------------------------------- - c1 NF00181542 Identifier of the input sequence - c2 0A5FDCE74AB7C3AD crc64 checksum of the protein sequence - c3 272 Length of sequence (in amino acids) - c4 HMMPIR Analysis metho launched - c5 PIRSF001424 Database members entry for match - c6 Prephenate dehydratase Description from the database - c7 1 Start of the domain match - c8 270 End of the domain match - c9 6.5e-141 e-value (reported by the database method) - c10 T Status of match (Tfor true, ? forunknown) - c11 06-Aug-2005 Date of the run - c12 IPR008237 InterPro entry (if iprlookup requested) - c13 Prephenate dehydratase with ACT region Description of the InterPro entry - c14 Molecular Function:prephenate dehydratase activity (GO:0004664), Biological Process:L-phenylalanine biosynthesis (GO:0009094) GO (gene ontology) description -====== ============================================================================================================================= =========================================== - -**Database updates** - -Typically these take place 2-3 times a year. - -**References** - -Zdobnov EM, Apweiler R (2001) -InterProScan an integration platform for the signature-recognition methods in InterPro. -Bioinformatics 17, 847-848. -http://dx.doi.org/10.1093/bioinformatics/17.9.847 - -Quevillon E, Silventoinen V, Pillai S, Harte N, Mulder N, Apweiler R, Lopez R (2005) -InterProScan: protein domains identifier. -Nucleic Acids Research 33 (Web Server issue), W116-W120. 
-http://dx.doi.org/10.1093/nar/gki442 - -Hunter S, Apweiler R, Attwood TK, Bairoch A, Bateman A, Binns D, Bork P, Das U, Daugherty L, Duquenne L, Finn RD, Gough J, Haft D, Hulo N, Kahn D, Kelly E, Laugraud A, Letunic I, Lonsdale D, Lopez R, Madera M, Maslen J, McAnulla C, McDowall J, Mistry J, Mitchell A, Mulder N, Natale D, Orengo C, Quinn AF, Selengut JD, Sigrist CJ, Thimma M, Thomas PD, Valentin F, Wilson D, Wu CH, Yeats C. (2009) -InterPro: the integrative protein signature database. -Nucleic Acids Research 37 (Database Issue), D224-228. -http://dx.doi.org/10.1093/nar/gkn785 - -This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at -http://toolshed.g2.bx.psu.edu/view/konradpaszkiewicz/interproscan - - -