changeset 0:a389db9676d2

Uploaded
author radisson
date Tue, 10 Apr 2012 05:20:55 -0400
parents
children 0413c0088cc6
files PRIAM_search.xml
diffstat 1 files changed, 128 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PRIAM_search.xml	Tue Apr 10 05:20:55 2012 -0400
@@ -0,0 +1,128 @@
+<tool id="PRIAM_search" name="Priam" force_history_refresh="True">
+<description></description>
+<command>
+## Run the PRIAM search jar; the jar and profile-release paths below are specific to this installation.
+java -jar /home/radisson/galaxy-dist/tools/prabi/priam/PRIAM_search.jar
+
+## -np: the user-supplied processor count when "define" is selected, otherwise 1.
+#if $np.number_processor == "define" #-np $np.how_process
+#else   #-np 1
+#end if
+
+## -od is given the parent directory of output1's files_path. The parent is taken from the end
+## of the path instead of keeping a fixed number of leading components, so the result no longer
+## depends on how deep Galaxy is installed. NOTE(review): presumably the job working directory - confirm.
+#set $work_dir = $output1.files_path.rsplit("/", 1)[0]
+
+-pt $threshold_of_proba -mo $mo.max_overlap -mp $min_proportion -cc $check_cata -i $data_file -cg $mo.complet_genom -n PRIAMoutput -od $work_dir -p ~/galaxy-dist/tools/prabi/data/May11/PRIAM_MAY11/
+</command>
+<inputs>
+    <conditional name="np"> <!-- processor count: a fixed default or a user-defined number -->
+        <param name="number_processor" type="select" label="Number of processor you want to use for this work ">
+            <option value="default">Use default</option>
+            <option value="define">Define numbers</option>
+        </param>
+        <when value="default"/>
+        <when value="define">
+            <param name="how_process" type="integer" value="4" label="Number of processor you want to use "/>
+        </when>
+    </conditional>
+    <param name="data_file" type="data" format="fasta" label="File containing the protein (or nucleic) sequences you want to analyse (Fasta format) "/>
+    <param name="check_cata" type="select" label="Check for catalytic residues patterns ? ">
+        <option value="T">Yes</option>
+        <option value="F">No</option>
+    </param>
+    <conditional name="mo"> <!-- complete-genome switch; it also selects the default maximum overlap (20 or -1) -->
+        <param name="complet_genom" type="select" label="Analyse dataset as a complete genome ? ">
+            <option value="F">No</option>
+            <option value="T">Yes</option>
+        </param>
+        <when value="F">
+            <param name="max_overlap" type="integer" value="20" label="Maximum overlap length between the matches of two profiles "/>
+        </when>
+        <when value="T">
+            <param name="max_overlap" type="integer" value="-1" label="Maximum overlap length between the matches of two profiles "/>
+        </when>
+    </conditional>
+    <param name="threshold_of_proba" type="float" value="0.5" label="Threshold of probability "/>
+    <param name="min_proportion" type="integer" value="70" label="Minimal length proportion of a profile that must be matched to consider it "/>
+</inputs>
+<outputs> <!-- each dataset is collected from the RESULTS directory named in its from_work_dir attribute -->
+    <data name="output1" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_seqsHits.tab" label="PRIAMoutput_seqsHits.tab"/>
+    <data name="output2" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_seqsECs.txt" label="PRIAMoutput_seqsECs.txt"/>
+    <data name="output3" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_seqsECs.tab" label="PRIAMoutput_seqsECs.tab"/>
+    <data name="output4" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_predictableECs.txt" label="PRIAMoutput_predictableECs.txt"/>
+    <data name="output5" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_genomeEnzymes.txt" label="PRIAMoutput_genomeEnzymes.txt">
+        <filter>(mo['complet_genom'] == 'T')</filter> <!-- only when analysed as a complete genome -->
+    </data>
+    <data name="output6" format="tabular" from_work_dir="./RESULTS/paj_PRIAMoutput_genomeECs.txt" label="PRIAMoutput_genomeECs.txt">
+        <filter>(mo['complet_genom'] == 'T')</filter> <!-- only when analysed as a complete genome -->
+    </data>
+</outputs>
+<help>
+
+**PRIAM_search**
+
+-----
+
+Number of processor you want to use for this work : 
+
+The default is 1.
+
+-----
+
+File containing the protein (or nucleic) sequences you want to analyse (Fasta format) : 
+
+You need to upload the file before using it with PRIAM_search. This file must contain protein or nucleic sequences and must be in Fasta format. Each sequence name needs to be unique, because in the result files sequences are identified only by their names.
+
+-----
+
+Check for catalytic residues patterns ? :
+
+Each profile of PRIAM may be associated with a pattern of catalytic residues which is automatically designed using Swiss-Prot annotations (when available). So, some positions of a pattern can be tagged as corresponding to known catalytic residues. If you choose to activate this catalytic pattern check, PRIAM will verify, for each match, that the known catalytic residues are found. If this is not the case, the match is considered a false positive. Using this option thus increases specificity, as it allows inactive enzymes to be predicted as false positives, but it can also significantly impact sensitivity for some enzymes (enzymes whose biological diversity was poorly represented in the training dataset of PRIAM). So use it with caution.
+
+-----
+
+Analyse dataset as a complete genome ? :
+
+If you want to analyse a complete genome, this option must be set to "Yes". This results in the use of the PRIAM genome annotation rules, which define the minimal set of modules that must be found in a genome to ensure a given enzymatic activity (see the PRIAM publication of 2003 for more details).
+
+-----
+
+Maximum overlap length between the matches of two profiles :
+
+By default in PRIAM, all the matches passing the probability threshold are kept to annotate a sequence. However, to increase the specificity, it is possible to consider only the best non-overlapping profiles (in that case, only the main activity might be predicted). Thus, the "maximum overlap" defines the length from which two matches are considered as overlapping (setting this parameter to "-1" means that this filter is inactivated). We recommend using this filter in case of a low probability threshold.
+
+-----
+
+Threshold of probability :
+
+The threshold of probability above which an activity, represented by a set of matching profiles, is considered as present. PRIAM is now able to associate each hit of a profile with a Bayesian probability of this hit being a true positive. It is the joint probability of the set of profiles characterizing an activity that is used as the criterion to decide whether this activity must be kept in the final predictions or not.
+
+-----
+
+Minimal length proportion of a profile that must be matched to consider it : 
+
+Usually, enzyme catalytic domains have a constrained structure with a quite conserved genomic length. Thus, truncated domains have a really low probability of being functional (or at least of having kept their enzymatic specificity). So, in the case of a functional annotation, incomplete profile matches can be removed by setting this parameter to a convenient value (typically 60-80). However, if you are interested in complete genome annotation and if you assume that many of your genes are incomplete (for example in the case of small contigs), it may be necessary to lower this value to obtain interpretable results.
+
+-----
+
+**Output files :**
+
+-----
+
+There are several output files for one PRIAM run.
+If you have set "Analyse dataset as a complete genome" to "No", 4 files are generated in your history :
+- a file ending with the "_predictableECs.txt" suffix. This file lists all the EC numbers that the PRIAM release used for this job is able to predict. So, if you do not find in the results an EC you were expecting to be present, please ensure this EC is in that list.
+- a file ending with the "_seqsHits.tab" suffix. This file corresponds to the list of all profile hits for each sequence in the query file.
+- a file ending with the "_seqsECs.txt" suffix. This file corresponds to the list of ECs predicted for each sequence in the query file. This is the file to look at if you use PRIAM for sequence annotation purposes.
+In that file, for each sequence, all activities with at least one matching representative profile are reported. After the activity come the joint e-value of the representative profiles corresponding to this activity, then the probability of this prediction, and then whether this prediction should be kept or not (T=kept; F=not kept). Finally, an optional "(fragment)" tag can be found, which warns that a truncated representative domain has matched for the considered activity. So, if the probability for this activity is low, it may simply be because of an incomplete protein sequence. Be aware, however, that this information is based only on representative modules. Thus, not all incomplete polypeptides can be identified that way.
+
+If you have set "Analyse dataset as a complete genome" to "Yes", you get two more files.
+- a file ending with the "_genomeECs.txt" suffix that gives the list of all ECs predicted as being present in your genome (according to the PRIAM genome annotation rules) with their associated probability.
+- a file ending with the "_genomeEnzymes.txt" suffix that gives the list of all enzymes predicted as being present in your genome.
+
+
+
+</help>
+</tool>