Mercurial > repos > jjohnson > cistrome_motif
changeset 0:c392c4007d5e
Imported from capsule None
author | jjohnson |
---|---|
date | Tue, 14 Oct 2014 10:07:00 -0400 |
parents | |
children | 040fb4c886ae |
files | screen.xml seqpos.xml tool-data/cistrome_assembly.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml tool_macros.xml |
diffstat | 6 files changed, 437 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/screen.xml Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,173 @@
+<tool name="Screen Motif" id="motif_screen" version="0.1.0">
+  <description>Given a motif, find all regions that match the motif</description>
+  <macros>
+    <import>tool_macros.xml</import>
+  </macros>
+  <!-- cistrome numpy: pulled in through the "requirements" macro -->
+  <expand macro="requirements" />
+  <!--
+    Command line: the genome option block comes from the ref_genome_seq_opts
+    template, followed by the two outputs and exactly one motif-source flag
+    (-p PSSM file, -m built-in database, -n database from the history). Each
+    flag is followed by the BED file, its dbkey and a motif id; the PSSM
+    variants pass the placeholder "ignore_this" in the motif id position.
+  -->
+  <command>
+MotifScan.py
+#include source=$ref_genome_seq_opts#
+-b $output_bed -f $output_fa
+#if $motifs.motifSrc == 'raw':
+-p $user_db_file $bfile $bfile.metadata.dbkey ignore_this
+#elif $motifs.motifSrc == 'pssm':
+-p $motifs.pssm_file $bfile $bfile.metadata.dbkey ignore_this
+#elif $motifs.motifSrc == 'db':
+-m $motifs.db $bfile $bfile.metadata.dbkey $motifs.motif_id
+#elif $motifs.motifSrc == 'userdb':
+-n $motifs.usr_db $bfile $bfile.metadata.dbkey $motifs.motif_id
+#end if
+  </command>
+  <inputs>
+    <param format="bed" name="bfile" type="data" label="BED file(100,000 lines max)">
+      <validator type="unspecified_build" />
+    </param>
+    <expand macro="refGenomeSourceConditional"/>
+    <conditional name="motifs">
+      <param name="motifSrc" type="select" label="which motif database?">
+        <option value="db">builtin motif database</option>
+        <option value="userdb">motif database from your history</option>
+        <option value="pssm">pssm file from your history</option>
+        <option value="raw">pssm entered as text</option>
+      </param>
+      <when value="pssm">
+        <param format="txt" name="pssm_file" type="data" label="PSSM file"/>
+      </when>
+      <when value="userdb">
+        <param format="xml" name="usr_db" type="data" label="Motif XML file" optional="true"/>
+        <param name="motif_id" type="text" label="Motif Id" help="from Seqpos result or from our motif collection /Cistrome/Cistrome.xml"/>
+      </when>
+      <when value="db">
+        <param name="motif_id" type="text" label="Motif Id" help="from Seqpos result or from our motif collection at http://cistrome.dfci.harvard.edu/~jian/motif_collection/databases/Cistrome/Cistrome.xml"/>
+        <param name="db" type="select" label="which motif database?">
+          <option value="pbm.xml">pbm</option>
+          <option value="y1h.xml">y1h</option>
+          <option value="transfac.xml">transfac</option>
+          <option value="hpdi.xml">hpdi</option>
+          <option value="jaspar.xml">jaspar</option>
+        </param>
+      </when>
+      <when value="raw">
+        <param name="file_data" type="text" size="40,40" label="PSSM Raw Text (e.g. [[0.1, 0.5, 0.2, 0.2], ...])"
+               help="Tip: Zero is not allowed (use 0.001 instead); also, the 4 numbers in each part should add up to 1."/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bed" name="output_bed" label="Motif scan output on ${bfile.name}"/>
+    <data format="fasta" name="output_fa" label="Motif sequences for ${bfile.name}"/>
+  </outputs>
+  <!-- NOTE(review): the generated file starts with the literal word "echo"
+       followed by the pasted text; confirm MotifScan.py expects that. -->
+  <configfiles>
+    <configfile name="user_db_file">
+#if $motifs.motifSrc == 'raw':
+echo $motifs.file_data
+#end if
+    </configfile>
+  </configfiles>
+  <tests>
+    <!-- NOTE(review): "gv" is not a parameter declared in the inputs of this file;
+         confirm how the reference genome should be selected in these tests. -->
+    <test name="denovo_foo_8">
+      <param name="bfile" value="runx3_small.bed" ftype="bed" />
+      <param name="gv" value="mm8" />
+      <param name="motifSrc" value="userdb" />
+      <param name="motif_id" value="denovo2" />
+      <param name="db" value="None" />
+      <param name="usr_db" value="denovo_foo.xml" ftype="xml"/>
+      <param name="pssm_file" />
+      <param name="file_data" value="" />
+      <output name="output_bed" file="motif/motifscan/outputs/denovo_foo_8.bed"/>
+      <output name="output_fa" file="motif/motifscan/outputs/denovo_foo_8.fsa"/>
+    </test>
+    <test name="pssm_SW0003">
+      <param name="bfile" value="runx3_small.bed" ftype="bed" />
+      <param name="gv" value="mm8" />
+      <param name="motifSrc" value="pssm" />
+      <param name="motif_id" value="" />
+      <param name="db" value="None" />
+      <param name="usr_db" />
+      <param name="pssm_file" value="pssm_SW0003.txt" />
+      <param name="file_data" value="" />
+      <output name="output_bed" file="motif/motifscan/outputs/pssm_SW0003.bed"/>
+      <output name="output_fa" file="motif/motifscan/outputs/pssm_SW0003.fsa"/>
+    </test>
+    <test name="y1h_SW0056">
+      <param name="bfile" value="runx3_small.bed" ftype="bed" />
+      <param name="gv" value="mm8" />
+      <param name="motifSrc" value="db" />
+      <param name="motif_id" value="SW0056" />
+      <param name="db" value="y1h.xml" />
+      <param name="usr_db" />
+      <param name="pssm_file" />
+      <param name="file_data" value="" />
+      <output name="output_bed" file="motif/motifscan/outputs/y1h_SW0056.bed"/>
+      <output name="output_fa" file="motif/motifscan/outputs/y1h_SW0056.fsa"/>
+    </test>
+  </tests>
+  <help>
+Given a motif, this tool will find all regions that match the
+motif. This tool is made by Cliff Meyer and Len Taing.
+
+.. class:: infomark
+
+**TIP:** Please check the result from Seqpos tool to understand the
+parameters of this tool, such as Motif id, xml file, PSSM
+
+.. class:: warningmark
+
+**NEED IMPROVEMENT**
+
+-----
+
+**4 ways to specify the input**
+
+- **Method 1:** You can specify a motif in our motif database by the
+  motif id such as MM00481 for AR motif in TRANSFAC. This way, you
+  need to provide id in **Motif id**, and choose a "motif database"
+  from PBM, TRANSFAC or Y1H.
+- **Method 2:** You can specify a motif with Seqpos result by the
+  motif id such as MM00481_observed for observed AR motif. This way,
+  you need to provide id in **Motif id**, and choose the Seqpos output
+  xml file in the drop-down menu of **Motif XML file**.
+- **Method 3:** You can upload a PSSM file containing a motif matrix
+  to the history, and choose it from the drop-down menu of **PSSM file**.
+- **Method 4:** You can paste a PSSM raw text string to **PSSM Raw
+  Text** to scan. An example for this string can be seen in Seqpos
+  HTML result by selecting a motif and clicking the *show pssm in a new
+  window* button
+
+**Other parameters**
+
+- **BED file** defines the regions you want to scan the motif on.
+- **Genome Assembly version** is the UCSC database version. The tool
+  uses this information to extract the DNA sequences in the regions of
+  **BED file**.
+
+.. class:: infomark
+
+**TIP:** To browse the known motif databases, click here_
+link to: http://cistrome.org/~jian/motif_collection/databases/Cistrome/Cistrome.xml
+
+.. _here: http://cistrome.org/~jian/motif_collection/databases/Cistrome/Cistrome.xml
+
+-----
+
+**Output**
+
+- **BED output** contains the regions with the motif.
+- **Fasta output** contains the DNA sequences of motif.
+
+
+  </help>
+
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/seqpos.xml Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,152 @@
+<tool name="SeqPos motif tool" id="motif_denovo" version="0.1.0">
+  <description>Find motifs from given regions enriched near the centers</description>
+  <macros>
+    <import>tool_macros.xml</import>
+  </macros>
+  <!-- cistrome numpy jinja2: pulled in through the "requirements_seqpos" macro -->
+  <expand macro="requirements_seqpos" />
+  <!--
+    Runs MDSeqPos.py with all of its console output captured in the log
+    dataset, then publishes results/table.html as the HTML output and copies
+    the whole results directory into its extra_files_path. Optional flags
+    (-m, -d, maxmotif, -s) are only emitted when the parameter has a value.
+  -->
+  <command>
+MDSeqPos.py
+#include source=$ref_genome_seq_opts#
+#if $search_type != None and len(str($search_type)) > 0:
+  -m $search_type
+#end if
+$denovo
+-v -c --hcluster="$hcluster" -w "$width"
+#if str($maxmotif).strip() not in ('', 'None'):
+  --maxmotif=$maxmotif
+#end if
+-p "$pval"
+#if str($species_list) not in ('', 'None'):
+  -s $species_list
+#end if
+$bfile $bfile.metadata.dbkey > $log 2>&amp;1 &amp;&amp;
+cp results/table.html $output_html &amp;&amp;
+mkdir -p $output_html.extra_files_path &amp;&amp;
+cp -R results/* $output_html.extra_files_path
+  </command>
+  <expand macro="stdio"/>
+  <inputs>
+    <param format="bed" name="bfile" type="data" label="BED file (at most 5K lines. If you have more than 5K lines, please sort them and pick the top 5K lines first)" help="Tip: the chromosome in bed file cannot be something like 'chr1_xxxx'. You need to filter them out using the tool 'Filter and Sort -> Select' by 'NOT matching' for the pattern '^chr([0-9A-Za-z])+_'">
+      <validator type="unspecified_build" />
+    </param>
+
+    <expand macro="refGenomeSourceConditional"/>
+
+    <param name="search_type" type="select" multiple="true" display="checkboxes" force_select="true" optional="false" label="Select which motif database(s) to use">
+      <option value="cistrome.xml" selected="true">cistrome (Curated)</option>
+      <option value="pbm.xml">pbm</option>
+      <option value="y1h.xml">y1h</option>
+      <option value="transfac.xml">transfac</option>
+      <option value="hpdi.xml">hpdi</option>
+      <option value="jaspar.xml">jaspar</option>
+    </param>
+    <param name="denovo" type="boolean" truevalue="-d" falsevalue="" checked="false" label="Include denovo motif search"/>
+
+    <param name="species_list" type="select" multiple="true" display="checkboxes" optional="true" label="Select which species to filter the results by (Optional)">
+      <option value="hs,mm">Homo Sapien or Mus Musculus</option>
+      <option value="ce">Caenorhabditis Elegans</option>
+      <option value="dm">Drosophila Melanogaster</option>
+    </param>
+    <param name="width" type="integer" label="width of region to be scanned" value="600">
+      <validator type="in_range" max="10000" min="100" message="width is out of range, width has to be between 100 to 10000" />
+    </param>
+    <param name="pval" type="float" label="p-value cutoff" value="0.001">
+      <validator type="in_range" max="1" min="0" message="Pvalue is out of range, Pvalue has to be between 0 to 1" />
+    </param>
+    <param name="maxmotif" type="integer" label="max output hits. (0 means output all fit the pvalue cutoff)" value="0" min="0" optional="true" />
+    <param name="hcluster" type="text" label="The similarity cutoff for hierarchical clustering of the output (The higher, the more groups, 0 ~ 1)" value="0.8"/>
+  </inputs>
+  <outputs>
+    <data format="xml" name="output_xml" label="SeqPos xml output on ${bfile.name}" from_work_dir="results/denovo.xml">
+      <filter>denovo == True</filter>
+    </data>
+    <data format="html" name="output_html" label="SeqPos html output on ${bfile.name}"/>
+    <data format="txt" name="log" label="SeqPos Log on ${bfile.name}"/>
+  </outputs>
+  <help>
+The **SeqPos** tool will find motifs enriched in a set of
+regions. **SeqPos** uses the distances from motif positions to the peak
+summits ( center of the regions) to find the most enriched motifs near
+peak summits. **SeqPos** can scan all the motifs in TRANSFAC, Matha's
+Protein Binding Microarray ( a.k.a PBM) and Scot Wolfe's protein DNA
+binding database ( y1h). Also **SeqPos** can try to find *de novo*
+motifs using MDscan algorithm. At last, **SeqPos** can cluster the
+similar motifs in a cluster tree to help user filter out the redundant
+motifs. This tool is made by Cliff Meyer and Len Taing. A detail
+explanation of the algorithm can be found in the supplementary
+material of the paper "Nucleosome dynamics define transcriptional
+enhancers." (Nat Genet, 42(4):343-347) The tool was modified then by
+Jian Ma and Tao Liu. Version: 0.590.
+
+About our curated cistrome motif database: This database only
+includes human and mouse data. It puts data from Transfac,
+JASPAR, UniPROBE (pbm), hPDI together, also it includes the motifs derived
+from ChIP-seq data. After that we delete the motifs look similar from
+each other to keep a clean and smaller database. This database is a
+recommended one and always in updating.
+
+.. class:: infomark
+
+**TIP:** Please make sure the regions in your BED file are valid! If
+the region is out of boundary of chromosome, it will cause error. Also
+please avoid abnormal chromosome names.
+
+.. class:: infomark
+
+**TIP:** The running time is increasing with the number of
+regions. Please avoid using more than 10 thousand regions for input.
+
+.. class:: warningmark
+
+**NEED IMPROVEMENT**
+
+-----
+
+**Parameters**
+
+- **BED file** is the input file. It can be the output from peak
+  calling software. Please pay attention that the regions in the BED
+  file should not be out of boundary of chromosome.
+  *This file can only contain at most 5000 lines. If not, please
+  filter it using Galaxy:Filter and Sort tool*.
+
+- **Genome Assembly version** is the UCSC database version.
+- **Motif databases** is the known motif collections in Cistrome,
+  including TRANSFAC, PBM and Scot wolfe's database. You can select
+  *de novo motif search* to enable *de novo* motif scan.
+- **Species list** are the species that you want to filter the results
+  with. Select none of the species to see all of the results.
+- **Width of regions** is the region to scan for motifs around peak
+  summits ( centers of input regions).
+- **P-value cutoff** can be used to filter the results.
+
+.. class:: infomark
+
+**TIP:** To browse the known motif databases, click here_
+
+.. _here: http://cistrome.org/~jian/motif_collection/databases/Cistrome/Cistrome.xml
+
+-----
+
+**Output**
+
+- **HTML output** can be open in web browser. Users can browse the
+  result in either the middle list view of the page or the bottom
+  cluster tree view, and the detail of motif can be seen in the top
+  detail view. The list view is sortable at every field. The detail
+  view provides two buttons to open the detail information in a
+  separate webpage, or to show the PSSM of the motif.
+- **XML output** is the XML formatted output.
+- **LOG file** is for job log. If you see errors, please attach this
+  in the bug report.
+
+  </help>
+
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/cistrome_assembly.loc.sample Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). It uses the same layout as all_fasta.loc, the file
+#generated by the script extract_fasta.py. The format is (white space
+#characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, cistrome_assembly.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your cistrome_assembly.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,10 @@
+<tables>
+  <!-- cistrome_assembly table: four columns per row, read from
+       tool-data/cistrome_assembly.loc; lines starting with '#' are comments -->
+  <table
+      name="cistrome_assembly"
+      comment_char="#">
+    <columns>value, dbkey, name, path</columns>
+    <file path="tool-data/cistrome_assembly.loc" />
+  </table>
+</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<!-- Each package points at a pinned changeset of a repository on the test tool shed. -->
+<tool_dependency>
+  <package name="cistrome" version="2014-09-29">
+    <repository
+        name="package_cistrome_2014_09_29"
+        owner="jjohnson"
+        changeset_revision="b9b48eb563d1"
+        toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="numpy" version="1.7.1">
+    <repository
+        name="package_numpy_1_7"
+        owner="iuc"
+        changeset_revision="84125ffacb90"
+        toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="jinja2" version="2.7.3">
+    <repository
+        name="package_jinja2_2_7_3"
+        owner="jjohnson"
+        changeset_revision="6df9843aca2f"
+        toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="R" version="2.15.0">
+    <repository
+        name="package_r_2_15_0"
+        owner="devteam"
+        changeset_revision="3a70cdc41d21"
+        toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="bioc_seqlogo" version="1.24.0">
+    <repository
+        name="package_bioc_seqlogo_1_24_0"
+        owner="jjohnson"
+        changeset_revision="2c9adf14664d"
+        toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_macros.xml Tue Oct 14 10:07:00 2014 -0400
@@ -0,0 +1,97 @@
+<macros>
+
+  <!-- Package requirements: numpy and cistrome. -->
+  <macro name="requirements">
+    <requirements>
+      <requirement type="package" version="1.7.1">numpy</requirement>
+      <requirement type="package" version="2014-09-29">cistrome</requirement>
+    </requirements>
+  </macro>
+  <!-- Same two packages as "requirements", plus jinja2, R and bioc_seqlogo. -->
+  <macro name="requirements_seqpos">
+    <requirements>
+      <requirement type="package" version="1.7.1">numpy</requirement>
+      <requirement type="package" version="2014-09-29">cistrome</requirement>
+      <requirement type="package" version="2.7.3">jinja2</requirement>
+      <requirement type="package" version="2.15.0">R</requirement>
+      <requirement type="package" version="1.24.0">bioc_seqlogo</requirement>
+    </requirements>
+  </macro>
+  <!-- Any non-zero exit code, negative or positive, is reported as a fatal error. -->
+  <macro name="stdio">
+    <stdio>
+      <exit_code range=":-1" level="fatal" description="Error: Cannot open file" />
+      <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+  </macro>
+  <!-- Reference genome chooser: data table entry, history dataset, or a fixed list of installed assemblies. -->
+  <macro name="refGenomeSourceConditional">
+    <conditional name="refGenomeSource">
+      <param name="genomeSource" type="select" label="Use a built in reference genome or own from your history" help="Genome Reference Fasta sequence">
+        <option value="cached" selected="True">Use a built-in genome</option>
+        <option value="history">Use a genome from history</option>
+        <option value="installed">cistrome installation</option>
+      </param>
+      <when value="cached">
+        <param name="cistrome_assembly" type="select" label="Cistrome Genome Assembly">
+          <options from_data_table="cistrome_assembly"/>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="cistrome_assembly" metadata_name="dbkey" label="Select the reference genome" />
+      </when> <!-- history -->
+      <when value="installed">
+        <param name="cistrome_static_assembly" type="select" label="Cistrome Genome Assembly">
+          <option value="humanhg18">humanhg18</option>
+          <option value="humanhg19">humanhg19</option>
+          <option value="UCSC_MM7">UCSC_MM7</option>
+          <option value="mousemm8_Mar2006">mousemm8_Mar2006</option>
+          <option value="UCSC_MM9">UCSC_MM9</option>
+          <option value="UCSC_CE4">UCSC_CE4</option>
+          <option value="UCSC_CE6">UCSC_CE6</option>
+          <option value="UCSC_DM3">UCSC_DM3</option>
+        </param>
+      </when> <!-- installed -->
+    </conditional> <!-- refGenomeSource -->
+  </macro>
+
+  <!-- Cheetah fragment that turns the refGenomeSource choice into the -g option.
+       NOTE(review): for "installed", no -g is emitted when neither the configured
+       library path nor the assembly directory under the tool data path exists. -->
+  <template name="ref_genome_seq_opts">
+#if $refGenomeSource.genomeSource == 'cached':
+  -g $refGenomeSource.cistrome_assembly.fields.path
+#elif $refGenomeSource.genomeSource == 'history':
+  -g $refGenomeSource.ownFile.extra_files_path
+#else
+  #if hasattr($__app__.config,'cistrome_static_library_path'):
+    -g ${$__app__.config.cistrome_static_library_path}/assembly/$refGenomeSource.cistrome_static_assembly
+  #elif $os.path.exists($os.path.join($__tool_data_path__,'assembly'))
+    -g $__tool_data_path__/assembly/$refGenomeSource.cistrome_static_assembly
+  #end if
+#end if
+  </template>
+
+  <!-- Cheetah globals holding the dollar, greater-than, less-than and ampersand characters. -->
+  <template name="script_chars">
+#set global $dollar = chr(36)
+#set global $gt = chr(62)
+#set global $lt = chr(60)
+#set global $ad = chr(38)
+  </template>
+
+  <token name="@EXTERNAL_DOCUMENTATION@">
+
+For details about this application, please go to:
+  https://bitbucket.org/cistrome/cistrome-harvard/wiki/Home
+
+  </token>
+  <token name="@CITATION_SECTION@">------
+
+**Citation**
+
+For the underlying tool, please cite the following publication:
+"Cistrome: an integrative platform for transcriptional regulation studies," Liu T, Ortiz JA, Taing L, Meyer CA, Lee B, Zhang Y, Shin H, Wong SS, Ma J, Lei Y, Pape UJ, Poidinger M, Chen Y, Yeung K, Brown M, Turpaz Y, Liu XS. Genome Biol. 2011 Aug 22;12(8):R83.
+
+  </token>
+</macros>