changeset 2:d4755a95c52d draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit f6a8d0f8eb4225276e3db9e5ca04b7d11bac5bf6-dirty
author ebi-gxa
date Thu, 16 Jul 2020 09:28:56 +0000
parents 540cc98d4587
children a4bfd4a146ec
files atlas-retrieve-macros.xml retrieve-scxa.xml
diffstat 2 files changed, 126 insertions(+), 62 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/atlas-retrieve-macros.xml	Thu Jul 16 09:28:56 2020 +0000
@@ -0,0 +1,30 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.0.0</token>
+    <token name="@HELP@">More information can be found at https://github.com/ebi-gene-expression-group/atlas-data-import</token>
+    <token name="@PROFILE@">18.01</token>
+    <xml name="requirements">
+      <requirements>
+        <requirement type="package" version="0.0.10">atlas-data-import</requirement>
+            <yield/>
+      </requirements>
+    </xml>
+    <token name="@VERSION_HISTORY@"><![CDATA[
+**Version history**
+0.0.6+galaxy0: Initial contribution. Andrey Solovyev, Expression Atlas team https://www.ebi.ac.uk/gxa/home at EMBL-EBI https://www.ebi.ac.uk/.
+    ]]></token>
+    <xml name="citations">
+      <citations>
+        <citation type="bibtex">
+          @misc{github-atlas-data-import.git,
+            author = {Andrey Solovyev, EBI Gene Expression Team},
+            year = {2020},
+            title = {Scripts for extracting expression- and metadata from SCXA in a programmatic way},
+            publisher = {GitHub},
+            journal = {GitHub repository},
+            url = {https://github.com/ebi-gene-expression-group/atlas-data-import.git},
+          }
+        </citation>
+        <yield />
+      </citations>
+    </xml>
+</macros>
--- a/retrieve-scxa.xml	Thu Apr 16 08:50:10 2020 +0000
+++ b/retrieve-scxa.xml	Thu Jul 16 09:28:56 2020 +0000
@@ -1,61 +1,79 @@
-<?xml version="1.0" encoding="utf-8"?>
-<tool id="retrieve_scxa" name="EBI SCXA Data Retrieval" version="v0.0.2+galaxy2">
-  <description>Retrieves expression matrixes and metadata from EBI Single Cell Expression Atlas (SCXA)</description>
-  <requirements>
-    <requirement type="package" version="1.20.1">wget</requirement>
-  </requirements>
-  <command detect_errors="exit_code"><![CDATA[
-
-#if str($matrix_type) == "tpm":
+<tool id="retrieve_scxa" name="Atlas import: get experiment data" version="@TOOL_VERSION@+galaxy0"  profile="@PROFILE@">
+    <description>Retrieve expression matrices and metadata from EBI Single Cell Expression Atlas (SCXA)</description>
+    <macros>
+         <import>atlas-retrieve-macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+        ln -s "${accession_code}_${matrix_type}/10x_data/matrix.mtx" matrix.mtx &&
+        ln -s "${accession_code}_${matrix_type}/10x_data/genes.tsv" genes.tsv &&
+        ln -s "${accession_code}_${matrix_type}/10x_data/barcodes.tsv" barcodes.tsv &&
+        ln -s "${accession_code}_${matrix_type}/sdrf.txt" sdrf.txt &&
+        ln -s "${accession_code}_${matrix_type}/condensed-sdrf.tsv" condensed-sdrf.tsv &&
+        ln -s "${accession_code}_${matrix_type}/idf.txt" idf.txt &&
+        ln -s "${accession_code}_${matrix_type}/marker_genes_${number_of_clusters}.tsv" marker_genes_${number_of_clusters}.tsv &&
+        ln -s "${accession_code}_${matrix_type}/exp_design.tsv" exp_design.tsv &&
 
-wget -O exp_quant.zip
-    'https://www.ebi.ac.uk/gxa/sc/experiment/${accession}/download/zip?fileType=quantification-filtered&accessKey=' &&
-unzip exp_quant.zip;
-mv '${accession}'.expression_tpm.mtx ${matrix_mtx} &&
-awk '{OFS="\t"; print \$2,\$2}' '${accession}'.expression_tpm.mtx_rows > ${genes_tsv} &&
-cut -f2 '${accession}'.expression_tpm.mtx_cols > ${barcode_tsv};
-
-#else if str($matrix_type) == "raw":
-
-wget -O ${matrix_mtx} 'ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/sc_experiments/${accession}/${accession}.aggregated_filtered_counts.mtx';
-wget -qO - 'ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/sc_experiments/${accession}/${accession}.aggregated_filtered_counts.mtx_cols' | cut -f2 > ${barcode_tsv};
-wget -qO - 'ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/sc_experiments/${accession}/${accession}.aggregated_filtered_counts.decorated.mtx_rows' |
-  awk -F'\t' '{ if (length($2) == 0) { print $1"\t"$1 } else { print $0 } }' > ${genes_tsv};
-
-#end if
-
-wget -O exp_design.tsv
-    'https://www.ebi.ac.uk/gxa/sc/experiment/${accession}/download?fileType=experiment-design&accessKey=';
+        get_experiment_data.R --accesssion-code "${accession_code}" --matrix-type "${matrix_type}" --get-sdrf "${get_sdrf}" --get-condensed-sdrf "${get_condensed_sdrf}" --get-marker-genes "${get_marker_genes}"
 
-]]></command>
-
-  <inputs>
-    <param name="accession" type="text" value="E-GEOD-100058" label="SC-Atlas experiment accession" help="EBI Single Cell Atlas accession for the experiment that you want to retrieve."/>
-    <param name="matrix_type" type="select" label="Choose the type of matrix to download" help="Raw filtered counts or (non-filtered) TPMs">
-      <option value="raw" selected="true">Raw filtered counts</option>
-      <option value="tpm">TPMs</option>
-    </param>
-  </inputs>
-
-  <outputs>
-    <data name="matrix_mtx" format="txt" label="${tool.name} on ${on_string} ${accession} matrix.mtx (${matrix_type.value_label})"/>
-    <data name="genes_tsv" format="tsv" label="${tool.name} on ${on_string} ${accession} genes.tsv (${matrix_type.value_label})"/>
-    <data name="barcode_tsv" format="tsv" label="${tool.name} on ${on_string} ${accession} barcodes.tsv (${matrix_type.value_label})"/>
-    <data name="design_tsv" format="tsv" from_work_dir="exp_design.tsv" label="${tool.name} on ${on_string} ${accession} exp_design.tsv"/>
-  </outputs>
-
-  <tests>
-    <test>
-      <param name="accession" value="E-GEOD-100058"/>
-      <param name="matrix_type" value="tpm"/>
-      <output name="matrix_mtx" file="E-GEOD-100058.expression_tpm.mtx" ftype="txt"/>
-      <output name="genes_tsv" file="E-GEOD-100058.genes.tsv" ftype="tsv"/>
-      <output name="barcode_tsv" file="E-GEOD-100058.barcodes.tsv" ftype="tsv"/>
-      <output name="design_tsv" file="E-GEOD-100058.exp_design.tsv" ftype="tsv"/>
-    </test>
-  </tests>
-
-  <help><![CDATA[
+        #if $config_file 
+        --config-file "${config_file}"
+        #end if        
+        #if $get_exp_design
+        --get-exp-design "${get_exp_design}" 
+        #end if 
+        #if $decorated_rows 
+        --decorated-rows "${decorated_rows}" 
+        #end if
+        #if $use_default_expr_names 
+        --use-default-expr-names "${use_default_expr_names}" 
+        #end if
+        #if $get_idf 
+        --get-idf "${get_idf}" 
+        #end if
+        #if $number_of_clusters 
+        --number-of-clusters  "${number_of_clusters}" 
+        #end if
+    ]]></command>
+    <inputs>
+        <param type="text" name="accession_code" label="SC-Atlas experiment accession" value="E-GEOD-100058" help="EBI Single Cell Atlas accession for the experiment that you want to retrieve." />
+        <param type="select" name="matrix_type" label="Choose the type of matrix to download" help="Type of matrix to be imported">
+            <option value="RAW">Raw</option>
+            <option value="FILTERED">Filtered Counts</option>
+            <option value="TPM">TPM-normalised</option>
+            <option value="CPM">CPM-normalised</option>
+        </param>
+        <param type="boolean" name="get_sdrf" checked="false" label="Import SDRF file" help="Boolean indicating whether SDRF file needs to be imported" />
+        <param type="boolean" name="get_exp_design" checked="false" label="Import experiment design file" help="Boolean indicating whether experiment design file needs to be imported" />
+        <param type="boolean" name="get_idf" checked="false" label="Import IDF file" help="Boolean indicating whether IDF file needs to be imported" />
+        <param type="boolean" name="get_condensed_sdrf" checked="false" label="Get condensed SDRF file" help="Boolean indicating whether condensed SDRF file needs to be imported" />
+        <param type="boolean" name="get_marker_genes" checked="false" label="Import marker genes" help="Boolean indicating whether marker genes should be imported" />
+        <param type="data" name="config_file" label="Config file" optional="true" format="yml" help="Config file with user-provided parameters" />
+        <param type="boolean" name="decorated_rows" checked="false" label="Decorated rows" help="Boolean indicating whether a decorated version of the rows should be imported" />
+        <param type="boolean" name="use_default_expr_names" checked="false"  label="Use default expr names" help="Should default (non 10x-type) file names be used for expression data? Default: FALSE" />
+        <param type="integer" name="number_of_clusters" value="0" label="Number of clusters" help="Number of clusters in marker genes file" />
+    </inputs>
+    <outputs>
+        <data name="expr_mtx" format="txt" from_work_dir="matrix.mtx" label="${tool.name} on ${on_string} ${accession_code} matrix.mtx (${matrix_type.value_label})" />
+        <data name="barcodes" format="txt" from_work_dir="barcodes.tsv" label="${tool.name} on ${on_string} ${accession_code} barcodes.tsv (${matrix_type.value_label})" />
+        <data name="genes" format="txt" from_work_dir="genes.tsv" label="${tool.name} on ${on_string} ${accession_code} genes.tsv (${matrix_type.value_label})" />
+        <data name="sdrf" format="txt" from_work_dir="sdrf.txt" label="${tool.name} on ${on_string} ${accession_code} sdrf.txt (${matrix_type.value_label})" >
+            <filter>get_sdrf</filter>
+        </data>
+        <data name="condensed_sdrf" format="txt" from_work_dir="condensed-sdrf.tsv" label="${tool.name} on ${on_string} ${accession_code} condensed-sdrf.tsv (${matrix_type.value_label})" >
+            <filter>get_condensed_sdrf</filter>
+        </data>
+        <data name="idf" format="txt" from_work_dir="idf.txt" label="${tool.name} on ${on_string} ${accession_code} idf.txt (${matrix_type.value_label})">
+            <filter>get_idf</filter>
+        </data>
+        <data name="marker_genes" from_work_dir="marker_genes_${number_of_clusters}.tsv" format="txt"  >
+            <filter>get_marker_genes</filter>
+        </data>
+        <data name="exp_design" from_work_dir="exp_design.tsv" format="txt"  >
+            <filter>get_exp_design</filter>
+        </data>
+    </outputs>
+    <help><![CDATA[
 =================================================================================
 Gene expression analysis in single cells across species and biological conditions
 =================================================================================
@@ -78,7 +96,10 @@
 To use it, simply set the accession for the desired experiment and choose the type of
 matrix that you want to download:
 
-:Raw filtered counts:
+:Raw counts:
+  Un-normalised, unfiltered version of the expression data. 
+
+:Filtered counts:
   This should be the default choice for running clustering and another analysis
   methods where you will introduce scaling and normalization of the data. The filtering
   is based on the quality control applied by iRAP prior to pseudo-alignment and quantification.
@@ -90,6 +111,9 @@
   particularities in the current Atlas SC pipeline, TPMs available here are not filtered.
   **Note: droplet databases won't have TPM data**
 
+:CPMs:
+  CPM stands for Counts Per Million. Like TPMs, these matrices are already normalised/scaled. You should keep this in mind when using this data with methods that will try to normalise the data as part of their procedure.
+
 Outputs will be:
 
 :Matrix (txt):
@@ -106,14 +130,24 @@
   Identifiers for the cells, samples or runs of the data matrix. The file is ordered
   to match the columns of the matrix.
 
+Optional outputs: 
+
 :Experiment Design file (tsv):
   Contains metadata for the different cells/samples/runs of the experiment.
   Please note that this file is generated before the filtering step, and while not
   often, it might be the case that it contains more cells/samples/runs than the matrix.
 
-]]></help>
-  <citations>
-    <citation type="doi">10.1093/nar/gkv1045</citation>
-    <citation type="doi">10.1101/2020.04.08.032698</citation>
-  </citations>
+:SDRF file (txt):
+  Similar to the Experiment Design file, it contains information on individual cells/sequencing runs. It might also contain information on technical duplicates.
+
+:IDF file (txt): 
+  The IDF file holds general information about the sequencing experiment and explains how to interpret the fields in the SDRF/metadata files.
+  
+:Marker gene file (txt):
+  File containing information on marker genes that differentiate cell types present in the sequencing experiment. 
+
+@HELP@
+@VERSION_HISTORY@
+    ]]></help>
+    <expand macro="citations" />
 </tool>