changeset 0:8e4ebcd58df3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damid_deseq2_to_bedgraph commit 98722d2ca8205595f032361072aaab450e5f4f83
author mvdbeek
date Fri, 14 Dec 2018 06:24:38 -0500
parents
children 35011939bc8b
files damid_interpolate.xml damid_to_bedgraph.py test-data/line_count_100.tab test-data/line_count_50.tab test-data/test_data.tab
diffstat 5 files changed, 110 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/damid_interpolate.xml	Fri Dec 14 06:24:38 2018 -0500
@@ -0,0 +1,34 @@
+<tool id="damid_interpolate" name="Interpolate GATC signal" version="0.1.0">
+    <description>from DESeq2 processed DamID data</description>
+    <requirements>
+        <requirement type="package" version="2.4.1">traces</requirement>
+        <requirement type="package" version="7">click</requirement>
+        <requirement type="package" version="0.23.4">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/damid_to_bedgraph.py' '$input' '$output' --resolution $resolution
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Select DESeq2 result file for GATC sites"/>
+        <param argument="--resolution" type="integer" value="50" label="Select resolution for intervals"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="bedgraph"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="test_data.tab"/>
+            <output name="output" value="line_count_50.tab"/>
+        </test>
+        <test>
+            <param name="input" value="test_data.tab"/>
+            <param name="resolution" value="100"/>
+            <output name="output" value="line_count_100.tab"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Takes a deseq2 output file and generate a bedgraph file, where the score column
+is the log2 fold change value interpolated for intervals of the size selected in
+"Select resolution for intervals".
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/damid_to_bedgraph.py	Fri Dec 14 06:24:38 2018 -0500
@@ -0,0 +1,57 @@
+from collections import OrderedDict
+
+import click
+import numpy as np
+import pandas as pd
+import traces
+
+
+def order_index(df):
+    """
+    Split chr_start_stop in df index and order by chrom and start.
+    """
+    idx = df.index.str.split('_')
+    idx = pd.DataFrame.from_records(list(idx))
+
+    idx.columns = ['chr', 'start', 'stop']
+    idx = idx.astype(dtype={"chr": "object",
+                            "start": "int32",
+                            "stop": "int32"})
+    coordinates = idx.sort_values(['chr', 'start'])
+    df.index = np.arange(len(df.index))
+    df = df.loc[coordinates.index]
+    df = coordinates.join(df)
+    # index is center of GATC site
+    df.index = df['start'] + 2
+    return df
+
+
+def interpolate_values(df, sampling_width=100):
+    result = []
+    for chrom in df['chr'].unique():
+        chrom_df = df[df['chr'] == chrom]
+        time_series = traces.TimeSeries(chrom_df['log2FC'])
+        s = pd.DataFrame.from_records(time_series.sample(sampling_width, interpolate='linear'))
+        # Calculate new start and end of interpolated region
+        start = s[0] - int(sampling_width / 2)
+        start.loc[start < 0] = 1
+        end = s[0] + int(sampling_width / 2)
+        result.append(pd.DataFrame(OrderedDict([('chr', chrom), ('start', start), ('end', end), ('score', s[1])])))
+    return pd.concat(result)
+
+
+@click.command()
+@click.argument('input_path', type=click.Path(exists=True), required=True)
+@click.argument('output_path', type=click.Path(exists=False), required=True)
+@click.option('--resolution', help="Interpolate log2 fold change at this resolution (in basepairs)", default=50)
+def deseq2_to_bedgraph(input_path, output_path, resolution=50):
+    """Convert deseq2 output on GATC fragments to bedgraph file with interpolated values."""
+    df = pd.read_csv(input_path, sep='\t', header=None, index_col=0, usecols=[0, 2], names=['GATC', 'log2FC'])
+    df = df[~df.index.str.contains('\.')]
+    df = order_index(df)
+    r = interpolate_values(df, sampling_width=resolution)
+    r.to_csv(output_path, sep='\t', header=None, index=None)
+
+
+if __name__ == '__main__':
+    deseq2_to_bedgraph()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/line_count_100.tab	Fri Dec 14 06:24:38 2018 -0500
@@ -0,0 +1,5 @@
+4	1	53	6.113789605059059
+4	53	153	5.6056573997736905
+4	153	253	5.709334556659599
+4	253	353	4.95034623243659
+4	353	453	5.7152495881064205
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/line_count_50.tab	Fri Dec 14 06:24:38 2018 -0500
@@ -0,0 +1,9 @@
+4	1	28	6.113789605059059
+4	28	78	5.859723502416375
+4	78	128	5.6056573997736905
+4	128	178	5.6574959782166445
+4	178	228	5.709334556659599
+4	228	278	5.3298403945480946
+4	278	328	4.95034623243659
+4	328	378	5.3327979102715055
+4	378	428	5.7152495881064205
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_data.tab	Fri Dec 14 06:24:38 2018 -0500
@@ -0,0 +1,5 @@
+4_1_101	179.471138895981	6.11378960505906	0.621838986245469	9.83178883970079	8.2146464523617e-23	5.91609955253763e-19
+4_101_201	294.81296137271	5.60565739977369	0.577313080365325	9.70990886994353	2.73581605967775e-22	1.38780673739166e-18
+4_201_301	169.22778018355	5.7093345566596	0.605350118969888	9.43145855224256	4.0442550874846e-21	1.15555073230549e-17
+4_301_401	412.221939093451	4.95034623243659	0.53002737065317	9.33979357770923	9.65221128246454e-21	2.46758781436206e-17
+4_401_501	165.881402812843	5.71524958810642	0.620499842499124	9.21071883771587	3.23948149277376e-20	6.66609779385658e-17