Mercurial > repos > devteam > divide_pg_snp
changeset 0:137ec95c8ca6
Uploaded tool tarball.
author | devteam |
---|---|
date | Tue, 20 Aug 2013 09:39:35 -0400 |
parents | |
children | e342ab27851f |
files | dividePgSnpAlleles.pl dividePgSnpAlleles.xml dividePgSnpAlleles.xml.bak test-data/dividePgSnp_input.pgSnp test-data/dividePgSnp_output.txt |
diffstat | 5 files changed, 213 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl -w
# Source file: dividePgSnpAlleles.pl (added in changeset 0:137ec95c8ca6, Tue Aug 20 09:39:35 2013 -0400)
use strict;
use warnings;

# Divide the alleles and their information into separate columns for
# pgSnp-like files.  Keep any additional columns beyond the pgSnp ones.
#
# Usage: dividePgSnpAlleles.pl [-ref=N] [input_file]
#
#   -ref=N      1-based number of the column holding the reference allele.
#               When given, that column's value is used as the second allele
#               of single-allele (homozygous) rows instead of "N".
#               -ref=0 means "no reference column".
#   input_file  Tab-separated pgSnp-like file.  When omitted or "-", input is
#               read from STDIN.
#
# Each input row is expected to hold, tab separated:
#   cols 1-3  copied through unchanged
#   col 4     alleles separated by "/"
#   col 5     allele count
#   col 6     per-allele frequencies separated by ","
#   col 7     per-allele scores separated by ","
#   col 8+    optional extra columns, appended unchanged to the output
#
# Output (STDOUT), tab separated:
#   col1 col2 col3 allele1 freq1 score1 allele2 freq2 score2 [extra columns]
# Rows with more than 2 alleles are skipped.

my $ref_idx;    # 0-based index of the reference-allele column; undef if none
my $in;         # input file name; undef or "-" means STDIN

# Anchor the option pattern so that a file name which merely contains
# "-ref=<digits>" is not mistaken for the option.
if (@ARGV && $ARGV[0] =~ /\A-ref=(\d+)\z/) {
    # Columns are 1-based on the command line; 0 is not a valid column.
    $ref_idx = $1 - 1 if $1 > 0;
    shift @ARGV;
}
if (@ARGV) {
    $in = shift @ARGV;
}

my $fh;
my $reading_stdin = (!defined $in || $in eq '-');
if ($reading_stdin) {
    $fh = \*STDIN;
}
else {
    # Three-argument open: the file name can never be interpreted as a mode
    # or a pipe.
    open($fh, '<', $in) or die "Couldn't open $in, $!\n";
}

while (my $line = <$fh>) {
    chomp $line;
    next if $line =~ /\A\s*\z/;    # nothing to divide on an empty line

    my @f = split(/\t/, $line);
    if (@f < 7) {
        warn "Skipping line $.: expected at least 7 tab-separated columns\n";
        next;
    }

    next if $f[4] > 2;    # skip those with more than 2 alleles

    my @alleles = split(/\//, $f[3]);
    my @freqs   = split(/,/,  $f[5]);
    my @scores  = split(/,/,  $f[6]);

    if ($f[4] == 1) {
        # Homozygous: add a second allele with frequency 0 and score 0.
        # Test definedness, not truth: index 0 (-ref=1) is a valid column.
        my $second = 'N';
        if (defined $ref_idx && defined $f[$ref_idx]) {
            $second = $f[$ref_idx];
        }
        push(@alleles, $second);
        push(@freqs,   0);
        push(@scores,  0);
    }

    my @row = (
        @f[0 .. 2],
        $alleles[0], $freqs[0], $scores[0],
        $alleles[1], $freqs[1], $scores[1],
    );
    push(@row, @f[7 .. $#f]);    # extra columns, if any (empty slice otherwise)
    print join("\t", @row), "\n";
}

close($fh) unless $reading_stdin;

exit;
<!-- Source file: dividePgSnpAlleles.xml (added in changeset 0:137ec95c8ca6, Tue Aug 20 09:39:35 2013 -0400) -->
<!--
  Galaxy tool wrapper for dividePgSnpAlleles.pl.
  Passes the input dataset (and, optionally, the 1-based reference-allele
  column as -ref=N) to the script and redirects its standard output to
  out_file1.
-->
<tool id="dividePgSnp" version="1.0.0" name="Separate pgSnp alleles" hidden="false">
  <description>into columns</description>
  <command interpreter="perl">
    #if $refcol.ref == "yes" #dividePgSnpAlleles.pl -ref=$refcol.ref_column $input1 > $out_file1
    #else #dividePgSnpAlleles.pl $input1 > $out_file1
    #end if
  </command>
  <inputs>
    <param format="interval" name="input1" type="data" label="pgSnp dataset" />
    <conditional name="refcol">
      <param name="ref" type="select" label="Dataset has a column with the reference allele:">
        <option value="yes">yes</option>
        <option value="no" selected="true">no</option>
      </param>
      <when value="yes">
        <param name="ref_column" type="data_column" data_ref="input1" label="Column with reference allele" />
      </when>
      <when value="no"> <!-- do nothing -->
      </when>
    </conditional>
  </inputs>
  <outputs>
    <data format="interval" name="out_file1" />
  </outputs>
  <tests>
    <test>
      <param name='input1' value='dividePgSnp_input.pgSnp' ftype='interval' />
      <param name='ref' value='no' />
      <!-- The name must match the <data> element declared in <outputs>. -->
      <output name="out_file1" file="dividePgSnp_output.txt" />
    </test>
  </tests>

  <help>
**Dataset formats**

The input dataset is of Galaxy datatype interval_ with the columns specified
for pgSnp_.
Any additional columns beyond the pgSnp defined columns will be appended to
the output.
The output dataset is in interval_ format. (`Dataset missing?`_)

.. _interval: ./static/formatHelp.html#interval
.. _Dataset missing?: ./static/formatHelp.html
.. _pgSnp: ./static/formatHelp.html#pgSnp

**What it does**

This separates the alleles from a pgSnp dataset into separate columns,
as well as the frequencies and scores that go with the alleles. It will skip
any positions with more than 2 alleles. If only a single allele is given then "N"
will be used for the second, with a frequency and score of zero. Or, if a
column with reference alleles is provided,
the value in that column will be used in place of the "N" for single alleles.

-----

**Examples**

- input pgSnp file::

    chr1 256 257 A/C 2 3,4 10,20
    chr1 56100 56101 A 1 5 30
    chr1 77052 77053 A/G 2 6,7 40,50
    chr1 110904 110905 A 1 8 60
    etc.

- output::

    chr1 256 257 A 3 10 C 4 20
    chr1 56100 56101 A 5 30 N 0 0
    chr1 77052 77053 A 6 40 G 7 50
    chr1 110904 110905 A 8 60 N 0 0
    etc.

</help>
</tool>
<!-- Source file: dividePgSnpAlleles.xml.bak (added in changeset 0:137ec95c8ca6, Tue Aug 20 09:39:35 2013 -0400) -->
<!--
  Backup copy of the Galaxy tool wrapper for dividePgSnpAlleles.pl.
  Passes the input dataset (and, optionally, the 1-based reference-allele
  column as -ref=N) to the script and redirects its standard output to
  out_file1.
-->
<tool id="dividePgSnp" name="Separate pgSnp alleles" hidden="false">
  <description>into columns</description>
  <command interpreter="perl">
    #if $refcol.ref == "yes" #dividePgSnpAlleles.pl -ref=$refcol.ref_column $input1 > $out_file1
    #else #dividePgSnpAlleles.pl $input1 > $out_file1
    #end if
  </command>
  <inputs>
    <param format="interval" name="input1" type="data" label="pgSnp dataset" />
    <conditional name="refcol">
      <param name="ref" type="select" label="Dataset has a column with the reference allele:">
        <option value="yes">yes</option>
        <option value="no" selected="true">no</option>
      </param>
      <when value="yes">
        <param name="ref_column" type="data_column" data_ref="input1" label="Column with reference allele" />
      </when>
      <when value="no"> <!-- do nothing -->
      </when>
    </conditional>
  </inputs>
  <outputs>
    <data format="interval" name="out_file1" />
  </outputs>
  <tests>
    <test>
      <param name='input1' value='dividePgSnp_input.pgSnp' ftype='interval' />
      <param name='ref' value='no' />
      <!-- The name must match the <data> element declared in <outputs>. -->
      <output name="out_file1" file="dividePgSnp_output.txt" />
    </test>
  </tests>

  <help>
**Dataset formats**

The input dataset is of Galaxy datatype interval_ with the columns specified
for pgSnp_.
Any additional columns beyond the pgSnp defined columns will be appended to
the output.
The output dataset is in interval_ format. (`Dataset missing?`_)

.. _interval: ./static/formatHelp.html#interval
.. _Dataset missing?: ./static/formatHelp.html
.. _pgSnp: ./static/formatHelp.html#pgSnp

**What it does**

This separates the alleles from a pgSnp dataset into separate columns,
as well as the frequencies and scores that go with the alleles. It will skip
any positions with more than 2 alleles. If only a single allele is given then "N"
will be used for the second, with a frequency and score of zero. Or, if a
column with reference alleles is provided,
the value in that column will be used in place of the "N" for single alleles.

-----

**Examples**

- input pgSnp file::

    chr1 256 257 A/C 2 3,4 10,20
    chr1 56100 56101 A 1 5 30
    chr1 77052 77053 A/G 2 6,7 40,50
    chr1 110904 110905 A 1 8 60
    etc.

- output::

    chr1 256 257 A 3 10 C 4 20
    chr1 56100 56101 A 5 30 N 0 0
    chr1 77052 77053 A 6 40 G 7 50
    chr1 110904 110905 A 8 60 N 0 0
    etc.

</help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dividePgSnp_input.pgSnp Tue Aug 20 09:39:35 2013 -0400 @@ -0,0 +1,10 @@ +chr1 256 257 A/C 2 4,5 0,0 +chr1 56100 56101 A 1 8 0 +chr1 77052 77053 A/G 2 3,5 0,0 +chr1 110904 110905 A 1 5 0 +chr1 160592 160593 G 1 3 0 +chr1 640353 640354 G 1 1 0 +chr1 695314 695315 A 1 7 0 +chr1 713681 713682 A 1 8 0 +chr1 713965 713966 A/G 2 3,2 0,0 +chr1 714056 714057 A/G 2 1,5 0,0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dividePgSnp_output.txt Tue Aug 20 09:39:35 2013 -0400 @@ -0,0 +1,10 @@ +chr1 256 257 A 4 0 C 5 0 +chr1 56100 56101 A 8 0 N 0 0 +chr1 77052 77053 A 3 0 G 5 0 +chr1 110904 110905 A 5 0 N 0 0 +chr1 160592 160593 G 3 0 N 0 0 +chr1 640353 640354 G 1 0 N 0 0 +chr1 695314 695315 A 7 0 N 0 0 +chr1 713681 713682 A 8 0 N 0 0 +chr1 713965 713966 A 3 0 G 2 0 +chr1 714056 714057 A 1 0 G 5 0