Mercurial > repos > jjohnson > samtools_split_by_chrom
changeset 1:6fb39843d37d draft default tip
planemo upload commit f3f0bef4a450aafab3c6b05a27647471f93b22f3
author | jjohnson |
---|---|
date | Wed, 22 Mar 2017 17:22:35 -0400 |
parents | a30dd3c77b30 |
children | |
files | samtools_split_by_chrom.xml test-data/test.bam |
diffstat | 2 files changed, 49 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/samtools_split_by_chrom.xml Wed Nov 30 16:46:40 2016 -0500 +++ b/samtools_split_by_chrom.xml Wed Mar 22 17:22:35 2017 -0400 @@ -9,39 +9,65 @@ <expand macro="version_command"></expand> <command> <![CDATA[ + #import re + #set $name = $re.sub('\.bam$','',$input_bam.name) + #if str($refs) != 'None': + #set ref_list = str($refs).split(",") + #else + #set ref_list = [$re.sub('^.*__sq__(.+)__sq__.*$','\\1',n) if n.find('__sq__') >= 0 else n for n in str($input_bam.metadata.reference_names).split(',')] + #end if mkdir -p outputs && ln -s "${input_bam}" temp_input.bam && ln -s "${input_bam.metadata.bam_index}" temp_input.bam.bai - #for $ref in str( $refs ).split(","): - && samtools view -@ \${GALAXY_SLOTS:-1} -bh inputs/temp_input.bam ${ref} | - samtools sort -O bam -T sorted -@ \${GALAXY_SLOTS:-1} -o "outputs/${input_bam.name}.${ref}.bam" - + #for $i,$ref in enumerate($ref_list): + #set $idx = "%04d" % $i + && samtools view -@ \${GALAXY_SLOTS:-1} -bh temp_input.bam ${ref} | + samtools sort -O bam -T sorted -@ \${GALAXY_SLOTS:-1} -o "outputs/${idx}-${name}.${ref}.bam" - #end for + && ls -l outputs | awk '/bam/{fname = substr(\$NF,6); printf("%s\t%d\n", fname, \$5)}' > "ls_split_files" ]]> </command> <inputs> <param name="input_bam" format="bam" label="Select BAM dataset to slice" type="data" /> - <param name="refs" type="select" optional="False" multiple="True" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Click and type in the box above to see options. You can select multiple entries. If "No options available" is displayed, you need to re-detect metadata on the input dataset. See help section below."> - <!-- The options tagset below extracts reference names from bam file metadata --> - <!-- This will not work with bed files with old style metadata. However this --> - <!-- Can be easily fixed by re-deceting metadata on a bam dataset by clicking --> - <!-- The pencil icon and settind datatype to "bam" --> - <!-- This change has been commited in the following pull request: --> - <!-- https://github.com/galaxyproject/galaxy/pull/107 --> + <param name="refs" type="select" optional="True" multiple="True" label="Select references (chromosomes and contigs) you would like to restrict bam to" help="Click and type in the box above to see options. You can select multiple entries. If "No options available" is displayed, you need to re-detect metadata on the input dataset. See help section below."> <options> <filter type="data_meta" ref="input_bam" key="reference_names" /> </options> </param> + <param name="show_listing" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="show listing"/> </inputs> <outputs> - <collection name="output_collection" type='list' label="${input_bam.name} by chrom"> - <discover_datasets pattern="(?P<designation>.+)\.bam" directory="outputs" ext='bam'/> + <data format="tabular" name="listing" from_work_dir="ls_split_files"> + <filter>show_listing</filter> + <actions> + <action name="column_names" type="metadata" default="name,size" /> + </actions> + </data> + <collection name="output_collection" type="list" label="${input_bam.name} by chrom"> + <discover_datasets pattern="\d+-(?P<designation>.*\.bam)" directory="outputs" format="bam" ext="bam" visible="false"/> </collection> </outputs> <tests> <test> - <param ftype="bam" name="input_bam" value="bam-slice-input.bam" /> - <param name="refs" value="chrM" /> - <output file="bam-slice-test2.bam" ftype="bam" name="output_bam" /> + <param ftype="bam" name="input_bam" value="test.bam" /> + <param name="show_listing" value="True" /> + <output file="listing"> + <assert_contents> + <has_text expression="chr1" /> + <has_text expression="chr2" /> + </assert_contents> + </output> + </test> + <test> + <param ftype="bam" name="input_bam" value="test.bam" /> + <param name="refs" value="chr1,chr3" /> + <param name="show_listing" value="True" /> + <output file="listing"> + <assert_contents> + <has_text expression="chr1" /> + <not_has_text expression="chr2" /> + </assert_contents> + </output> </test> </tests> <help> @@ -49,12 +75,17 @@ **What it does** -Creates a dataset collection of BAM files, one per selected chromosome. +Splits an input BAM dataset to a dataset collection of individual chromosome bam files. + +This dataset collection can be passed to a galaxy tool that takes a single bam input +in order to split the bam processing across multiple jobs. + +A suggested use case: + +hisat -> samtools_split_by_chrom => bcftools_mpileup => bcftools_call => bcftools_merge -> snpEff This tool is based on ``samtools view`` command. -@no-chrom-options@ - ]]> </help> <expand macro="citations"></expand>