changeset 9:85fd336b5b45 draft

Uploaded
author grau
date Thu, 07 Nov 2013 15:16:11 -0500
parents dec223357d6b
children 247f7edbe7f8
files ._DimontWeb.jar DimontDataExtractor.xml DimontWeb.xml extract_data_single_galaxy.pl test-data/.DS_Store test-data/._.DS_Store test-data/._Test test-data/._dimont_test.fasta test-data/Test/._Motif_(rc)1.png test-data/Test/._Motif_(rc)3.png test-data/Test/._Motif_0.png test-data/Test/._Motif_2.png test-data/Test/._Test_html.html test-data/mini.bed test-data/mini2.bed test-data/mini2_extracted.fa test-data/mini_extracted.fa test-data/minigenome.fa
diffstat 18 files changed, 261 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file ._DimontWeb.jar has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DimontDataExtractor.xml	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,70 @@
+<tool id="DimontDataExtractor" name="Dimont Data Extractor" version="0.1" force_history_refresh="true">
+<description>Extracts genomic regions specified in a BED-like file format in the annotated FastA format as required by Dimont</description>
+<command interpreter="perl">extract_data_single_galaxy.pl $genomefa $regions $chromcol $startcol $seccol $seccoord $width $statcol extracted.fa</command>
+<inputs>
+	<param type="data" format="bed,gtf,txt,tabular" name="regions" label="Genomic regions" value="" optional="false" help="The genomic regions to be extracted in a BED-like file format, e.g., BED, GTF, narrowPeak." />
+	<param type="data" format="fasta" name="genomefa" label="Genome sequence" value="" optional="false" help="The input genome to which the genomic regions refer." />
+	<param type="integer" name="chromcol" label="Chromosome column" value="1" optional="false" help="The column of the regions file, which contains the chromosome information." />
+	<param type="integer" name="startcol" label="Start column" value="2" optional="false" help="The column of the regions file containing the start position of the genomic region." />
+	<param type="select" name="seccol" label="Meaning of second coordinate" optional="false" help="The meaning of the second genomic coordinate. This may either be the position of the peak summit relative to the position in Start, or the end position of the peak.">
+		<option value="end">End of peak (in global coordinates)</option>
+		<option value="center">Center of peak (relative to start)</option>
+	</param>
+	<param type="integer" name="seccoord" label="Second coordinate column" value="3" optional="false" help="The column of the regions file containing the second genomic coordinate, with meaning specified by parameter &quot;Meaning of second coordinate&quot;." />
+	<param type="integer" name="statcol" label="Statistics column" value="7" optional="false" help="The column containing the peak statistics information (or another measure of peak confidence)." />
+	<param type="integer" name="width" label="Width" value="1000" optional="false" help="The width of the genomic region to be extracted. Recommended values: 1000 for ChIP-seq and 100 for ChIP-exo." />
+</inputs>
+<outputs>
+	<!-- The command writes "extracted.fa" into the job working directory; from_work_dir collects that file as this output. -->
+	<data format="fasta" name="extracted.fa" from_work_dir="extracted.fa" />
+</outputs>
+<tests>
+	<test>
+		<param name="regions" value="mini.bed" />
+		<param name="genomefa" value="minigenome.fa" />
+		<param name="chromcol" value="1" />
+		<param name="startcol" value="2" />
+		<param name="seccol" value="end" />
+		<param name="seccoord" value="3" />
+		<param name="statcol" value="7" />
+		<param name="width" value="200" />
+		<output name="extracted.fa" file="mini_extracted.fa" /> 
+	</test>
+	<test>
+		<param name="regions" value="mini2.bed" />
+		<param name="genomefa" value="minigenome.fa" />
+		<param name="chromcol" value="1" />
+		<param name="startcol" value="2" />
+		<param name="seccol" value="center" />
+		<param name="seccoord" value="3" />
+		<param name="statcol" value="7" />
+		<param name="width" value="200" />
+		<output name="extracted.fa" file="mini2_extracted.fa" /> 
+	</test>
+</tests>
+<help>
+**Dimont Data Extractor** prepares an annotated FastA file as required by Dimont from a genome (in FastA format) and a tabular file (e.g., BED, GTF, narrowPeak). The regions specified in the tabular file are used to determine the center of the extracted sequences. All extracted sequences have the same length as specified by parameter "Width".
+
+In case of ChIP data, the center position could for instance be the peak summit.
+An annotated FastA file for ChIP-exo data comprising sequences of length 100 centered around the peak summit might look like::
+	
+	> peak: 50; signal: 515
+	ggccatgtgtatttttttaaatttccac...
+	> peak: 50; signal: 199
+	GGTCCCCTGGGAGGATGGGGACGTGCTG...
+	...
+
+where the center is given as 50 for the first two sequences, and the confidence amounts to 515 and 199, respectively.
+
+We also provide an example_ input file and a stand-alone Perl script_ for preparing data in the format required by Dimont_.
+
+
+If you experience problems using Dimont Data Extractor, please contact_ us.
+
+.. _example: http://www.jstacs.de/downloads/dimont-example.fa
+.. _script: http://www.jstacs.de/index.php/Dimont#Data_preparation
+.. _Dimont: http://jstacs.de/index.php/Dimont
+.. _contact: mailto:grau@informatik.uni-halle.de
+</help>
+</tool>
+
--- a/DimontWeb.xml	Wed Nov 06 17:10:20 2013 -0500
+++ b/DimontWeb.xml	Thu Nov 07 15:16:11 2013 -0500
@@ -40,6 +40,7 @@
 </inputs>
 <requirements>
 	<requirement type="set_environment">JAR_PATH</requirement>
+	<requirement type="binary" version=">=1.6">java</requirement>
 </requirements>
 <configfiles>
 <configfile name="script_file">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_data_single_galaxy.pl	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,128 @@
+use strict;
+
+if(@ARGV < 9){ # all nine positional arguments are required; otherwise print the usage and stop
+die <<USAGE
+usage:
+perl extract_data_single_galaxy.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
+    
+    <chromFa>: the chromosome FastA containing all chromosome sequences
+    <bedfile>:  the file containing the peaks in tabular format, 
+                e.g., bed, gff, narrowPeak
+    <chromcol>: the column of <bedfile> containing the chromosome
+    <startcol>: the column of <bedfile> containing the start position relative to
+                the chromosome start
+    <seccolm>:   center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
+    <secondcol>:  the column of <bedfile> containing the peak center position (center) relative to
+                <startcol> or the column of <bedfile> containing the end position (end)
+    <width>:    fixed width of all regions
+    <statcol>:  the column of <bedfile> containing the peak statistic
+                or a similar measure of confidence
+    <outfile>:  the path to the output file, written as FastA
+USAGE
+}
+
+
+my $chromFa = $ARGV[0]; # genome FastA file holding the chromosome sequences
+my $bed = $ARGV[1]; # tabular regions file
+my $chromcol = $ARGV[2]-1; # column arguments are 1-based; subtract 1 to get array indexes
+my $startcol = $ARGV[3]-1; # index of the start-position column
+my $seccolm = $ARGV[4]; # "center" or "end": meaning of the second coordinate
+my $seccol = $ARGV[5]-1; # index of the second-coordinate column
+my $width = $ARGV[6]; # fixed width of every extracted region
+my $statcol = $ARGV[7]-1; # index of the peak-statistic column
+my $outfile = $ARGV[8]; # path of the FastA file to write
+
+my $sort = 1; # when true, regions are sorted by chromosome name before extraction
+
+
+# Returns the lower-cased sequence of the first record in $chromFa whose header names
+# $prefix or "chr".$prefix (case-insensitive); returns "" if no such record exists.
+sub loadSeq{
+	my $prefix = shift;
+	print $prefix," ";
+	# Three-argument open with a check: an unreadable genome must not silently yield empty sequences.
+	open(FA,"<",$chromFa) or die "Could not open genome file ".$chromFa.": ".$!."\n";
+	my $head = "";
+	my @lines = ();
+	while(<FA>){
+		chomp();
+		if(/^>/){
+			last if $head; # the next record begins, so the requested one is complete
+			# \Q..\E: match the chromosome name literally, not as a regular expression.
+			$head = $_ if(/^>\s*(\Q${prefix}\E|chr\Q${prefix}\E)(\s.*$|$)/i);
+		}elsif($head){
+			push(@lines,lc($_));
+		}
+	}
+	close(FA);
+	print "loaded\n";
+	return join("",@lines);
+}
+
+
+
+# Read all regions; each entry of @lines is [chromosome, center position, statistic, width].
+open(IN,"<",$bed) or die "Could not open regions file ".$bed.": ".$!."\n";
+my @lines = ();
+while(<IN>){
+	chomp();
+	next if(/^\s*$/); # blank lines carry no region
+	my @parts = split("\t",$_);
+	$parts[$chromcol] =~ s/chr0/chr/g;
+	my @vals = ();
+	# center = start + summit offset ("center"), or the midpoint of start and end otherwise
+	if($seccolm eq "center"){
+		@vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
+	}else{
+		@vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
+	}
+	push(@vals,$width);
+	push(@lines,\@vals);
+}
+close(IN);
+print "Read input file ".$bed."\n";
+
+
+# Order the regions by chromosome name (first field of each entry).
+if($sort){
+	@lines = sort { $a->[0] cmp $b->[0] } @lines;
+}
+
+
+# Write one annotated FastA record per region; a chromosome is reloaded only when its name changes.
+open(OUT,">",$outfile) or die "Could not open output file ".$outfile.": ".$!."\n";
+print "Extracting sequences...\n\n";
+my $oldchr = "";
+my $sequence = "";
+for my $line (@lines){
+	my @ar = @{$line};
+	my $chr = $ar[0];
+	unless($chr eq $oldchr){
+		$sequence = loadSeq($chr);
+	}
+	$oldchr = $chr;
+	my $w = $ar[3];
+	if($w <= 0){
+		print $w," -> next\n";
+		next;
+	}
+	# half of the width, rounded down for odd widths
+	$w = int($w/2);
+	my $start = $ar[1]-$w-1;
+	my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
+	# A negative offset would make substr() count from the chromosome end, and a window past the
+	# end would be truncated; both break the fixed-width output, so such regions are skipped.
+	if($start < 0 || $start+$ar[3] > length($sequence)){
+		print "Sequence for\n\t",substr($head,1),"omitted because the region is not fully contained in the loaded chromosome sequence.\n\n";
+		next;
+	}
+	my $curr = substr($sequence,$start,$ar[3]);
+	if($curr =~ /[^ACGTacgt]/){
+		print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
+	}else{
+		print OUT $head,$curr,"\n";
+	}
+}
+
+close(OUT);
+print "\nDone.\n";
\ No newline at end of file
Binary file test-data/.DS_Store has changed
Binary file test-data/._.DS_Store has changed
Binary file test-data/._Test has changed
Binary file test-data/._dimont_test.fasta has changed
Binary file test-data/Test/._Motif_(rc)1.png has changed
Binary file test-data/Test/._Motif_(rc)3.png has changed
Binary file test-data/Test/._Motif_0.png has changed
Binary file test-data/Test/._Motif_2.png has changed
Binary file test-data/Test/._Test_html.html has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mini.bed	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,3 @@
+chr1	20	250	id1	0	.	12.3
+chr1	374	450	id2	0	.	11.1
+chr2	53	273	id3	0	.	3.45
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mini2.bed	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,3 @@
+chr1	90	70	id1	0	.	12.3
+chr1	374	74	id2	0	.	11.1
+chr2	53	120	id3	0	.	3.45
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mini2_extracted.fa	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,6 @@
+> chr: chr1; start: 59; peak: 101; signal: 12.3
+catattatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatat
+> chr: chr1; start: 347; peak: 101; signal: 11.1
+atttagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctctatataatgactgcctctcattctgtcttattttaccgcaaacccaaatcgacaatgcacgacagaggaagcagaacagatatttagattgcctctcattttctctcccatattatagggagaaatatgatcgcgtat
+> chr: chr2; start: 72; peak: 101; signal: 3.45
+gccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatattgtgctaatgagtgcctctcgttctctgtcttatattaccg
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mini_extracted.fa	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,6 @@
+> chr: chr1; start: 34; peak: 101; signal: 12.3
+ttagattgcctctcattttctctcccatattatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgc
+> chr: chr1; start: 311; peak: 101; signal: 11.1
+agacaatacacgacagagagagagagcagcggagatatttagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctctatataatgactgcctctcattctgtcttattttaccgcaaacccaaatcgacaatgcacgacagaggaagcagaacagatatttagattgcctctcatttt
+> chr: chr2; start: 62; peak: 101; signal: 3.45
+cgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatattgtgctaatgagtgcctctcgttctctgtct
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/minigenome.fa	Thu Nov 07 15:16:11 2013 -0500
@@ -0,0 +1,44 @@
+> chr1
+Cgacaatgcacgacagaggaagcagaacagatatttagattgcctctcat
+tttctctcccatattatagggagaaatatgatcgcgtatgcgagagtagt
+gccaacatattgtgctctttgattttttggcaacccaaaatggtggcgga
+tgaaCGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAATAAATTCA
+TTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGAGAGTAGTG
+CCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTATATTACCG
+CAAACCCAAAAAgacaatacacgacagagagagagagcagcggagatatt
+tagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatat
+tgtgctctCTATATAATGACTGCCTCTCATTCTGTCTTATTTTACCGCAA
+ACCCAAatcgacaatgcacgacagaggaagcagaacagatatttagattg
+cctctcattttctctcccatattatagggagaaatatgatcgcgtatgcg
+agagtagtgccaacatattgtgctctttgattttttggcaacccaaaatg
+gtggcggatgaaCGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAA
+TAAATTCATTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGA
+GAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTA
+TATTACCGCAAACCCAAAAAgacaatacacgacagagagagagagcagcg
+gagatatttagattgcctattaaatatgatcgcgtatgcgagagtagtgc
+caacatattgtgctctCTATATAATGACTGCCTCTCATTCTGTCTTATTT
+TACCGCAAACCCAAatcgacaatgcacgacagaggaagcagaacagatat
+ttagattgcctctcattttctctcccatattatagggagaaatatgatcg
+cgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaac
+ccaaaatggtggcggatgaaCGAGATGATAATATATTCAAGTTGCCGCTA
+ATCAGAAATAAATTCATTGCAACGTTAAATACAGCACAATATATGATCGC
+GTATGCGAGAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCTCGTTCT
+CTGTCTTATATTACCGCAAACCCAAAAAgacaatacacgacagagagaga
+gagcagcggagatatttagattgcctattaaatatgatcgcgtatgcgag
+> chr2
+agatatttagattgcctctcattttctctcccatattatagggagaaata
+tgatcgcgtatgcgagagtagtgccaacatattgtgctctttgatttttt
+ggcaacccaaaatggtggcggatgaaCGAGATGATAATATATTCAAGTTG
+CCGCTAATCAGAAATAAATTCATTGCAACGTTAAATACAGCACAATATAT
+GATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCT
+CGTTCTCTGTCTTATATTACCGCAAACCCAAAAAgacaatacacgacaga
+gagagagagcagcggagatatttagattgcctattaaatatgatcgcgta
+tgcgagagtagtgccaacatattgtgctctCTATATAATGACTGCCTCTC
+ATTCTGTCTTATTTTACCGCAAACCCAAatcgacaatgcacgacagagga
+agcagaacagatatttagattgcctctcattttctctcccatattatagg
+gagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctcttt
+gattttttggcaacccaaaatggtggcggatgaaCGAGATGATAATATAT
+TCAAGTTGCCGCTAATCAGAAATAAATTCATTGCAACGTTAAATACAGCA
+CAATATATGATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTAATGA
+GTGCCTCTCGTTCTCTGTCTTATATTACCGCAAACCCAAAAAgacaatac
+acgacagagagagagagcagcggagatatttagattgcctattaaatatg
\ No newline at end of file