# HG changeset patch # User grau # Date 1383855371 18000 # Node ID 85fd336b5b458d1b80a1a736ec8a4b2a861b97da # Parent dec223357d6b97442727afae30558ae9d449f706 Uploaded diff -r dec223357d6b -r 85fd336b5b45 ._DimontWeb.jar Binary file ._DimontWeb.jar has changed diff -r dec223357d6b -r 85fd336b5b45 DimontDataExtractor.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DimontDataExtractor.xml Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,70 @@ + +Extracts genomic regions specified in a BED-like file format in the annotated FastA format as required by Dimont +extract_data_single_galaxy.pl $genomefa $regions $chromcol $startcol $seccol $seccoord $width $statcol extracted.fa + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Dimont Data Extractor** prepares an annotated FastA file as required by Dimont from a genome (in FastA format) and a tabular file (e.g., BED, GTF, narrowPeak,...). The regions specified in the tabular file are used to determine the center of the extracted sequences. All extracted sequences have the same length as specified by parameter "Width". + +In case of ChIP data, the center position could for instance be the peak summit. +An annotated FastA file for ChIP-exo data comprising sequences of length 100 centered around the peak summit might look like:: + + > peak: 50; signal: 515 + ggccatgtgtatttttttaaatttccac... + > peak: 50; signal: 199 + GGTCCCCTGGGAGGATGGGGACGTGCTG... + ... + +where the center is given as 50 for the first two sequences, and the confidence amounts to 515 and 199, respectively. + +We also provide an example_ input file and a stand alone Perl script_ for preparing data in the format required by Dimont_. + + +If you experience problems using Dimont Data Extractor, please contact_ us. + +.. _example: http://www.jstacs.de/downloads/dimont-example.fa +.. _script: http://www.jstacs.de/index.php/Dimont#Data_preparation +.. _Dimont: http://jstacs.de/index.php/Dimont +.. _contact: mailto:grau@informatik.uni-halle.de + + + diff -r dec223357d6b -r 85fd336b5b45 DimontWeb.xml --- a/DimontWeb.xml Wed Nov 06 17:10:20 2013 -0500 +++ b/DimontWeb.xml Thu Nov 07 15:16:11 2013 -0500 @@ -40,6 +40,7 @@ JAR_PATH + java diff -r dec223357d6b -r 85fd336b5b45 extract_data_single_galaxy.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_data_single_galaxy.pl Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,128 @@ +use strict; + +if(@ARGV == 0){ +die < + + : the chromosome FastA containing all chromosome sequences + : the file containing the peaks in tabular format, + e.g., bed, gff, narrowPeak + : the column of containing the chromosome + : the column of containing the start position relative to + the chromosome start + : center: "Center of peak (relative to start)", end: "End of peak (global coordinates)" + : the column of containing the peak center position (center) relative to + or the column of containing the end position (end) + : fixed width of all regions + : the column of containing the peak statistic + or a similar measure of confidence + : the path to the output file, written as FastA +USAGE +} + + +my $chromFa = $ARGV[0]; +my $bed = $ARGV[1]; +my $chromcol = $ARGV[2]-1; +my $startcol = $ARGV[3]-1; +my $seccolm = $ARGV[4]; +my $seccol = $ARGV[5]-1; +my $width = $ARGV[6]; +my $statcol = $ARGV[7]-1; +my $outfile = $ARGV[8]; + +my $sort = 1; + + +sub loadSeq{ + my $prefix = shift; + print $prefix," "; + open(FA,$chromFa); + my $head = ""; + my @lines = (); + while(){ + chomp(); + if(/^>/){ + if($head){ + last; + } + if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){ + $head = $_; + } + }elsif($head){ + push(@lines,lc($_)); + } + } + my $str = join("",@lines); + print "loaded\n"; + return $str; +} + + + +open(IN,$ARGV[1]); + +my @lines = (); + +while(){ + chomp(); + my @parts = split("\t",$_); + $parts[$chromcol] =~ s/chr0/chr/g; + my @vals = (); + if($seccolm eq "center"){ + @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]); + }else{ + @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]); + } + push(@vals,$width); + push(@lines,\@vals); +} + +close(IN); +print "Read input file ".$bed."\n"; + + +if($sort){ + + @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines; + +} + +open(OUT,">".$outfile); + +print "Extracting sequences...\n\n"; + +my $oldchr = ""; +my $sequence = ""; +for my $line (@lines){ + my @ar = @{$line}; + my $chr = $ar[0]; + unless($chr eq $oldchr){ + $sequence = loadSeq($chr); + } + $oldchr = $chr; + my $w = $ar[3]; + if($w <= 0){ + print $w," -> next\n"; + next; + } + if($w % 2 == 0){ + $w = $w/2; + }else{ + $w = ($w-1)/2; + } + + my $start = $ar[1]-$w-1; + + my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n"; + my $curr = substr($sequence,$start,$ar[3]); + if($curr =~ /[^ACGTacgt]/){ + print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n"; + }else{ + print OUT $head,$curr,"\n"; + } +} + +close(OUT); +print "\nDone.\n"; \ No newline at end of file diff -r dec223357d6b -r 85fd336b5b45 test-data/.DS_Store Binary file test-data/.DS_Store has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/._.DS_Store Binary file test-data/._.DS_Store has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/._Test Binary file test-data/._Test has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/._dimont_test.fasta Binary file test-data/._dimont_test.fasta has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/Test/._Motif_(rc)1.png Binary file test-data/Test/._Motif_(rc)1.png has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/Test/._Motif_(rc)3.png Binary file test-data/Test/._Motif_(rc)3.png has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/Test/._Motif_0.png Binary file test-data/Test/._Motif_0.png has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/Test/._Motif_2.png Binary file test-data/Test/._Motif_2.png has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/Test/._Test_html.html Binary file test-data/Test/._Test_html.html has changed diff -r dec223357d6b -r 85fd336b5b45 test-data/mini.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mini.bed Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,3 @@ +chr1 20 250 id1 0 . 12.3 +chr1 374 450 id2 0 . 11.1 +chr2 53 273 id3 0 . 3.45 diff -r dec223357d6b -r 85fd336b5b45 test-data/mini2.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mini2.bed Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,3 @@ +chr1 90 70 id1 0 . 12.3 +chr1 374 74 id2 0 . 11.1 +chr2 53 120 id3 0 . 3.45 diff -r dec223357d6b -r 85fd336b5b45 test-data/mini2_extracted.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mini2_extracted.fa Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,6 @@ +> chr: chr1; start: 59; peak: 101; signal: 12.3 +catattatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatat +> chr: chr1; start: 347; peak: 101; signal: 11.1 +atttagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctctatataatgactgcctctcattctgtcttattttaccgcaaacccaaatcgacaatgcacgacagaggaagcagaacagatatttagattgcctctcattttctctcccatattatagggagaaatatgatcgcgtat +> chr: chr2; start: 72; peak: 101; signal: 3.45 +gccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatattgtgctaatgagtgcctctcgttctctgtcttatattaccg diff -r dec223357d6b -r 85fd336b5b45 test-data/mini_extracted.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mini_extracted.fa Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,6 @@ +> chr: chr1; start: 34; peak: 101; signal: 12.3 +ttagattgcctctcattttctctcccatattatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgc +> chr: chr1; start: 311; peak: 101; signal: 11.1 +agacaatacacgacagagagagagagcagcggagatatttagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctctctatataatgactgcctctcattctgtcttattttaccgcaaacccaaatcgacaatgcacgacagaggaagcagaacagatatttagattgcctctcatttt +> chr: chr2; start: 62; peak: 101; signal: 3.45 +cgagagtagtgccaacatattgtgctctttgattttttggcaacccaaaatggtggcggatgaacgagatgataatatattcaagttgccgctaatcagaaataaattcattgcaacgttaaatacagcacaatatatgatcgcgtatgcgagagtagtgccaacatattgtgctaatgagtgcctctcgttctctgtct diff -r dec223357d6b -r 85fd336b5b45 test-data/minigenome.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/minigenome.fa Thu Nov 07 15:16:11 2013 -0500 @@ -0,0 +1,44 @@ +> chr1 +Cgacaatgcacgacagaggaagcagaacagatatttagattgcctctcat +tttctctcccatattatagggagaaatatgatcgcgtatgcgagagtagt +gccaacatattgtgctctttgattttttggcaacccaaaatggtggcgga +tgaaCGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAATAAATTCA +TTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGAGAGTAGTG +CCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTATATTACCG +CAAACCCAAAAAgacaatacacgacagagagagagagcagcggagatatt +tagattgcctattaaatatgatcgcgtatgcgagagtagtgccaacatat +tgtgctctCTATATAATGACTGCCTCTCATTCTGTCTTATTTTACCGCAA +ACCCAAatcgacaatgcacgacagaggaagcagaacagatatttagattg +cctctcattttctctcccatattatagggagaaatatgatcgcgtatgcg +agagtagtgccaacatattgtgctctttgattttttggcaacccaaaatg +gtggcggatgaaCGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAA +TAAATTCATTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGA +GAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTA +TATTACCGCAAACCCAAAAAgacaatacacgacagagagagagagcagcg +gagatatttagattgcctattaaatatgatcgcgtatgcgagagtagtgc +caacatattgtgctctCTATATAATGACTGCCTCTCATTCTGTCTTATTT +TACCGCAAACCCAAatcgacaatgcacgacagaggaagcagaacagatat +ttagattgcctctcattttctctcccatattatagggagaaatatgatcg +cgtatgcgagagtagtgccaacatattgtgctctttgattttttggcaac +ccaaaatggtggcggatgaaCGAGATGATAATATATTCAAGTTGCCGCTA +ATCAGAAATAAATTCATTGCAACGTTAAATACAGCACAATATATGATCGC +GTATGCGAGAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCTCGTTCT +CTGTCTTATATTACCGCAAACCCAAAAAgacaatacacgacagagagaga +gagcagcggagatatttagattgcctattaaatatgatcgcgtatgcgag +> chr2 +agatatttagattgcctctcattttctctcccatattatagggagaaata +tgatcgcgtatgcgagagtagtgccaacatattgtgctctttgatttttt +ggcaacccaaaatggtggcggatgaaCGAGATGATAATATATTCAAGTTG +CCGCTAATCAGAAATAAATTCATTGCAACGTTAAATACAGCACAATATAT +GATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCT +CGTTCTCTGTCTTATATTACCGCAAACCCAAAAAgacaatacacgacaga +gagagagagcagcggagatatttagattgcctattaaatatgatcgcgta +tgcgagagtagtgccaacatattgtgctctCTATATAATGACTGCCTCTC +ATTCTGTCTTATTTTACCGCAAACCCAAatcgacaatgcacgacagagga +agcagaacagatatttagattgcctctcattttctctcccatattatagg +gagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctcttt +gattttttggcaacccaaaatggtggcggatgaaCGAGATGATAATATAT +TCAAGTTGCCGCTAATCAGAAATAAATTCATTGCAACGTTAAATACAGCA +CAATATATGATCGCGTATGCGAGAGTAGTGCCAACATATTGTGCTAATGA +GTGCCTCTCGTTCTCTGTCTTATATTACCGCAAACCCAAAAAgacaatac +acgacagagagagagagcagcggagatatttagattgcctattaaatatg \ No newline at end of file