Repository 'covacs_intersect_snps'
hg clone https://testtoolshed.g2.bx.psu.edu/repos/elixir-it/covacs_intersect_snps

Changeset 0:3edc7bb490d3 (2018-11-08)
Commit message:
Uploaded
added:
14_june_18_intersect.snps.with.infos.pl
intersect_snp.xml
b
diff -r 000000000000 -r 3edc7bb490d3 14_june_18_intersect.snps.with.infos.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/14_june_18_intersect.snps.with.infos.pl Thu Nov 08 12:56:30 2018 -0500
[
@@ -0,0 +1,172 @@
+#!/usr/bin/perl -w
+#
+
+use strict;
+
+
+my $f1=shift; #varscan
+my $f2=shift; #freebayes
+my $f3=shift; #GATK
+my $fout=shift;
+my %ALL=();
+my %L=();
+my %H=();
+
+die("non trovo il file $f1") unless -e $f1;
+die("non trovo il file $f2") unless -e $f2;
+die("non trovo il file $f2") unless -e $f3;
+
+die ("non specificato il file di output") unless $fout;
+
+
+open(OUT,">$fout.common.snps.vcf");
+open(UN,">$fout.unique.snps.vcf");
+
+open(IN,$f1);
+while(<IN>)
+{
+ next if $_=~/^\#/;
+ my ($chr,$st,$b1,$b2)=(split())[0,1,3,4];
+ my @all_vl=(split(/\t/));
+ next unless (length($b1)==length($b2));
+ next if ($b1=~/\./ || $b1=~/\-/  || $b2=~/\./ || $b2=~/\-/);
+ my @b1=(split('',$b1));
+ my @b2=(split('',$b2));
+ for (my $i=0;$i<=$#b1;$i++)
+ {
+ my $lb1=$b1[$i];
+                my $lb2=$b2[$i];
+ next if $lb1 eq $lb2;
+ next if ($lb1=~/\./ || $lb1=~/\-/  || $lb2=~/\./ || $lb2=~/\-/ || $lb2=~/\,/);
+                my $lst=$st+$i;
+ $all_vl[1]=$lst;
+                $all_vl[3]=$lb1;
+                $all_vl[4]=$lb2;
+                my $J=join("\t",@all_vl);
+                $H{$chr}{$lst}{$lb1}{$lb2}++;
+                $ALL{$chr}{$lst}{$lb1}{$lb2}=$J;
+                push(@{$L{$chr}{$lst}{$lb1}{$lb2}},"VS");
+ }
+}
+close(IN);
+
+open(IN,$f2);
+while(<IN>)
+{
+ next if $_=~/^\#/;
+        my ($chr,$st,$b1,$b2)=(split())[0,1,3,4];
+ my @all_vl=(split(/\t/));
+ next unless (length($b1)==length($b2));
+ next if ($b1=~/\./ || $b1=~/\-/  || $b2=~/\./ || $b2=~/\-/);
+ my @b1=(split('',$b1));
+        my @b2=(split('',$b2));
+
+        for (my $i=0;$i<=$#b1;$i++)
+        {
+                my $lb1=$b1[$i];
+                my $lb2=$b2[$i];
+ next if $lb1 eq $lb2;
+ next if ($lb1=~/\./ || $lb1=~/\-/  || $lb2=~/\./ || $lb2=~/\-/ || $lb2=~/\,/);
+ my $lst=$st+$i;
+ $all_vl[1]=$lst;
+                $all_vl[3]=$lb1;
+                $all_vl[4]=$lb2;
+                my $J=join("\t",@all_vl);
+                $H{$chr}{$lst}{$lb1}{$lb2}++;
+                $ALL{$chr}{$lst}{$lb1}{$lb2}=$J;
+                push(@{$L{$chr}{$lst}{$lb1}{$lb2}},"FB");
+        }
+
+}
+close(IN);
+
+open(IN,$f3);
+while(<IN>)
+{
+        if ($_=~/^\#/)
+ {
+ print OUT if $_=~/fileformat/;
+                print OUT if $_=~/contig/;
+                print OUT if $_=~/CHROM/;
+                print UN if  $_=~/fileformat/;
+                print UN if  $_=~/contig/;
+                print UN if  $_=~/CHROM/;
+ next;
+ }
+ my ($chr,$st,$b1,$b2)=(split())[0,1,3,4];
+ my @all_vl=(split(/\t/));
+ next unless (length($b1)==length($b2));
+ next if ($b1=~/\./ || $b1=~/\-/  || $b2=~/\./ || $b2=~/\-/);
+ my @b1=(split('',$b1));
+        my @b2=(split('',$b2));
+
+        for (my $i=0;$i<=$#b1;$i++)
+        {
+ my $lb1=$b1[$i];
+                my $lb2=$b2[$i];
+ next if $lb1 eq $lb2;
+         next if ($lb1=~/\./ || $lb1=~/\-/  || $lb2=~/\./ || $lb2=~/\-/ || $lb2=~/\,/);
+                my $lst=$st+$i;
+ $all_vl[1]=$lst;
+ $all_vl[3]=$lb1;
+ $all_vl[4]=$lb2;
+ my $J=join("\t",@all_vl);
+                $H{$chr}{$lst}{$lb1}{$lb2}++;
+                $ALL{$chr}{$lst}{$lb1}{$lb2}=$J;
+                push(@{$L{$chr}{$lst}{$lb1}{$lb2}},"GA");
+        }
+}
+close(IN);
+
+foreach my $chr (sort keys %H)
+{
+ foreach my $s (sort{$a<=>$b} keys %{$H{$chr}})
+ {
+ foreach my $b1 (keys %{$H{$chr}{$s}})
+ {
+ foreach my $b2 (keys %{$H{$chr}{$s}{$b1}})
+ {
+
+ my $vl=$H{$chr}{$s}{$b1}{$b2};
+ unless ($H{$chr}{$s}{$b1}{$b2})
+ {
+ warn("non trovo il numero di occorrenze di $chr $s $b1 $b2\n");
+ next;
+ }
+ my $out_vl=$ALL{$chr}{$s}{$b1}{$b2};
+
+ unless ($ALL{$chr}{$s}{$b1}{$b2})
+                                {
+                                        warn("non trovo le info di $chr $s $b1 $b2\n");
+                                        next;
+                                }
+ my @vls=(split(/\s+/,$out_vl));
+ my $comm=$vls[7];
+
+ if ($vl>=2)
+ {
+ my $mm=join("_",@{$L{$chr}{$s}{$b1}{$b2}});
+                                 my $comm2=$comm;
+                                 $comm2.=";NM=$vl;LM=$mm";
+                                 $vls[7]=$comm2;
+                                 my $out_vl=join("\t",@vls);
+                                 print OUT "$out_vl\n";
+
+ }else{
+                                        my $mm=$L{$chr}{$s}{$b1}{$b2}[0];
+                                 my $comm2=$comm;
+                                 $comm2.=";NM=$vl;LM=$mm";
+                                 $vls[7]=$comm2;
+                                 my $out_vl=join("\t",@vls);
+                                 print UN "$out_vl\n";
+
+ }
+ }
+ }
+ }
+}
+
+#foreach $V (keys %V)
+#{
+# print "$V $V{$V}\n";
+#}
b
diff -r 000000000000 -r 3edc7bb490d3 intersect_snp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/intersect_snp.xml Thu Nov 08 12:56:30 2018 -0500
b
@@ -0,0 +1,41 @@
+<tool id="intersect_snp" name="intersect_snp" version="0">
+  <description>tool for snp VCF file intersection </description>
+  <requirements>
+  </requirements>
+  <command>
+ perl  $__tool_directory__/14_june_18_intersect.snps.with.infos.pl
+  $input1 $input2 $input3 intersect 2>log
+  </command>
+  <inputs>
+    <param name="input1" format="vcf" type="data" label="VCF File" help="Varscan2 snp output" />
+    <param name="input2" format="vcf" type="data" label="VCF File" help="Frebayes snp output" />
+    <param name="input3" format="vcf" type="data" label="VCF File" help="GATK HaplotypeCaller snp output" />
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output1" from_work_dir="intersect.common.snps.vcf" label="${tool.name} on ${on_string} common snps" />
+    <data format="vcf" name="output2" from_work_dir="intersect.unique.snps.vcf" label="${tool.name} on ${on_string} unique snps" />
+    <data format="txt" name="output3" from_work_dir="log" label="${tool.name} on ${on_string} log" />
+  </outputs>
+  <stdio>
+  </stdio>
+  <help>
+**WHAT IT DOES**
+
+Predictions of SNPs from Varscan2, GATK and Freebayes are consolidated into a single call-set following a simple majority consensus rule. Variants  identified by at least two methods are incorporated into a final “high confidence” call set.Singleton variants predicted by only one method are considered less reliable and are included in a low quality call-set.
+
+**The final output consists in two VCF files:**
+
+-SNPs common to at least 2 methods
+
+-Singleton SNPs obtained from a single method
+
+  </help>
+  <citations>
+       <citation type="doi">10.1186/s12864-018-4508-1</citation>
+  </citations>
+  <tests>
+    <test>
+    </test>
+  </tests>
+
+</tool>