diff A.sh @ 0:1955f03f092e

initial commit
author biomonika <biomonika@psu.edu>
date Tue, 02 Sep 2014 14:16:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/A.sh	Tue Sep 02 14:16:04 2014 -0400
@@ -0,0 +1,242 @@
+#!/bin/bash
+set -e;
+email_address_of_user=$1;
+echo "email: " $email_address_of_user;
+
+newline () {
+	echo -e "\n---";
+}
+#variables
+mother_contigs=`cat references/reference_contig_names | wc -l`;
+echo -e "A variant"; newline; dir=A/; mkdir -p $dir;
+
+#DAUGHTER PREPARE FILES
+
+	grep -E '#|AF1=0.' vcf/aln_0.85_daughter.vcf | cut -f1 | uniq >${dir}daughter_on_mother_one_allele_heterozygous_ids.vcf
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_on_mother_one_allele_heterozygous_ids.vcf"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if exists $a{$res}' vcf/aln_0.85_daughter.vcf | sort | uniq | sed '/#/d' >${dir}daughter_on_mother_heterozygous.vcf
+
+	#be aware that if there are more snps, one with allele frequency '0.' is enough for the whole contig to be in contig_pos_ref_heterozygous
+	sed '/#/d' ${dir}daughter_on_mother_heterozygous.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter_heterozygous
+	cut -f 1 ${dir}contig_pos_ref_alt_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughter_heterozygous
+
+	sed '/#/d' vcf/aln_0.85_daughter.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter
+
+#FATHER PREPARE FILES
+
+	sed '/#/d' vcf/aln_0.85_father.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_father
+
+#SON PREPARE FILES
+
+	sed '/#/d' vcf/aln_0.85_son.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_son
+
+
+echo -e "Step 0 of 3.\nMother must be homozygote.";
+	sed '/#/d' vcf/aln_0.85_mother.vcf | cut -f1 | sort | uniq >${dir}contigs_where_mother_differs_from_mother;
+	differing=`cat ${dir}contigs_where_mother_differs_from_mother | wc -l`;
+
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_mother_differs_from_mother"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_mother_same_as_mother_unchecked
+
+	#check whether mother really does not contain any father variants
+	wrong_hits=0;
+	cat ${dir}contigs_where_mother_same_as_mother_unchecked | (while read line; do 
+		contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_father bam/aln_cleaned_sorted_deduplicated_mother.bam A`
+		if [[ $contig_check -gt 0 ]]; then
+			echo "contig: " ${line} "contig_check:" $contig_check >>kontrola;
+			echo "DID NOT PASS "${line}; 
+			wrong_hits=`expr $wrong_hits + 1`
+		else
+			#echo "0K"; 
+			echo ${line} >>${dir}contigs_where_mother_same_as_mother
+		fi
+	done;
+	echo "There is this number of wrong variants: "$wrong_hits;
+	)
+
+	same=`cat ${dir}contigs_where_mother_same_as_mother | wc -l`;
+	echo -e "Number of contigs, where mother differs: $differing.";
+	echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
+
+	newline;
+
+echo -e "Step 1 of 3.\nSons must be same as mother.";
+	#First we have to get rid of the intersection with father, possible Y variants do not matter in the calculation
+	comm -12 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_father | sort | uniq >${dir}possible_Yvariants
+	cut -f 1 ${dir}possible_Yvariants | sort | uniq >${dir}contigs_with_possible_Yvariants
+	
+	#if these possible_Yvariants are in the daughter, they are not Y variants
+	comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}possible_Yvariants | sort | uniq >${dir}NOT_Yvariants
+	cut -f 1 ${dir}NOT_Yvariants  | sort | uniq >${dir}contigs_with_NOT_Yvariants
+
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_with_NOT_Yvariants"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_with_possible_Yvariants | sort | uniq >${dir}contigs_with_Yvariants
+
+	#contig_pos_ref_alt_Yvariants_confirmed; we deplet just those contigs that contain Y variants and then decide if sons are same as mother
+
+cat ${dir}contigs_with_Yvariants | (while read line; do 
+	grep ${line} ${dir}contig_pos_ref_alt_father | sort >>${dir}contig_pos_ref_alt_Yvariants_confirmed_pro
+done;
+)
+	cat ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro | sort >${dir}contig_pos_ref_alt_Yvariants_confirmed;
+	
+	comm -23 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_Yvariants_confirmed | cut -f 1 | sort | uniq >${dir}contigs_where_sons_variation_not_caused_by_father
+	#rm ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro;
+
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_variation_not_caused_by_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_sons_same_as_mother
+
+	same=`cat ${dir}contigs_where_sons_same_as_mother | wc -l`;
+	echo -e "Number of contigs, where sons differ: $differing.";
+	echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
+
+
+	newline;
+echo -e "Step 2 of 3.\nFather must differ from mother.";
+	sed '/#/d' vcf/aln_0.85_father.vcf | cut -f1 | sort | uniq >${dir}/contigs_where_father_differs_from_mother;
+	differing=`cat ${dir}contigs_where_father_differs_from_mother | wc -l`;
+	echo -e "NUMBER of contigs AFTER just this filter: $differing/$mother_contigs";
+
+	newline;
+
+echo -e "Step 3 of 3.\nDaughter must have mother and father alleles.";
+	#we need daughter to have alleles from both parents (AF stands for 'allele frequency')
+
+	#unique lines in first file mean that those are not subsets of father variants - otherwise they would be in common
+	comm -23 ${dir}contig_pos_ref_alt_daughter_heterozygous ${dir}contig_pos_ref_alt_father | cut -f 1 | sort | uniq >${dir}daughter_NOT_subset_of_father
+
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_NOT_subset_of_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}/contigs_where_daughter_from_parents
+	
+
+same=`cat ${dir}contigs_where_daughter_from_parents | wc -l`;
+	echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
+	
+	newline;
+
+echo -e "Step 4 of 3.\nDaughters and sons can not have any intersection.";
+	comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}contig_pos_ref_alt_son >${dir}sons_and_daughters_similiar
+	cut -f 1 ${dir}sons_and_daughters_similiar | sort | uniq >${dir}contigs_where_sons_and_daughters_similiar
+
+	perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_and_daughters_similiar"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughters_and_sons_different_pro
+
+	#check whether they are really different (small amount of variants is also wrong)
+
+	echo -e "daughter contains son variants?"
+	#check whether daughter really does not contain any of son variants
+	wrong_hits=0;
+	cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do 
+		contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_daughter bam/aln_cleaned_sorted_deduplicated_son.bam A`
+		if [[ $contig_check -gt 0 ]]; then
+			echo "DID NOT PASS "${line}; 
+			echo "daughter contains son variants DID NOT PASS "${line} >>kontrola
+			wrong_hits=`expr $wrong_hits + 1`
+		else
+			#echo "0K"; 
+			echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un1
+		fi
+	done;
+	echo "There is this number of wrong variants: "$wrong_hits;
+	)
+
+	echo -e "son contains daughter variants?"
+	#check whether son really does not contain any of daughter variants
+	wrong_hits=0;
+	cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do 
+		contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_son bam/aln_cleaned_sorted_deduplicated_daughter.bam A`
+		if [[ $contig_check -gt 0 ]]; then
+			echo "DID NOT PASS "${line}; 
+			echo "son contains daughter variants DID NOT PASS "${line} >>kontrola
+			wrong_hits=`expr $wrong_hits + 1`
+		else
+			#echo "0K"; 
+			echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un2
+		fi
+	done;
+	echo "There is this number of wrong variants: "$wrong_hits;
+	)
+
+	comm -12 ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 | sort >${dir}contigs_where_daughters_and_sons_different_un
+
+	cat ${dir}contigs_where_daughters_and_sons_different_un | sort | uniq >${dir}contigs_where_daughters_and_sons_different;
+	
+#rm ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 ${dir}contigs_where_daughters_and_sons_different_un;
+
+	same=`cat ${dir}contigs_where_daughters_and_sons_different | wc -l`;
+	echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
+
+echo -e "Calculating intersection of candidate contigs from steps 0-3.";
+
+	comm -12 ${dir}contigs_where_mother_same_as_mother ${dir}contigs_where_sons_same_as_mother | sort >${dir}first_couple;
+        comm -12 ${dir}contigs_where_father_differs_from_mother ${dir}contigs_where_daughter_from_parents | sort >${dir}second_couple;
+	comm -12 ${dir}first_couple ${dir}second_couple | sort >${dir}set1
+	comm -12 ${dir}contigs_where_daughters_and_sons_different ${dir}set1 | sort >${dir}set2; 
+
+echo -e "Step -1 of 3.\nCoverage must be sufficient.";
+	touch ${dir}contigs_with_sufficient_coverage;
+
+cat ${dir}set2 | (while read line; do 
+
+	coverage_check1=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_mother.bam 2>/dev/null &`;
+	coverage_check2=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_father.bam 2>/dev/null &`;
+	coverage_check3=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_son.bam 2>/dev/null &`;
+	coverage_check4=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_daughter.bam 2>/dev/null &`;
+
+	coverage_check=$(($coverage_check1 + $coverage_check2 + $coverage_check3 + $coverage_check4))
+		if [[ $coverage_check -gt 0 ]]; then
+			echo "LOW COVERAGE "${line}; 
+		else
+			#echo "0K"; 
+			echo ${line} >>${dir}contigs_with_sufficient_coverage
+		fi
+done;
+)
+
+	
+	cat ${dir}contigs_with_sufficient_coverage | sort >X_results.txt; 
+	#rm ${dir}first_couple; #rm ${dir}second_couple; #rm ${dir}set1; #rm ${dir}set2;
+	
+	#have we found any genes?
+
+	final_contigs=`cat X_results.txt | wc -l`;
+	echo -e "$final_contigs contigs found fulfilling criteria.";
+
+
+	if [ $final_contigs -gt 0 ]; then
+		#we found SOME genes
+
+		#prepare files for sorting into the categories
+		samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_mother.bam >${dir}high_score_mother.bam &
+		samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_father.bam >${dir}high_score_father.bam &
+		samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_daughter.bam >${dir}high_score_daughter.bam &
+		samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_son.bam >${dir}high_score_son.bam &
+		wait;
+
+		samtools index ${dir}high_score_mother.bam &
+		samtools index ${dir}high_score_father.bam &
+		samtools index ${dir}high_score_daughter.bam &
+		samtools index ${dir}high_score_son.bam &
+		wait;
+
+		#get sequences based on ids
+		perl scripts/get_sequences_based_on_ids.pl references/reference.fasta X_results.txt >X_results_sequences.txt;
+
+		get_results_in_bam_file () 
+	{
+	  for ARG in "$@";
+	    do
+	    	./scripts/createBamFromResults.sh X_results.txt bam/aln_cleaned_sorted_deduplicated_${ARG}.bam bam/${ARG}.bam;
+	    	samtools index bam/${ARG}.bam;
+	    done;
+	}
+
+	get_results_in_bam_file mother father daughter son;
+	wait;
+	#send BAM files with corresponding contigs
+	mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a bam/mother.bam -a bam/mother.bam.bai -a bam/father.bam -a bam/father.bam.bai -a bam/daughter.bam -a bam/daughter.bam.bai -a bam/son.bam -a bam/son.bam.bai < messages_to_user/message_results_sentBamX;
+
+	else
+		#we found NO genes
+		echo "No sequences found." >X_results.txt;
+		echo "No sequences found." >X_results_sequences.txt;
+	fi
+
+	newline;
+echo -e "Writing output to X_results.txt";
+echo -e "Total number of found X-linked genes: $final_contigs";
+