annotate A.sh @ 13:ef5c8ed727b0

toolshed12
author biomonika <biomonika@psu.edu>
date Wed, 10 Sep 2014 10:49:56 -0400
parents 1955f03f092e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
1 #!/bin/bash
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
2 set -e;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
3 email_address_of_user=$1;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
4 echo "email: " $email_address_of_user;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
5
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
6 newline () {
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
7 echo -e "\n---";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
8 }
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
9 #variables
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
10 mother_contigs=`cat references/reference_contig_names | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
11 echo -e "A variant"; newline; dir=A/; mkdir -p $dir;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
12
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
13 #DAUGHTER PREPARE FILES
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
14
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
15 grep -E '#|AF1=0.' vcf/aln_0.85_daughter.vcf | cut -f1 | uniq >${dir}daughter_on_mother_one_allele_heterozygous_ids.vcf
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
16 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_on_mother_one_allele_heterozygous_ids.vcf"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if exists $a{$res}' vcf/aln_0.85_daughter.vcf | sort | uniq | sed '/#/d' >${dir}daughter_on_mother_heterozygous.vcf
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
17
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
18 #be aware that if there are more snps, one with allele frequency '0.' is enough for the whole contig to be in contig_pos_ref_heterozygous
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
19 sed '/#/d' ${dir}daughter_on_mother_heterozygous.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter_heterozygous
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
20 cut -f 1 ${dir}contig_pos_ref_alt_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughter_heterozygous
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
21
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
22 sed '/#/d' vcf/aln_0.85_daughter.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
23
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
24 #FATHER PREPARE FILES
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
25
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
26 sed '/#/d' vcf/aln_0.85_father.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_father
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
27
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
28 #SON PREPARE FILES
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
29
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
30 sed '/#/d' vcf/aln_0.85_son.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_son
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
31
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
32
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
33 echo -e "Step 0 of 3.\nMother must be homozygote.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
34 sed '/#/d' vcf/aln_0.85_mother.vcf | cut -f1 | sort | uniq >${dir}contigs_where_mother_differs_from_mother;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
35 differing=`cat ${dir}contigs_where_mother_differs_from_mother | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
36
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
37 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_mother_differs_from_mother"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_mother_same_as_mother_unchecked
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
38
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
39 #check whether mother really does not contain any father variants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
40 wrong_hits=0;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
41 cat ${dir}contigs_where_mother_same_as_mother_unchecked | (while read line; do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
42 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_father bam/aln_cleaned_sorted_deduplicated_mother.bam A`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
43 if [[ $contig_check -gt 0 ]]; then
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
44 echo "contig: " ${line} "contig_check:" $contig_check >>kontrola;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
45 echo "DID NOT PASS "${line};
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
46 wrong_hits=`expr $wrong_hits + 1`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
47 else
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
48 #echo "0K";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
49 echo ${line} >>${dir}contigs_where_mother_same_as_mother
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
50 fi
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
51 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
52 echo "There is this number of wrong variants: "$wrong_hits;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
53 )
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
54
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
55 same=`cat ${dir}contigs_where_mother_same_as_mother | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
56 echo -e "Number of contigs, where mother differs: $differing.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
57 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
58
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
59 newline;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
60
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
61 echo -e "Step 1 of 3.\nSons must be same as mother.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
62 #First we have to get rid of the intersection with father, possible Y variants do not matter in the calculation
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
63 comm -12 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_father | sort | uniq >${dir}possible_Yvariants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
64 cut -f 1 ${dir}possible_Yvariants | sort | uniq >${dir}contigs_with_possible_Yvariants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
65
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
66 #if these possible_Yvariants are in the daughter, they are not Y variants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
67 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}possible_Yvariants | sort | uniq >${dir}NOT_Yvariants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
68 cut -f 1 ${dir}NOT_Yvariants | sort | uniq >${dir}contigs_with_NOT_Yvariants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
69
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
70 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_with_NOT_Yvariants"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_with_possible_Yvariants | sort | uniq >${dir}contigs_with_Yvariants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
71
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
72 #contig_pos_ref_alt_Yvariants_confirmed; we deplet just those contigs that contain Y variants and then decide if sons are same as mother
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
73
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
74 cat ${dir}contigs_with_Yvariants | (while read line; do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
75 grep ${line} ${dir}contig_pos_ref_alt_father | sort >>${dir}contig_pos_ref_alt_Yvariants_confirmed_pro
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
76 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
77 )
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
78 cat ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro | sort >${dir}contig_pos_ref_alt_Yvariants_confirmed;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
79
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
80 comm -23 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_Yvariants_confirmed | cut -f 1 | sort | uniq >${dir}contigs_where_sons_variation_not_caused_by_father
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
81 #rm ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
82
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
83 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_variation_not_caused_by_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_sons_same_as_mother
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
84
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
85 same=`cat ${dir}contigs_where_sons_same_as_mother | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
86 echo -e "Number of contigs, where sons differ: $differing.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
87 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
88
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
89
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
90 newline;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
91 echo -e "Step 2 of 3.\nFather must differ from mother.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
92 sed '/#/d' vcf/aln_0.85_father.vcf | cut -f1 | sort | uniq >${dir}/contigs_where_father_differs_from_mother;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
93 differing=`cat ${dir}contigs_where_father_differs_from_mother | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
94 echo -e "NUMBER of contigs AFTER just this filter: $differing/$mother_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
95
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
96 newline;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
97
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
98 echo -e "Step 3 of 3.\nDaughter must have mother and father alleles.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
99 #we need daughter to have alleles from both parents (AF stands for 'allele frequency')
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
100
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
101 #unique lines in first file mean that those are not subsets of father variants - otherwise they would be in common
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
102 comm -23 ${dir}contig_pos_ref_alt_daughter_heterozygous ${dir}contig_pos_ref_alt_father | cut -f 1 | sort | uniq >${dir}daughter_NOT_subset_of_father
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
103
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
104 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_NOT_subset_of_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}/contigs_where_daughter_from_parents
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
105
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
106
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
107 same=`cat ${dir}contigs_where_daughter_from_parents | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
108 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
109
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
110 newline;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
111
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
112 echo -e "Step 4 of 3.\nDaughters and sons can not have any intersection.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
113 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}contig_pos_ref_alt_son >${dir}sons_and_daughters_similiar
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
114 cut -f 1 ${dir}sons_and_daughters_similiar | sort | uniq >${dir}contigs_where_sons_and_daughters_similiar
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
115
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
116 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_and_daughters_similiar"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughters_and_sons_different_pro
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
117
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
118 #check whether they are really different (small amount of variants is also wrong)
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
119
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
120 echo -e "daughter contains son variants?"
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
121 #check whether daughter really does not contain any of son variants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
122 wrong_hits=0;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
123 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
124 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_daughter bam/aln_cleaned_sorted_deduplicated_son.bam A`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
125 if [[ $contig_check -gt 0 ]]; then
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
126 echo "DID NOT PASS "${line};
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
127 echo "daughter contains son variants DID NOT PASS "${line} >>kontrola
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
128 wrong_hits=`expr $wrong_hits + 1`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
129 else
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
130 #echo "0K";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
131 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un1
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
132 fi
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
133 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
134 echo "There is this number of wrong variants: "$wrong_hits;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
135 )
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
136
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
137 echo -e "son contains daughter variants?"
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
138 #check whether son really does not contain any of daughter variants
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
139 wrong_hits=0;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
140 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
141 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_son bam/aln_cleaned_sorted_deduplicated_daughter.bam A`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
142 if [[ $contig_check -gt 0 ]]; then
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
143 echo "DID NOT PASS "${line};
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
144 echo "son contains daughter variants DID NOT PASS "${line} >>kontrola
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
145 wrong_hits=`expr $wrong_hits + 1`
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
146 else
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
147 #echo "0K";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
148 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un2
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
149 fi
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
150 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
151 echo "There is this number of wrong variants: "$wrong_hits;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
152 )
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
153
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
154 comm -12 ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 | sort >${dir}contigs_where_daughters_and_sons_different_un
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
155
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
156 cat ${dir}contigs_where_daughters_and_sons_different_un | sort | uniq >${dir}contigs_where_daughters_and_sons_different;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
157
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
158 #rm ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 ${dir}contigs_where_daughters_and_sons_different_un;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
159
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
160 same=`cat ${dir}contigs_where_daughters_and_sons_different | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
161 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
162
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
163 echo -e "Calculating intersection of candidate contigs from steps 0-3.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
164
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
165 comm -12 ${dir}contigs_where_mother_same_as_mother ${dir}contigs_where_sons_same_as_mother | sort >${dir}first_couple;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
166 comm -12 ${dir}contigs_where_father_differs_from_mother ${dir}contigs_where_daughter_from_parents | sort >${dir}second_couple;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
167 comm -12 ${dir}first_couple ${dir}second_couple | sort >${dir}set1
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
168 comm -12 ${dir}contigs_where_daughters_and_sons_different ${dir}set1 | sort >${dir}set2;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
169
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
170 echo -e "Step -1 of 3.\nCoverage must be sufficient.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
171 touch ${dir}contigs_with_sufficient_coverage;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
172
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
173 cat ${dir}set2 | (while read line; do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
174
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
175 coverage_check1=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_mother.bam 2>/dev/null &`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
176 coverage_check2=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_father.bam 2>/dev/null &`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
177 coverage_check3=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_son.bam 2>/dev/null &`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
178 coverage_check4=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_daughter.bam 2>/dev/null &`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
179
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
180 coverage_check=$(($coverage_check1 + $coverage_check2 + $coverage_check3 + $coverage_check4))
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
181 if [[ $coverage_check -gt 0 ]]; then
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
182 echo "LOW COVERAGE "${line};
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
183 else
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
184 #echo "0K";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
185 echo ${line} >>${dir}contigs_with_sufficient_coverage
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
186 fi
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
187 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
188 )
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
189
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
190
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
191 cat ${dir}contigs_with_sufficient_coverage | sort >X_results.txt;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
192 #rm ${dir}first_couple; #rm ${dir}second_couple; #rm ${dir}set1; #rm ${dir}set2;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
193
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
194 #have we found any genes?
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
195
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
196 final_contigs=`cat X_results.txt | wc -l`;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
197 echo -e "$final_contigs contigs found fulfilling criteria.";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
198
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
199
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
200 if [ $final_contigs -gt 0 ]; then
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
201 #we found SOME genes
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
202
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
203 #prepare files for sorting into the categories
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
204 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_mother.bam >${dir}high_score_mother.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
205 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_father.bam >${dir}high_score_father.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
206 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_daughter.bam >${dir}high_score_daughter.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
207 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_son.bam >${dir}high_score_son.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
208 wait;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
209
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
210 samtools index ${dir}high_score_mother.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
211 samtools index ${dir}high_score_father.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
212 samtools index ${dir}high_score_daughter.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
213 samtools index ${dir}high_score_son.bam &
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
214 wait;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
215
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
216 #get sequences based on ids
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
217 perl scripts/get_sequences_based_on_ids.pl references/reference.fasta X_results.txt >X_results_sequences.txt;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
218
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
219 get_results_in_bam_file ()
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
220 {
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
221 for ARG in "$@";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
222 do
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
223 ./scripts/createBamFromResults.sh X_results.txt bam/aln_cleaned_sorted_deduplicated_${ARG}.bam bam/${ARG}.bam;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
224 samtools index bam/${ARG}.bam;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
225 done;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
226 }
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
227
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
228 get_results_in_bam_file mother father daughter son;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
229 wait;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
230 #send BAM files with corresponding contigs
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
231 mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a bam/mother.bam -a bam/mother.bam.bai -a bam/father.bam -a bam/father.bam.bai -a bam/daughter.bam -a bam/daughter.bam.bai -a bam/son.bam -a bam/son.bam.bai < messages_to_user/message_results_sentBamX;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
232
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
233 else
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
234 #we found NO genes
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
235 echo "No sequences found." >X_results.txt;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
236 echo "No sequences found." >X_results_sequences.txt;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
237 fi
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
238
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
239 newline;
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
240 echo -e "Writing output to X_results.txt";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
241 echo -e "Total number of found X-linked genes: $final_contigs";
1955f03f092e initial commit
biomonika <biomonika@psu.edu>
parents:
diff changeset
242