|
0
|
1 #!/bin/bash
|
|
|
2 set -e;
|
|
|
3 email_address_of_user=$1;
|
|
|
4 echo "email: " $email_address_of_user;
|
|
|
5
|
|
|
6 newline () {
|
|
|
7 echo -e "\n---";
|
|
|
8 }
|
|
|
9 #variables
|
|
|
10 mother_contigs=`cat references/reference_contig_names | wc -l`;
|
|
|
11 echo -e "A variant"; newline; dir=A/; mkdir -p $dir;
|
|
|
12
|
|
|
13 #DAUGHTER PREPARE FILES
|
|
|
14
|
|
|
15 grep -E '#|AF1=0.' vcf/aln_0.85_daughter.vcf | cut -f1 | uniq >${dir}daughter_on_mother_one_allele_heterozygous_ids.vcf
|
|
|
16 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_on_mother_one_allele_heterozygous_ids.vcf"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if exists $a{$res}' vcf/aln_0.85_daughter.vcf | sort | uniq | sed '/#/d' >${dir}daughter_on_mother_heterozygous.vcf
|
|
|
17
|
|
|
18 #be aware that if there are more snps, one with allele frequency '0.' is enough for the whole contig to be in contig_pos_ref_heterozygous
|
|
|
19 sed '/#/d' ${dir}daughter_on_mother_heterozygous.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter_heterozygous
|
|
|
20 cut -f 1 ${dir}contig_pos_ref_alt_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughter_heterozygous
|
|
|
21
|
|
|
22 sed '/#/d' vcf/aln_0.85_daughter.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_daughter
|
|
|
23
|
|
|
24 #FATHER PREPARE FILES
|
|
|
25
|
|
|
26 sed '/#/d' vcf/aln_0.85_father.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_father
|
|
|
27
|
|
|
28 #SON PREPARE FILES
|
|
|
29
|
|
|
30 sed '/#/d' vcf/aln_0.85_son.vcf | perl -pe 's/(comp\S+)\t(\d+)\t\.\t(\S+)\t(\S+).+/$1\t$2\t$3\t$4/' | sort >${dir}contig_pos_ref_alt_son
|
|
|
31
|
|
|
32
|
|
|
33 echo -e "Step 0 of 3.\nMother must be homozygote.";
|
|
|
34 sed '/#/d' vcf/aln_0.85_mother.vcf | cut -f1 | sort | uniq >${dir}contigs_where_mother_differs_from_mother;
|
|
|
35 differing=`cat ${dir}contigs_where_mother_differs_from_mother | wc -l`;
|
|
|
36
|
|
|
37 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_mother_differs_from_mother"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_mother_same_as_mother_unchecked
|
|
|
38
|
|
|
39 #check whether mother really does not contain any father variants
|
|
|
40 wrong_hits=0;
|
|
|
41 cat ${dir}contigs_where_mother_same_as_mother_unchecked | (while read line; do
|
|
|
42 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_father bam/aln_cleaned_sorted_deduplicated_mother.bam A`
|
|
|
43 if [[ $contig_check -gt 0 ]]; then
|
|
|
44 echo "contig: " ${line} "contig_check:" $contig_check >>kontrola;
|
|
|
45 echo "DID NOT PASS "${line};
|
|
|
46 wrong_hits=`expr $wrong_hits + 1`
|
|
|
47 else
|
|
|
48 #echo "0K";
|
|
|
49 echo ${line} >>${dir}contigs_where_mother_same_as_mother
|
|
|
50 fi
|
|
|
51 done;
|
|
|
52 echo "There is this number of wrong variants: "$wrong_hits;
|
|
|
53 )
|
|
|
54
|
|
|
55 same=`cat ${dir}contigs_where_mother_same_as_mother | wc -l`;
|
|
|
56 echo -e "Number of contigs, where mother differs: $differing.";
|
|
|
57 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
|
|
|
58
|
|
|
59 newline;
|
|
|
60
|
|
|
61 echo -e "Step 1 of 3.\nSons must be same as mother.";
|
|
|
62 #First we have to get rid of the intersection with father, possible Y variants do not matter in the calculation
|
|
|
63 comm -12 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_father | sort | uniq >${dir}possible_Yvariants
|
|
|
64 cut -f 1 ${dir}possible_Yvariants | sort | uniq >${dir}contigs_with_possible_Yvariants
|
|
|
65
|
|
|
66 #if these possible_Yvariants are in the daughter, they are not Y variants
|
|
|
67 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}possible_Yvariants | sort | uniq >${dir}NOT_Yvariants
|
|
|
68 cut -f 1 ${dir}NOT_Yvariants | sort | uniq >${dir}contigs_with_NOT_Yvariants
|
|
|
69
|
|
|
70 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_with_NOT_Yvariants"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_with_possible_Yvariants | sort | uniq >${dir}contigs_with_Yvariants
|
|
|
71
|
|
|
72 #contig_pos_ref_alt_Yvariants_confirmed; we deplet just those contigs that contain Y variants and then decide if sons are same as mother
|
|
|
73
|
|
|
74 cat ${dir}contigs_with_Yvariants | (while read line; do
|
|
|
75 grep ${line} ${dir}contig_pos_ref_alt_father | sort >>${dir}contig_pos_ref_alt_Yvariants_confirmed_pro
|
|
|
76 done;
|
|
|
77 )
|
|
|
78 cat ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro | sort >${dir}contig_pos_ref_alt_Yvariants_confirmed;
|
|
|
79
|
|
|
80 comm -23 ${dir}contig_pos_ref_alt_son ${dir}contig_pos_ref_alt_Yvariants_confirmed | cut -f 1 | sort | uniq >${dir}contigs_where_sons_variation_not_caused_by_father
|
|
|
81 #rm ${dir}contig_pos_ref_alt_Yvariants_confirmed_pro;
|
|
|
82
|
|
|
83 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_variation_not_caused_by_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' references/reference_contig_names | sort | uniq >${dir}contigs_where_sons_same_as_mother
|
|
|
84
|
|
|
85 same=`cat ${dir}contigs_where_sons_same_as_mother | wc -l`;
|
|
|
86 echo -e "Number of contigs, where sons differ: $differing.";
|
|
|
87 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
|
|
|
88
|
|
|
89
|
|
|
90 newline;
|
|
|
91 echo -e "Step 2 of 3.\nFather must differ from mother.";
|
|
|
92 sed '/#/d' vcf/aln_0.85_father.vcf | cut -f1 | sort | uniq >${dir}/contigs_where_father_differs_from_mother;
|
|
|
93 differing=`cat ${dir}contigs_where_father_differs_from_mother | wc -l`;
|
|
|
94 echo -e "NUMBER of contigs AFTER just this filter: $differing/$mother_contigs";
|
|
|
95
|
|
|
96 newline;
|
|
|
97
|
|
|
98 echo -e "Step 3 of 3.\nDaughter must have mother and father alleles.";
|
|
|
99 #we need daughter to have alleles from both parents (AF stands for 'allele frequency')
|
|
|
100
|
|
|
101 #unique lines in first file mean that those are not subsets of father variants - otherwise they would be in common
|
|
|
102 comm -23 ${dir}contig_pos_ref_alt_daughter_heterozygous ${dir}contig_pos_ref_alt_father | cut -f 1 | sort | uniq >${dir}daughter_NOT_subset_of_father
|
|
|
103
|
|
|
104 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/daughter_NOT_subset_of_father"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}/contigs_where_daughter_from_parents
|
|
|
105
|
|
|
106
|
|
|
107 same=`cat ${dir}contigs_where_daughter_from_parents | wc -l`;
|
|
|
108 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
|
|
|
109
|
|
|
110 newline;
|
|
|
111
|
|
|
112 echo -e "Step 4 of 3.\nDaughters and sons can not have any intersection.";
|
|
|
113 comm -12 ${dir}contig_pos_ref_alt_daughter ${dir}contig_pos_ref_alt_son >${dir}sons_and_daughters_similiar
|
|
|
114 cut -f 1 ${dir}sons_and_daughters_similiar | sort | uniq >${dir}contigs_where_sons_and_daughters_similiar
|
|
|
115
|
|
|
116 perl -MFile::Slurp -ne 'BEGIN{@a{split(/\s/,read_file("A/contigs_where_sons_and_daughters_similiar"))}=()}; $b=$_; @data = split(/\s/, $b); $res=$data[0]; print if !exists $a{$res}' ${dir}contigs_where_daughter_heterozygous | sort | uniq >${dir}contigs_where_daughters_and_sons_different_pro
|
|
|
117
|
|
|
118 #check whether they are really different (small amount of variants is also wrong)
|
|
|
119
|
|
|
120 echo -e "daughter contains son variants?"
|
|
|
121 #check whether daughter really does not contain any of son variants
|
|
|
122 wrong_hits=0;
|
|
|
123 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
|
|
|
124 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_daughter bam/aln_cleaned_sorted_deduplicated_son.bam A`
|
|
|
125 if [[ $contig_check -gt 0 ]]; then
|
|
|
126 echo "DID NOT PASS "${line};
|
|
|
127 echo "daughter contains son variants DID NOT PASS "${line} >>kontrola
|
|
|
128 wrong_hits=`expr $wrong_hits + 1`
|
|
|
129 else
|
|
|
130 #echo "0K";
|
|
|
131 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un1
|
|
|
132 fi
|
|
|
133 done;
|
|
|
134 echo "There is this number of wrong variants: "$wrong_hits;
|
|
|
135 )
|
|
|
136
|
|
|
137 echo -e "son contains daughter variants?"
|
|
|
138 #check whether son really does not contain any of daughter variants
|
|
|
139 wrong_hits=0;
|
|
|
140 cat ${dir}contigs_where_daughters_and_sons_different_pro | (while read line; do
|
|
|
141 contig_check=`./scripts/test.sh ${line} 4 ${dir}contig_pos_ref_alt_son bam/aln_cleaned_sorted_deduplicated_daughter.bam A`
|
|
|
142 if [[ $contig_check -gt 0 ]]; then
|
|
|
143 echo "DID NOT PASS "${line};
|
|
|
144 echo "son contains daughter variants DID NOT PASS "${line} >>kontrola
|
|
|
145 wrong_hits=`expr $wrong_hits + 1`
|
|
|
146 else
|
|
|
147 #echo "0K";
|
|
|
148 echo ${line} >>${dir}contigs_where_daughters_and_sons_different_un2
|
|
|
149 fi
|
|
|
150 done;
|
|
|
151 echo "There is this number of wrong variants: "$wrong_hits;
|
|
|
152 )
|
|
|
153
|
|
|
154 comm -12 ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 | sort >${dir}contigs_where_daughters_and_sons_different_un
|
|
|
155
|
|
|
156 cat ${dir}contigs_where_daughters_and_sons_different_un | sort | uniq >${dir}contigs_where_daughters_and_sons_different;
|
|
|
157
|
|
|
158 #rm ${dir}contigs_where_daughters_and_sons_different_un1 ${dir}contigs_where_daughters_and_sons_different_un2 ${dir}contigs_where_daughters_and_sons_different_un;
|
|
|
159
|
|
|
160 same=`cat ${dir}contigs_where_daughters_and_sons_different | wc -l`;
|
|
|
161 echo -e "NUMBER of contigs AFTER just this filter: $same/$mother_contigs";
|
|
|
162
|
|
|
163 echo -e "Calculating intersection of candidate contigs from steps 0-3.";
|
|
|
164
|
|
|
165 comm -12 ${dir}contigs_where_mother_same_as_mother ${dir}contigs_where_sons_same_as_mother | sort >${dir}first_couple;
|
|
|
166 comm -12 ${dir}contigs_where_father_differs_from_mother ${dir}contigs_where_daughter_from_parents | sort >${dir}second_couple;
|
|
|
167 comm -12 ${dir}first_couple ${dir}second_couple | sort >${dir}set1
|
|
|
168 comm -12 ${dir}contigs_where_daughters_and_sons_different ${dir}set1 | sort >${dir}set2;
|
|
|
169
|
|
|
170 echo -e "Step -1 of 3.\nCoverage must be sufficient.";
|
|
|
171 touch ${dir}contigs_with_sufficient_coverage;
|
|
|
172
|
|
|
173 cat ${dir}set2 | (while read line; do
|
|
|
174
|
|
|
175 coverage_check1=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_mother.bam 2>/dev/null &`;
|
|
|
176 coverage_check2=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_father.bam 2>/dev/null &`;
|
|
|
177 coverage_check3=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_son.bam 2>/dev/null &`;
|
|
|
178 coverage_check4=`./scripts/get_average_contig_coverage.sh 0.75 ${line} bam/aln_cleaned_sorted_deduplicated_daughter.bam 2>/dev/null &`;
|
|
|
179
|
|
|
180 coverage_check=$(($coverage_check1 + $coverage_check2 + $coverage_check3 + $coverage_check4))
|
|
|
181 if [[ $coverage_check -gt 0 ]]; then
|
|
|
182 echo "LOW COVERAGE "${line};
|
|
|
183 else
|
|
|
184 #echo "0K";
|
|
|
185 echo ${line} >>${dir}contigs_with_sufficient_coverage
|
|
|
186 fi
|
|
|
187 done;
|
|
|
188 )
|
|
|
189
|
|
|
190
|
|
|
191 cat ${dir}contigs_with_sufficient_coverage | sort >X_results.txt;
|
|
|
192 #rm ${dir}first_couple; #rm ${dir}second_couple; #rm ${dir}set1; #rm ${dir}set2;
|
|
|
193
|
|
|
194 #have we found any genes?
|
|
|
195
|
|
|
196 final_contigs=`cat X_results.txt | wc -l`;
|
|
|
197 echo -e "$final_contigs contigs found fulfilling criteria.";
|
|
|
198
|
|
|
199
|
|
|
200 if [ $final_contigs -gt 0 ]; then
|
|
|
201 #we found SOME genes
|
|
|
202
|
|
|
203 #prepare files for sorting into the categories
|
|
|
204 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_mother.bam >${dir}high_score_mother.bam &
|
|
|
205 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_father.bam >${dir}high_score_father.bam &
|
|
|
206 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_daughter.bam >${dir}high_score_daughter.bam &
|
|
|
207 samtools view -bq 1 bam/aln_cleaned_sorted_deduplicated_son.bam >${dir}high_score_son.bam &
|
|
|
208 wait;
|
|
|
209
|
|
|
210 samtools index ${dir}high_score_mother.bam &
|
|
|
211 samtools index ${dir}high_score_father.bam &
|
|
|
212 samtools index ${dir}high_score_daughter.bam &
|
|
|
213 samtools index ${dir}high_score_son.bam &
|
|
|
214 wait;
|
|
|
215
|
|
|
216 #get sequences based on ids
|
|
|
217 perl scripts/get_sequences_based_on_ids.pl references/reference.fasta X_results.txt >X_results_sequences.txt;
|
|
|
218
|
|
|
219 get_results_in_bam_file ()
|
|
|
220 {
|
|
|
221 for ARG in "$@";
|
|
|
222 do
|
|
|
223 ./scripts/createBamFromResults.sh X_results.txt bam/aln_cleaned_sorted_deduplicated_${ARG}.bam bam/${ARG}.bam;
|
|
|
224 samtools index bam/${ARG}.bam;
|
|
|
225 done;
|
|
|
226 }
|
|
|
227
|
|
|
228 get_results_in_bam_file mother father daughter son;
|
|
|
229 wait;
|
|
|
230 #send BAM files with corresponding contigs
|
|
|
231 mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a bam/mother.bam -a bam/mother.bam.bai -a bam/father.bam -a bam/father.bam.bai -a bam/daughter.bam -a bam/daughter.bam.bai -a bam/son.bam -a bam/son.bam.bai < messages_to_user/message_results_sentBamX;
|
|
|
232
|
|
|
233 else
|
|
|
234 #we found NO genes
|
|
|
235 echo "No sequences found." >X_results.txt;
|
|
|
236 echo "No sequences found." >X_results_sequences.txt;
|
|
|
237 fi
|
|
|
238
|
|
|
239 newline;
|
|
|
240 echo -e "Writing output to X_results.txt";
|
|
|
241 echo -e "Total number of found X-linked genes: $final_contigs";
|
|
|
242
|