diff runLinkYX.sh @ 0:1955f03f092e

initial commit
author biomonika <biomonika@psu.edu>
date Tue, 02 Sep 2014 14:16:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/runLinkYX.sh	Tue Sep 02 14:16:04 2014 -0400
@@ -0,0 +1,205 @@
+#!/bin/bash
+set -e; mkdir -p vcf; mkdir -p bam; mkdir -p references; mkdir -p scripts;
+
+email_address_of_user=$1;
+name_of_user_computer=$2;
+
+#read config variables
+echo "Reading config...." >&2
+#echo `ls -l`
+computer_name=`hostname`;
+if [ $computer_name == $name_of_user_computer ]; then
+	#local_computer
+	source config_devel;
+else 
+	#metacentrum machine
+	source config_meta;
+fi
+
+#initally we believe that all files are present
+ok=true;
+
+#we get to know if files are provided and if they are in .fasta or .fastq format
+#first, rename files to shorter versions (fastq->fq, fasta->fa)
+rename 's/fasta$/fa/' input/*.fasta 2>/dev/null
+rename 's/fastq$/fq/' input/*.fastq 2>/dev/null
+
+check_if_exists () 
+{
+for ARG in "$@";
+    do
+	if [[ -e "input/${ARG}_1.fq" ]]; then
+		eval ${ARG}_ext=fq
+	elif [[ -e "input/${ARG}_1.fa" ]]; then
+		eval ${ARG}_ext=fa
+	else
+		ok=false;
+	fi
+    done;
+}
+
+check_if_paired_end ()
+{
+for ARG in "$@";
+    do
+	extension=$(eval "echo \$${ARG}_ext"); #echo $extension;
+      
+	if [[ (-e "input/${ARG}_1.${extension}") && (-e "input/${ARG}_2.${extension}") ]]; then
+		eval ${ARG}_paired=paired;
+	else
+		eval ${ARG}_paired=single;
+	fi
+    done;
+}
+
+#check if file with reads exist
+check_if_exists mother father daughter son;
+#and determine whether they are single or paired-end reads
+check_if_paired_end  mother father daughter son;
+
+if [ $ok == true ]; then
+	echo "mother reads names:" $mother_ext $mother_paired;
+	echo "father reads names:" $father_ext $father_paired;
+	echo "daughter reads names:" $daughter_ext $daughter_paired;
+	echo "son reads names:" $son_ext $son_paired;
+
+if [[ $father_ext =~ (fq) ]] && [[ $son_ext =~ (fq) ]]; then
+		#no conversion needed, both fastq files
+		male_ext=fq;
+else
+		#convert to fasta (we need to merge father and son reads in order to build male reference contigs)
+		#father OR son is fasta
+		echo "converting..";
+		male_ext=fa;
+		
+		if [[ father_1.$father_ext =~ (fq) ]]; then
+		#father is fastq and must be converted to fasta
+		  fastq_to_fasta -n -Q33 -i input/father_1.$father_ext -o input/father_1.$male_ext;
+		  fastq_to_fasta -n -Q33 -i input/father_2.$father_ext -o input/father_2.$male_ext;
+		  father_ext=fa;
+		
+		else 
+		#son is fastq and must be converted to fasta
+		  fastq_to_fasta -n -Q33 -i input/son_1.$father_ext -o input/son_1.$male_ext;
+		  fastq_to_fasta -n -Q33 -i input/son_2.$father_ext -o input/son_2.$male_ext;
+		  son_ext=fa;
+		fi
+fi
+		#merging of father and son files
+		echo "concatenating..";
+		  cat input/father_1.$father_ext input/son_1.$son_ext >input/father_and_sons_1.$male_ext;
+		
+		if [[ $father_paired =~ (paired) ]] && [[ $son_paired =~ (paired) ]]; then
+		  cat input/father_2.$father_ext input/son_2.$son_ext >input/father_and_sons_2.$male_ext;
+		fi
+	
+	
+	#NOW run Trinity to assemble mother contigs
+	#in: mother reads out: reference.fasta
+	PBS_NUM_PPN_used=$(($PBS_NUM_PPN_used - 2));
+	echo "processors used:" $PBS_NUM_PPN_used;
+
+#<<PREVENT_TRINITY_MOTHER
+	#assemble mother reads in single or paired-end mode
+	if [[ $mother_paired =~ (paired) ]]; then
+	    $trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --left input/mother_1.$mother_ext --right input/mother_2.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1
+	    echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --left input/mother_1.$mother_ext --right input/mother_2.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1";
+	else
+	    $trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --single input/mother_1.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1
+	    echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --single input/mother_1.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1";
+	fi
+
+	#retrieve assembled contigs and name them as reference.fasta
+	mv trinity_out_dir_mother/Trinity.fasta reference_pro.fasta
+	perl -pe 's/(>[\S]+).+/$1/' reference_pro.fasta >references/reference.fasta; rm reference_pro.fasta;
+#PREVENT_TRINITY_MOTHER
+
+	grep ">" references/reference.fasta | sed 's/>//g' >references/reference_contig_names
+
+	#now call to_get_vcf_files.sh to map reads to reference contigs
+	./scripts/to_get_vcf_files.sh references/reference.fasta mother_1.$mother_ext mother_2.$mother_ext mother $mother_paired
+	echo "Mother mapping successful.";
+	./scripts/to_get_vcf_files.sh references/reference.fasta father_1.$mother_ext father_2.$mother_ext father $father_paired
+	echo "Father mapping successful.";
+	./scripts/to_get_vcf_files.sh references/reference.fasta daughter_1.$mother_ext daughter_2.$mother_ext daughter $daughter_paired
+	echo "Daughter mapping successful.";
+	./scripts/to_get_vcf_files.sh references/reference.fasta son_1.$mother_ext son_2.$mother_ext son $son_paired
+	echo "Son mapping successful.";
+
+	wait;
+
+	#NOW run Trinity to assemble male contigs
+	#in: male reads out: reference_male.fasta
+#<<PREVENT_TRINITY_MALES
+	if [[ $father_paired =~ (paired) ]] && [[ $son_paired =~ (paired) ]]; then
+	    $trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --left input/father_and_sons_1.$male_ext --right input/father_and_sons_2.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1
+	    echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --left input/father_and_sons_1.$male_ext --right input/father_and_sons_2.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1";
+	else
+	    $trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --single input/father_and_sons_1.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1
+	    echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --single input/father_and_sons_1.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1";
+	fi
+
+	#retrieve assembled contigs and name them as reference_male.fasta
+	mv trinity_out_dir_male/Trinity.fasta reference_male_pro.fasta
+	perl -pe 's/(>[\S]+).+/$1/' reference_male_pro.fasta >references/reference_male.fasta; rm reference_male_pro.fasta;
+#PREVENT_TRINITY_MALES
+
+	grep ">" references/reference_male.fasta | sed 's/>//g' >references/male_contig_names
+
+	#now call to_get_vcf_files.sh to map reads to reference contigs
+	./scripts/to_get_Y_variants.sh references/reference_male.fasta mother_1.$mother_ext mother_2.$mother_ext mother $mother_paired
+	echo "Mother mapping to males successful.";
+	./scripts/to_get_Y_variants.sh references/reference_male.fasta father_1.$mother_ext father_2.$mother_ext father $father_paired
+	echo "Father mapping to males successful.";
+	./scripts/to_get_Y_variants.sh references/reference_male.fasta daughter_1.$mother_ext daughter_2.$mother_ext daughter $daughter_paired
+	echo "Daughter mapping to males successful.";
+	./scripts/to_get_Y_variants.sh references/reference_male.fasta son_1.$mother_ext son_2.$mother_ext son $son_paired
+	echo "Son mapping to males successful.";
+
+	#index bam files
+	
+	index_bam_file () 
+	{
+	  for ARG in "$@";
+	    do
+		if [ -e "bam/aln_cleaned_sorted_deduplicated_${ARG}.bam.bai" ];
+		then	
+			echo "Index already exists. Not being created again.";
+		else
+			samtools index bam/aln_cleaned_sorted_deduplicated_${ARG}.bam &
+		fi
+	    done;
+	}
+
+	#index bam files
+	index_bam_file  mother father daughter son mother_male father_male daughter_male son_male;
+	wait;
+	echo "Indexes successfully created."
+	
+	echo -e "Starting analysis";
+
+	#A variant X1X1 X2Y
+	dir="A/";
+	bash -x ./A.sh $email_address_of_user;
+	perl scripts/sorting.pl A/;
+
+	#Y variant
+	dir="Y/";
+	bash -x ./Y.sh $email_address_of_user;
+	
+	wait;
+	
+	#sending results (not BAM files)
+	mutt -s "LINKYX: Results of computation for Y-linked genes" $email_address_of_user -a Y_results_sequences.txt < messages_to_user/message_results_sentY;
+	mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a X_results_sequences.txt < messages_to_user/message_results_sentX;
+
+
+echo -e "Done."
+else 
+	echo "not all input files are present in proper form";
+	mutt -s "LINKYX: Not all input files are present in proper form." $email_address_of_user < messages_to_user/message_input_files_error
+fi
+
+
+
+