Mercurial > repos > biomonika > linkyx
diff runLinkYX.sh @ 0:1955f03f092e
initial commit
| author | biomonika <biomonika@psu.edu> |
|---|---|
| date | Tue, 02 Sep 2014 14:16:04 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runLinkYX.sh Tue Sep 02 14:16:04 2014 -0400 @@ -0,0 +1,205 @@ +#!/bin/bash +set -e; mkdir -p vcf; mkdir -p bam; mkdir -p references; mkdir -p scripts; + +email_address_of_user=$1; +name_of_user_computer=$2; + +#read config variables +echo "Reading config...." >&2 +#echo `ls -l` +computer_name=`hostname`; +if [ $computer_name == $name_of_user_computer ]; then + #local_computer + source config_devel; +else + #metacentrum machine + source config_meta; +fi + +#initally we believe that all files are present +ok=true; + +#we get to know if files are provided and if they are in .fasta or .fastq format +#first, rename files to shorter versions (fastq->fq, fasta->fa) +rename 's/fasta$/fa/' input/*.fasta 2>/dev/null +rename 's/fastq$/fq/' input/*.fastq 2>/dev/null + +check_if_exists () +{ +for ARG in "$@"; + do + if [[ -e "input/${ARG}_1.fq" ]]; then + eval ${ARG}_ext=fq + elif [[ -e "input/${ARG}_1.fa" ]]; then + eval ${ARG}_ext=fa + else + ok=false; + fi + done; +} + +check_if_paired_end () +{ +for ARG in "$@"; + do + extension=$(eval "echo \$${ARG}_ext"); #echo $extension; + + if [[ (-e "input/${ARG}_1.${extension}") && (-e "input/${ARG}_2.${extension}") ]]; then + eval ${ARG}_paired=paired; + else + eval ${ARG}_paired=single; + fi + done; +} + +#check if file with reads exist +check_if_exists mother father daughter son; +#and determine whether they are single or paired-end reads +check_if_paired_end mother father daughter son; + +if [ $ok == true ]; then + echo "mother reads names:" $mother_ext $mother_paired; + echo "father reads names:" $father_ext $father_paired; + echo "daughter reads names:" $daughter_ext $daughter_paired; + echo "son reads names:" $son_ext $son_paired; + +if [[ $father_ext =~ (fq) ]] && [[ $son_ext =~ (fq) ]]; then + #no conversion needed, both fastq files + male_ext=fq; +else + #convert to fasta (we need to merge father and son reads in order to build male reference contigs) + #father OR son is fasta + echo "converting.."; + male_ext=fa; + + if [[ father_1.$father_ext =~ (fq) ]]; then + #father is fastq and must be converted to fasta + fastq_to_fasta -n -Q33 -i input/father_1.$father_ext -o input/father_1.$male_ext; + fastq_to_fasta -n -Q33 -i input/father_2.$father_ext -o input/father_2.$male_ext; + father_ext=fa; + + else + #son is fastq and must be converted to fasta + fastq_to_fasta -n -Q33 -i input/son_1.$father_ext -o input/son_1.$male_ext; + fastq_to_fasta -n -Q33 -i input/son_2.$father_ext -o input/son_2.$male_ext; + son_ext=fa; + fi +fi + #merging of father and son files + echo "concatenating.."; + cat input/father_1.$father_ext input/son_1.$son_ext >input/father_and_sons_1.$male_ext; + + if [[ $father_paired =~ (paired) ]] && [[ $son_paired =~ (paired) ]]; then + cat input/father_2.$father_ext input/son_2.$son_ext >input/father_and_sons_2.$male_ext; + fi + + + #NOW run Trinity to assemble mother contigs + #in: mother reads out: reference.fasta + PBS_NUM_PPN_used=$(($PBS_NUM_PPN_used - 2)); + echo "processors used:" $PBS_NUM_PPN_used; + +#<<PREVENT_TRINITY_MOTHER + #assemble mother reads in single or paired-end mode + if [[ $mother_paired =~ (paired) ]]; then + $trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --left input/mother_1.$mother_ext --right input/mother_2.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1 + echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --left input/mother_1.$mother_ext --right input/mother_2.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1"; + else + $trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --single input/mother_1.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1 + echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $mother_ext --JM 20G --single input/mother_1.$mother_ext --output trinity_out_dir_mother 1>$trinity_stdout 2>&1"; + fi + + #retrieve assembled contigs and name them as reference.fasta + mv trinity_out_dir_mother/Trinity.fasta reference_pro.fasta + perl -pe 's/(>[\S]+).+/$1/' reference_pro.fasta >references/reference.fasta; rm reference_pro.fasta; +#PREVENT_TRINITY_MOTHER + + grep ">" references/reference.fasta | sed 's/>//g' >references/reference_contig_names + + #now call to_get_vcf_files.sh to map reads to reference contigs + ./scripts/to_get_vcf_files.sh references/reference.fasta mother_1.$mother_ext mother_2.$mother_ext mother $mother_paired + echo "Mother mapping successful."; + ./scripts/to_get_vcf_files.sh references/reference.fasta father_1.$mother_ext father_2.$mother_ext father $father_paired + echo "Father mapping successful."; + ./scripts/to_get_vcf_files.sh references/reference.fasta daughter_1.$mother_ext daughter_2.$mother_ext daughter $daughter_paired + echo "Daughter mapping successful."; + ./scripts/to_get_vcf_files.sh references/reference.fasta son_1.$mother_ext son_2.$mother_ext son $son_paired + echo "Son mapping successful."; + + wait; + + #NOW run Trinity to assemble male contigs + #in: male reads out: reference_male.fasta +#<<PREVENT_TRINITY_MALES + if [[ $father_paired =~ (paired) ]] && [[ $son_paired =~ (paired) ]]; then + $trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --left input/father_and_sons_1.$male_ext --right input/father_and_sons_2.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1 + echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --left input/father_and_sons_1.$male_ext --right input/father_and_sons_2.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1"; + else + $trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --single input/father_and_sons_1.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1 + echo "$trinity --CPU $PBS_NUM_PPN_used --seqType $male_ext --JM 20G --single input/father_and_sons_1.$male_ext --output trinity_out_dir_male 1>$trinity_stdout 2>&1"; + fi + + #retrieve assembled contigs and name them as reference_male.fasta + mv trinity_out_dir_male/Trinity.fasta reference_male_pro.fasta + perl -pe 's/(>[\S]+).+/$1/' reference_male_pro.fasta >references/reference_male.fasta; rm reference_male_pro.fasta; +#PREVENT_TRINITY_MALES + + grep ">" references/reference_male.fasta | sed 's/>//g' >references/male_contig_names + + #now call to_get_vcf_files.sh to map reads to reference contigs + ./scripts/to_get_Y_variants.sh references/reference_male.fasta mother_1.$mother_ext mother_2.$mother_ext mother $mother_paired + echo "Mother mapping to males successful."; + ./scripts/to_get_Y_variants.sh references/reference_male.fasta father_1.$mother_ext father_2.$mother_ext father $father_paired + echo "Father mapping to males successful."; + ./scripts/to_get_Y_variants.sh references/reference_male.fasta daughter_1.$mother_ext daughter_2.$mother_ext daughter $daughter_paired + echo "Daughter mapping to males successful."; + ./scripts/to_get_Y_variants.sh references/reference_male.fasta son_1.$mother_ext son_2.$mother_ext son $son_paired + echo "Son mapping to males successful."; + + #index bam files + + index_bam_file () + { + for ARG in "$@"; + do + if [ -e "bam/aln_cleaned_sorted_deduplicated_${ARG}.bam.bai" ]; + then + echo "Index already exists. Not being created again."; + else + samtools index bam/aln_cleaned_sorted_deduplicated_${ARG}.bam & + fi + done; + } + + #index bam files + index_bam_file mother father daughter son mother_male father_male daughter_male son_male; + wait; + echo "Indexes successfully created." + + echo -e "Starting analysis"; + + #A variant X1X1 X2Y + dir="A/"; + bash -x ./A.sh $email_address_of_user; + perl scripts/sorting.pl A/; + + #Y variant + dir="Y/"; + bash -x ./Y.sh $email_address_of_user; + + wait; + + #sending results (not BAM files) + mutt -s "LINKYX: Results of computation for Y-linked genes" $email_address_of_user -a Y_results_sequences.txt < messages_to_user/message_results_sentY; + mutt -s "LINKYX: Results of computation for X-linked genes" $email_address_of_user -a X_results_sequences.txt < messages_to_user/message_results_sentX; + + +echo -e "Done." +else + echo "not all input files are present in proper form"; + mutt -s "LINKYX: Not all input files are present in proper form." $email_address_of_user < messages_to_user/message_input_files_error +fi + + + +
