view ncPRO-QC.sh @ 4:b044e98c81d2 draft

Uploaded
author jbrayet
date Thu, 28 Jan 2016 07:43:58 -0500
parents f8c76a03d758
children
line wrap: on
line source

#!/bin/bash

# Copyleft ↄ⃝ 2012 Institut Curie
# Author(s): Jocelyn Brayet, Laurene Syx, Chongjian Chen, Nicolas Servant(Institut Curie) 2012 - 2015
# Contact: bioinfo.ncproseq@curie.fr
# This software is distributed without any guarantee under the terms of the GNU General
# Public License, either Version 2, June 1991 or Version 3, June 2007.

while getopts "i:s:n:g:m:o:f:r:h:p:l:t:a:q:d:" optionName; do
case "$optionName" in

i) INPUT+="$OPTARG,";;
s) SAMPLENAME+="$OPTARG,";;
n) PROJECTNAME="$OPTARG";;
g) GENOME="$OPTARG";;
m) RMSK="$OPTARG";;
o) BAM_OUT+="$OPTARG,";;
f) RFAM="$OPTARG";;
r) REPORT="$OPTARG";;
h) HTML_REPORT="$OPTARG";;
p) PDF_REPORT="$OPTARG";;
l) LOG_FILE="$OPTARG";;
t) INPUT_TYPE="$OPTARG";;
a) ALIGNMENT="$OPTARG";;
q) FASTQ_FORMAT+="$OPTARG,";;
d) ROOT_DIR="$OPTARG";;

esac
done

##### ncPRO-seq annotation - Galaxy #####

GENOME_2=`echo $GENOME | cut -d"_" -f2`

databasePath=$ROOT_DIR/database/files

mkdir -p $databasePath/ncproseqAnnotation
mkdir -p $databasePath/ncproseqAnnotation/annotation
annotationPath=$databasePath/ncproseqAnnotation/annotation
[ ! -d $annotationPath/$GENOME_2 ] && wget http://ncpro.curie.fr/ncproseq/install_dir/annotation/$GENOME.tar.gz -P $annotationPath -q && cd $annotationPath && tar -zxf $GENOME.tar.gz && rm -rf $GENOME.tar.gz

#########


sampleArray=(${INPUT//,/ })
nameArray=(${SAMPLENAME//,/ })
bamArray=(${BAM_OUT//,/ })

if [[ $REPORT == "pdf" ]];then
    OUTPUT_PATH_DIR=`dirname $LOG_FILE`
    OUTPUT_PATH_NAME=`basename $LOG_FILE .dat`
else
    OUTPUT_PATH_DIR=`dirname $HTML_REPORT`
    OUTPUT_PATH_NAME=`basename $HTML_REPORT .dat`
fi

OUTPUT_PATH="${OUTPUT_PATH_DIR}/${OUTPUT_PATH_NAME}_files"



#ME
mkdir -p $OUTPUT_PATH

chmod 777 -R $OUTPUT_PATH

#VERSION=`echo $OUTPUT_PATH | cut -d"/" -f3`
#VERSION=`echo $VERSION | cut -d"_" -f2`

#DEBUG_MODE

DEBUG_MODE="on"
DEBUG="/dev/null"

if [[ $DEBUG_MODE == "on" ]];then

	DEBUG="$OUTPUT_PATH/ncPRO-QC.debug"

fi

#Deploy ncPRO directories structure

/usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-deploy -o $OUTPUT_PATH > $DEBUG

echo "$INPUT" >> $DEBUG
echo "$SAMPLENAME" >> $DEBUG
echo "$PROJECTNAME" >> $DEBUG
echo "$GENOME_2" >> $DEBUG
echo "$RMSK" >> $DEBUG
echo "$BAM_OUT" >> $DEBUG
echo "$RFAM" >> $DEBUG
echo "$REPORT" >> $DEBUG
echo "$HTML_REPORT" >> $DEBUG
echo "$PDF_REPORT" >> $DEBUG
echo "$LOG_FILE" >> $DEBUG
echo "$INPUT_TYPE" >> $DEBUG
echo "$ALIGNMENT" >> $DEBUG
        
echo "$sampleArray" >> $DEBUG
echo "$nameArray" >> $DEBUG
echo "$bamArray" >> $DEBUG

#Go to working directory

cp -rf /usr/curie_ngs/ncproseq_v1.6.5/annotation/*.item $annotationPath
cp -rf /usr/curie_ngs/ncproseq_v1.6.5/annotation/*_items.txt $annotationPath

if [[ ! -L "$databasePath/ncproseqBowtieIndexes" && ! -d "$databasePath/ncproseqBowtieIndexes" ]]
then
    mkdir -p $databasePath/ncproseqBowtieIndexes
fi

cd /usr/curie_ngs
rm -rf bowtie_indexes
ln -s $databasePath/ncproseqBowtieIndexes bowtie_indexes

cd $OUTPUT_PATH

rm annotation

ln -s $annotationPath annotation

rm manuals

#Create symbolic link to input 
#********************************************************************************* NEW: for BAM files, check if reads in input are grouped or not and change cmd accordignly ***********
if [[ $INPUT_TYPE == "fastq" ]];then

    count=0
    for i in ${sampleArray[*]}
    do
            ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.fastq
            count=$(( $count + 1 ))
        done
fi

if [[ $INPUT_TYPE == "bam" ]];then

    count=0
    for i in ${sampleArray[*]}
        do
            ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.bam
            count=$(( $count + 1 ))
            
            #check if grouped
            RG=`samtools view $i | awk --posix 'BEGIN {RG=1} { if ($1 !~ /^[0-9]{1,}_[0-9]{1,}$/) {RG=0 ; exit} } END { print RG}'`
        done	
fi

#Edit config-ncrna.txt ##### A REFAIRE ####
CONFIG_FILE=config-ncrna.txt

sed -i "s:^BOWTIE_GENOME_REFERENCE =.*$:BOWTIE_GENOME_REFERENCE = $GENOME_2:g" $CONFIG_FILE
sed -i "s:^ORGANISM.*$:ORGANISM = $GENOME_2:g" $CONFIG_FILE

sed -i "/N_CPU/c\N_CPU = 6" $CONFIG_FILE  #****** Make sure this value matches universe.ini files
sed -i "s/test_Curie/$PROJECTNAME/g" $CONFIG_FILE
sed -i "s:^FASTQ_FORMAT =.*$:FASTQ_FORMAT = $FASTQ_FORMAT:g" $CONFIG_FILE


if [[ -f "$annotationPath/$GENOME_2/cluster_pirna.gff" ]]
then
    ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/cluster_pirna.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
else
    if [[ -f "$annotationPath/$GENOME_2/pirna.gff" ]]
    then
        ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/pirna.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
    else
    ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
    fi
fi

sed -i "s:^ANNO_CATALOG.*$:ANNO_CATALOG = $ANNO_CATALOG:g" $CONFIG_FILE

#Build command line 

if [[ $INPUT_TYPE == "fastq" ]];then

	if [[ $ALIGNMENT == "True" ]]; then

		
		COMMAND_LINE="-c $CONFIG_FILE -s processRead -s mapGenome -s mapGenomeStat -s mapAnnOverview"

		if [[ $RFAM == "True" ]];then

			COMMAND_LINE="$COMMAND_LINE -s overviewRfam -s overviewRmsk"

		fi

	else

		COMMAND_LINE="-c $CONFIG_FILE -s processRead"

	fi

fi



#### NEW if BAM already grouped, omit [ -s processBam ] + put input.bam in /bowtie_results
if [[ $INPUT_TYPE == "bam" ]];then

	if [[ $RG = 0 ]]; then #if bam file is NOT grouped
	
		COMMAND_LINE="-c $CONFIG_FILE -s processBam -s mapGenomeStat -s mapAnnOverview"

	else 
	
            count=0
            for i in ${sampleArray[*]}
            do
                ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.bam
                count=$(( $count + 1 ))
            done	
	    COMMAND_LINE="-c $CONFIG_FILE -s mapGenomeStat -s mapAnnOverview"
	fi
	
		
	if [[ $RFAM == "True" ]];then

		COMMAND_LINE="$COMMAND_LINE -s overviewRfam -s overviewRmsk"

	fi

fi

#************************* new

##### Function to create HTML report in Galaxy ######

function createHtmlReport
{
    
    # galaxy part :

    #Reformat html output

    tr '>' '\n' < ${OUTPUT_PATH}/report.html | sed -ne "s@.*<img src='\([^']*\)'.*@\1@p" -e 's@.*<img src="\([^"]*\)".*@\1@p' > ${OUTPUT_PATH}/img_list.txt

    NC_LOGO=`head -1 ${OUTPUT_PATH}/img_list.txt`

    #HTTP_PATH=`echo $OUTPUT_PATH | sed "s/\/data\/kdi_${VERSION}/http:\/\/data-kdi-${VERSION}.curie.fr\/file/g"`

    i=1

    if [[ $INPUT_TYPE == "fastq" ]];then

    	echo "<p align=center><img src=$NC_LOGO></p><p align=center><b><u>QUALITY CONTROL</u></b></p>" > ${HTML_REPORT}
    fi

    if [[ $INPUT_TYPE == "bam" ]];then

	echo "<p align=center><img src=$NC_LOGO></p><p align=center><b><u>DATA MAPPING</u></b></p>" > $HTML_REPORT
    fi

    while read line 
    do 

	if [[ $line != "" ]];then

		if [[ $i == "3" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<p align=center><b>Base Composition Information</b></p><table align=center><tr><td align=center><a href=$ahref><img src=$line></a></td>" >> $HTML_REPORT
			fi

			if [[ $INPUT_TYPE == "bam" ]];then
				echo "<p align=center><b>Mapping proportions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

		fi

		if [[ $i == "4" ]];then
			
			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<td align=center><a href=$ahref><img src=$line></td></a></td><table>" >> $HTML_REPORT
			fi

			if [[ $INPUT_TYPE == "bam" ]];then
				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

		fi

		if [[ $i == "5" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

			if [[ $INPUT_TYPE == "bam" ]];then
				echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

		fi

		if [[ $i == "6" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<hr width=500><p align=center><b>Quality Score</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

			if [[ $INPUT_TYPE == "bam" ]];then
				echo "<hr size=20><p align=center style=font-size:25px;><b><u>ncRNAs OVERVIEW</u></b><p align=center><b>Reads Annotation Overview</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

		fi

		if [[ $i == "7" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`
			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi
			if [[ $INPUT_TYPE == "bam" ]];then

				if [[ $RFAM == "True" ]];then
					echo "<p align=center><b>Precursor miRNAs Annotation</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
				else
					break
				fi
			fi
		fi

		if [[ $i == "8" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`
			if [[ $INPUT_TYPE == "fastq" ]];then

				if [[ $ALIGNMENT == "True" ]]; then

					echo "<hr size=20><p align=center style=font-size:25px;><b><u>DATA MAPPING</u></b></p><p align=center><b>Mapping proportions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
				else
					break
				fi
			fi
			if [[ $INPUT_TYPE == "bam" ]];then
			
				echo "<p align=center><b>Annotation of ncRNAs from RFAM</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi
		fi

		if [[ $i == "9" ]];then
		
			ahref=`sed "s/html\/thumb/pic/" <<< $line`
			if [[ $INPUT_TYPE == "fastq" ]];then
				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
			fi

			if [[ $INPUT_TYPE == "bam" ]];then

				echo "<p align=center><b>Annotation of Repetitive Regions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
				break
			fi

		fi

		if [[ $i == "10" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT

		fi

		if [[ $i == "11" ]];then

			ahref=`sed "s/html\/thumb/pic/" <<< $line`

			echo "<hr size=20><p align=center style=font-size:25px;><b><u>ncRNAs OVERVIEW</u></b><p align=center><b>Reads Annotation Overview</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT

		fi

		if [[ $RFAM == "True" ]];then

			if [[ $i == "12" ]];then

				ahref=`sed "s/html\/thumb/pic/" <<< $line`

				echo "<p align=center><b>Precursor miRNAs Annotation</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT

			fi

			if [[ $i == "13" ]];then

				ahref=`sed "s/html\/thumb/pic/" <<< $line`

				echo "<p align=center><b>Annotation of ncRNAs from RFAM</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT

			fi

			if [[ $i == "14" ]];then

				ahref=`sed "s/html\/thumb/pic/" <<< $line`

				echo "<p align=center><b>Annotation of Repetitive Regions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT

			fi

		fi

		i=$(( $i + 1 ))
	
	fi


    done < ${OUTPUT_PATH}/img_list.txt

    rm ${OUTPUT_PATH}/img_list.txt
    
    
    
}


#Launch ncPRO analysis

#FAIRE une boucle pour -s hrml_builder ou -s pdf_builder

if [[ $REPORT == "all" ]];then

    
    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE -s html_builder -s pdf_builder>> $DEBUG

    createHtmlReport

    cp ${OUTPUT_PATH}/Analysis_report_ncPRO-seq.pdf $PDF_REPORT

fi

if [[ $REPORT == "pdf" ]];then


    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE  -s pdf_builder>> $DEBUG

    cp ${OUTPUT_PATH}/Analysis_report_ncPRO-seq.pdf $PDF_REPORT


fi

if [[ $REPORT == "html" ]];then


    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE -s html_builder>> $DEBUG

    createHtmlReport

fi

#Galaxy output handling

cp ${OUTPUT_PATH}/pipeline.log $LOG_FILE

if [[ $ALIGNMENT == "True" ]];then
    
    count=0
    for i in ${bamArray[*]}
    do
        cp ${OUTPUT_PATH}/bowtie_results/${nameArray[count]/_/.}_$GENOME_2.bam $i 
        count=$(( $count + 1 ))
    done
fi