changeset 2:f8c76a03d758 draft

Uploaded
author jbrayet
date Thu, 28 Jan 2016 07:43:32 -0500
parents b21296a98eb2
children 8f3347849abf
files ncPRO-QC.sh
diffstat 1 files changed, 469 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ncPRO-QC.sh	Thu Jan 28 07:43:32 2016 -0500
@@ -0,0 +1,469 @@
+#!/bin/bash
+
+# Copyleft ↄ⃝ 2012 Institut Curie
+# Author(s): Jocelyn Brayet, Laurene Syx, Chongjian Chen, Nicolas Servant(Institut Curie) 2012 - 2015
+# Contact: bioinfo.ncproseq@curie.fr
+# This software is distributed without any guarantee under the terms of the GNU General
+# Public License, either Version 2, June 1991 or Version 3, June 2007.
+
+while getopts "i:s:n:g:m:o:f:r:h:p:l:t:a:q:d:" optionName; do
+case "$optionName" in
+
+i) INPUT+="$OPTARG,";;
+s) SAMPLENAME+="$OPTARG,";;
+n) PROJECTNAME="$OPTARG";;
+g) GENOME="$OPTARG";;
+m) RMSK="$OPTARG";;
+o) BAM_OUT+="$OPTARG,";;
+f) RFAM="$OPTARG";;
+r) REPORT="$OPTARG";;
+h) HTML_REPORT="$OPTARG";;
+p) PDF_REPORT="$OPTARG";;
+l) LOG_FILE="$OPTARG";;
+t) INPUT_TYPE="$OPTARG";;
+a) ALIGNMENT="$OPTARG";;
+q) FASTQ_FORMAT+="$OPTARG,";;
+d) ROOT_DIR="$OPTARG";;
+
+esac
+done
+
+##### ncPRO-seq annotation - Galaxy #####
+
+GENOME_2=`echo $GENOME | cut -d"_" -f2`
+
+databasePath=$ROOT_DIR/database/files
+
+mkdir -p $databasePath/ncproseqAnnotation
+mkdir -p $databasePath/ncproseqAnnotation/annotation
+annotationPath=$databasePath/ncproseqAnnotation/annotation
+[ ! -d $annotationPath/$GENOME_2 ] && wget http://ncpro.curie.fr/ncproseq/install_dir/annotation/$GENOME.tar.gz -P $annotationPath -q && cd $annotationPath && tar -zxf $GENOME.tar.gz && rm -rf $GENOME.tar.gz
+
+#########
+
+
+sampleArray=(${INPUT//,/ })
+nameArray=(${SAMPLENAME//,/ })
+bamArray=(${BAM_OUT//,/ })
+
+if [[ $REPORT == "pdf" ]];then
+    OUTPUT_PATH_DIR=`dirname $LOG_FILE`
+    OUTPUT_PATH_NAME=`basename $LOG_FILE .dat`
+else
+    OUTPUT_PATH_DIR=`dirname $HTML_REPORT`
+    OUTPUT_PATH_NAME=`basename $HTML_REPORT .dat`
+fi
+
+OUTPUT_PATH="${OUTPUT_PATH_DIR}/${OUTPUT_PATH_NAME}_files"
+
+
+
+#ME
+mkdir -p $OUTPUT_PATH
+
+chmod 777 -R $OUTPUT_PATH
+
+#VERSION=`echo $OUTPUT_PATH | cut -d"/" -f3`
+#VERSION=`echo $VERSION | cut -d"_" -f2`
+
+#DEBUG_MODE
+
+DEBUG_MODE="on"
+DEBUG="/dev/null"
+
+if [[ $DEBUG_MODE == "on" ]];then
+
+	DEBUG="$OUTPUT_PATH/ncPRO-QC.debug"
+
+fi
+
+#Deploy ncPRO directories structure
+
+/usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-deploy -o $OUTPUT_PATH > $DEBUG
+
+echo "$INPUT" >> $DEBUG
+echo "$SAMPLENAME" >> $DEBUG
+echo "$PROJECTNAME" >> $DEBUG
+echo "$GENOME_2" >> $DEBUG
+echo "$RMSK" >> $DEBUG
+echo "$BAM_OUT" >> $DEBUG
+echo "$RFAM" >> $DEBUG
+echo "$REPORT" >> $DEBUG
+echo "$HTML_REPORT" >> $DEBUG
+echo "$PDF_REPORT" >> $DEBUG
+echo "$LOG_FILE" >> $DEBUG
+echo "$INPUT_TYPE" >> $DEBUG
+echo "$ALIGNMENT" >> $DEBUG
+        
+echo "$sampleArray" >> $DEBUG
+echo "$nameArray" >> $DEBUG
+echo "$bamArray" >> $DEBUG
+
+#Go to working directory
+
+cp -rf /usr/curie_ngs/ncproseq_v1.6.5/annotation/*.item $annotationPath
+cp -rf /usr/curie_ngs/ncproseq_v1.6.5/annotation/*_items.txt $annotationPath
+
+if [[ ! -L "$databasePath/ncproseqBowtieIndexes" && ! -d "$databasePath/ncproseqBowtieIndexes" ]]
+then
+    mkdir -p $databasePath/ncproseqBowtieIndexes
+fi
+
+cd /usr/curie_ngs
+rm -rf bowtie_indexes
+ln -s $databasePath/ncproseqBowtieIndexes bowtie_indexes
+
+cd $OUTPUT_PATH
+
+rm annotation
+
+ln -s $annotationPath annotation
+
+rm manuals
+
+#Create symbolic link to input 
+#********************************************************************************* NEW: for BAM files, check if reads in input are grouped or not and change cmd accordignly ***********
+if [[ $INPUT_TYPE == "fastq" ]];then
+
+    count=0
+    for i in ${sampleArray[*]}
+    do
+            ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.fastq
+            count=$(( $count + 1 ))
+        done
+fi
+
+if [[ $INPUT_TYPE == "bam" ]];then
+
+    count=0
+    for i in ${sampleArray[*]}
+        do
+            ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.bam
+            count=$(( $count + 1 ))
+            
+            #check if grouped
+            RG=`samtools view $i | awk --posix 'BEGIN {RG=1} { if ($1 !~ /^[0-9]{1,}_[0-9]{1,}$/) {RG=0 ; exit} } END { print RG}'`
+        done	
+fi
+
+#Edit config-ncrna.txt ##### A REFAIRE ####
+CONFIG_FILE=config-ncrna.txt
+
+sed -i "s:^BOWTIE_GENOME_REFERENCE =.*$:BOWTIE_GENOME_REFERENCE = $GENOME_2:g" $CONFIG_FILE
+sed -i "s:^ORGANISM.*$:ORGANISM = $GENOME_2:g" $CONFIG_FILE
+
+sed -i "/N_CPU/c\N_CPU = 6" $CONFIG_FILE  #****** Make sure this value matches universe.ini files
+sed -i "s/test_Curie/$PROJECTNAME/g" $CONFIG_FILE
+sed -i "s:^FASTQ_FORMAT =.*$:FASTQ_FORMAT = $FASTQ_FORMAT:g" $CONFIG_FILE
+
+
+if [[ -f "$annotationPath/$GENOME_2/cluster_pirna.gff" ]]
+then
+    ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/cluster_pirna.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
+else
+    if [[ -f "$annotationPath/$GENOME_2/pirna.gff" ]]
+    then
+        ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/pirna.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
+    else
+    ANNO_CATALOG="$annotationPath/$GENOME_2/precursor_miRNA.gff $annotationPath/$GENOME_2/rfam.gff $annotationPath/$GENOME_2/rmsk.gff $annotationPath/$GENOME_2/coding_gene.gff"
+    fi
+fi
+
+sed -i "s:^ANNO_CATALOG.*$:ANNO_CATALOG = $ANNO_CATALOG:g" $CONFIG_FILE
+
+#Build command line 
+
+if [[ $INPUT_TYPE == "fastq" ]];then
+
+	if [[ $ALIGNMENT == "True" ]]; then
+
+		
+		COMMAND_LINE="-c $CONFIG_FILE -s processRead -s mapGenome -s mapGenomeStat -s mapAnnOverview"
+
+		if [[ $RFAM == "True" ]];then
+
+			COMMAND_LINE="$COMMAND_LINE -s overviewRfam -s overviewRmsk"
+
+		fi
+
+	else
+
+		COMMAND_LINE="-c $CONFIG_FILE -s processRead"
+
+	fi
+
+fi
+
+
+
+#### NEW if BAM already grouped, omit [ -s processBam ] + put input.bam in /bowtie_results
+if [[ $INPUT_TYPE == "bam" ]];then
+
+	if [[ $RG = 0 ]]; then #if bam file is NOT grouped
+	
+		COMMAND_LINE="-c $CONFIG_FILE -s processBam -s mapGenomeStat -s mapAnnOverview"
+
+	else 
+	
+            count=0
+            for i in ${sampleArray[*]}
+            do
+                ln -s $i ${OUTPUT_PATH}/rawdata/${nameArray[count]}.bam
+                count=$(( $count + 1 ))
+            done	
+	    COMMAND_LINE="-c $CONFIG_FILE -s mapGenomeStat -s mapAnnOverview"
+	fi
+	
+		
+	if [[ $RFAM == "True" ]];then
+
+		COMMAND_LINE="$COMMAND_LINE -s overviewRfam -s overviewRmsk"
+
+	fi
+
+fi
+
+#************************* new
+
+##### Function to create HTML report in Galaxy ######
+
+function createHtmlReport
+{
+    
+    # galaxy part :
+
+    #Reformat html output
+
+    tr '>' '\n' < ${OUTPUT_PATH}/report.html | sed -ne "s@.*<img src='\([^']*\)'.*@\1@p" -e 's@.*<img src="\([^"]*\)".*@\1@p' > ${OUTPUT_PATH}/img_list.txt
+
+    NC_LOGO=`head -1 ${OUTPUT_PATH}/img_list.txt`
+
+    #HTTP_PATH=`echo $OUTPUT_PATH | sed "s/\/data\/kdi_${VERSION}/http:\/\/data-kdi-${VERSION}.curie.fr\/file/g"`
+
+    i=1
+
+    if [[ $INPUT_TYPE == "fastq" ]];then
+
+    	echo "<p align=center><img src=$NC_LOGO></p><p align=center><b><u>QUALITY CONTROL</u></b></p>" > ${HTML_REPORT}
+    fi
+
+    if [[ $INPUT_TYPE == "bam" ]];then
+
+	echo "<p align=center><img src=$NC_LOGO></p><p align=center><b><u>DATA MAPPING</u></b></p>" > $HTML_REPORT
+    fi
+
+    while read line 
+    do 
+
+	if [[ $line != "" ]];then
+
+		if [[ $i == "3" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<p align=center><b>Base Composition Information</b></p><table align=center><tr><td align=center><a href=$ahref><img src=$line></a></td>" >> $HTML_REPORT
+			fi
+
+			if [[ $INPUT_TYPE == "bam" ]];then
+				echo "<p align=center><b>Mapping proportions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+		fi
+
+		if [[ $i == "4" ]];then
+			
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<td align=center><a href=$ahref><img src=$line></td></a></td><table>" >> $HTML_REPORT
+			fi
+
+			if [[ $INPUT_TYPE == "bam" ]];then
+				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+		fi
+
+		if [[ $i == "5" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+			if [[ $INPUT_TYPE == "bam" ]];then
+				echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+		fi
+
+		if [[ $i == "6" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<hr width=500><p align=center><b>Quality Score</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+			if [[ $INPUT_TYPE == "bam" ]];then
+				echo "<hr size=20><p align=center style=font-size:25px;><b><u>ncRNAs OVERVIEW</u></b><p align=center><b>Reads Annotation Overview</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+		fi
+
+		if [[ $i == "7" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+			if [[ $INPUT_TYPE == "bam" ]];then
+
+				if [[ $RFAM == "True" ]];then
+					echo "<p align=center><b>Precursor miRNAs Annotation</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+				else
+					break
+				fi
+			fi
+		fi
+
+		if [[ $i == "8" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+			if [[ $INPUT_TYPE == "fastq" ]];then
+
+				if [[ $ALIGNMENT == "True" ]]; then
+
+					echo "<hr size=20><p align=center style=font-size:25px;><b><u>DATA MAPPING</u></b></p><p align=center><b>Mapping proportions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+				else
+					break
+				fi
+			fi
+			if [[ $INPUT_TYPE == "bam" ]];then
+			
+				echo "<p align=center><b>Annotation of ncRNAs from RFAM</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+		fi
+
+		if [[ $i == "9" ]];then
+		
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+			if [[ $INPUT_TYPE == "fastq" ]];then
+				echo "<hr width=500><p align=center><b>Distinct Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+			fi
+
+			if [[ $INPUT_TYPE == "bam" ]];then
+
+				echo "<p align=center><b>Annotation of Repetitive Regions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+				break
+			fi
+
+		fi
+
+		if [[ $i == "10" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			echo "<hr width=500><p align=center><b>Abundant Reads Length Distribution</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+
+		fi
+
+		if [[ $i == "11" ]];then
+
+			ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+			echo "<hr size=20><p align=center style=font-size:25px;><b><u>ncRNAs OVERVIEW</u></b><p align=center><b>Reads Annotation Overview</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+
+		fi
+
+		if [[ $RFAM == "True" ]];then
+
+			if [[ $i == "12" ]];then
+
+				ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+				echo "<p align=center><b>Precursor miRNAs Annotation</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+
+			fi
+
+			if [[ $i == "13" ]];then
+
+				ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+				echo "<p align=center><b>Annotation of ncRNAs from RFAM</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+
+			fi
+
+			if [[ $i == "14" ]];then
+
+				ahref=`sed "s/html\/thumb/pic/" <<< $line`
+
+				echo "<p align=center><b>Annotation of Repetitive Regions</b></p><p align=center><a href=$ahref><img src=$line></a></p>" >> $HTML_REPORT
+
+			fi
+
+		fi
+
+		i=$(( $i + 1 ))
+	
+	fi
+
+
+    done < ${OUTPUT_PATH}/img_list.txt
+
+    rm ${OUTPUT_PATH}/img_list.txt
+    
+    
+    
+}
+
+
+#Launch ncPRO analysis
+
+#FAIRE une boucle pour -s hrml_builder ou -s pdf_builder
+
+if [[ $REPORT == "all" ]];then
+
+    
+    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE -s html_builder -s pdf_builder>> $DEBUG
+
+    createHtmlReport
+
+    cp ${OUTPUT_PATH}/Analysis_report_ncPRO-seq.pdf $PDF_REPORT
+
+fi
+
+if [[ $REPORT == "pdf" ]];then
+
+
+    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE  -s pdf_builder>> $DEBUG
+
+    cp ${OUTPUT_PATH}/Analysis_report_ncPRO-seq.pdf $PDF_REPORT
+
+
+fi
+
+if [[ $REPORT == "html" ]];then
+
+
+    /usr/curie_ngs/ncproseq_v1.6.5/bin/ncPRO-seq $COMMAND_LINE -s html_builder>> $DEBUG
+
+    createHtmlReport
+
+fi
+
+#Galaxy output handling
+
+cp ${OUTPUT_PATH}/pipeline.log $LOG_FILE
+
+if [[ $ALIGNMENT == "True" ]];then
+    
+    count=0
+    for i in ${bamArray[*]}
+    do
+        cp ${OUTPUT_PATH}/bowtie_results/${nameArray[count]/_/.}_$GENOME_2.bam $i 
+        count=$(( $count + 1 ))
+    done
+fi