changeset 0:3f6d4e4340e8 draft

Uploaded
author elixir-it
date Fri, 30 Oct 2020 13:32:35 +0000
parents
children 2a5485758ae7
files multifasta/align.pl multifasta/multi_fasta_process.xml
diffstat 2 files changed, 335 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multifasta/align.pl	Fri Oct 30 13:32:35 2020 +0000
@@ -0,0 +1,292 @@
+use strict;
+use Cwd;
+
+my %arguments=
+
+(
+"--multi"=>"F",         	#F==FALSE, --multi <file> used to pass a multifasta input file
+"--filelist"=>"F",		#F==FALSE, --filelist <file> used to pass a file of file names. 
+"--suffix"=>"F",        	#F==FALSE, --suffix <value> specifies a file name suffix. All files with that suffix will be used
+"--clean"=>"T",			#BOLEAN: T: remove temporary directory of results. F: keep it. Defaule T
+"--tmpdir"=>"align.tmp", 	#Name of the temporary directory. Defaults to align.tmp. 
+"--refile"=>"GCF_009858895.2_ASM985889v3_genomic.fna", #Name of the reference genome 
+#####OUTPUT file#############################################
+"--out"=>"ALIGN_out.tsv" #file #OUTPUT #tabulare
+);
+
+my $dir=getcwd();#"/export/covid_wrapper/process_multifasta";#getcwd();
+
+############################################################
+#Process input arguments and check if valid
+check_arguments();
+check_input_arg_valid();
+
+###########################################################
+# download the ref genome.
+my $refile=$arguments{"--refile"};
+unless (-e $refile)
+{
+	download_ref();
+}
+
+###########################################################
+#create temporary dir for storing intermediate files
+check_exists_command('mkdir') or die "$0 requires mkdir to create a temporary directory\n";	
+check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n";
+my $TGdir=$arguments{"--tmpdir"};
+if (-e $TGdir)
+{
+	warn ("Temporary directory $TGdir does already exist!. Please be aware that all the alignment files contained in that directory will be incorporated in the output of CorGAT!\n");
+}else{
+	system("mkdir $TGdir")==0||die ("can not create temporary directory $TGdir\n");
+}
+
+###########################################################
+# Compile the list of files to processed.
+my @target_files=();
+if ($arguments{"--filelist"} ne "F")
+{
+# if filelist, read name. Copy files to tmpdir
+	my $lfile=$arguments{"--filelist"};
+	open(IN,$lfile);
+	while(my $file=<IN>)
+	{
+		chomp($file);
+		push(@target_files,"$TGdir/$file");
+		system("cp $file $TGdir")==0||die("could not copy file $file to $TGdir\n");
+	}
+}elsif ($arguments{"--suffix"} ne "F"){
+# if suffix. use all files in the present folder with suffix. Copy files to tmpdir
+	my $suffix=$arguments{"--suffix"};
+	check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n";
+	system("cp *.$suffix $TGdir")==0||die "$! could not copy files with the .$suffix extension to the target dir $TGdir\n";
+	@target_files=<$TGdir/*.$suffix>;
+}elsif ($arguments{"--multi"} ne "F"){
+#if multifasta, split the files. Directly in the target folder. Compile arguments files.
+	my $multifile=$arguments{"--multi"};
+	@target_files=@{split_fasta($multifile,$TGdir)};
+}
+
+###########################################################
+# Align
+align(\@target_files,$TGdir);
+my @alignments=<$TGdir/*_ref_qry.snps>;
+my $out_file=$arguments{"--out"};
+
+############################################################
+# Consolidate the alignments and write output
+consolidate(\@alignments,$out_file,$TGdir);
+
+
+############################################################
+# check if temp_files need to be removed
+if ($arguments{"--clean"} eq "T")
+{
+	print "--clean set to T=TRUE. I am going to delete the temporary file folder $TGdir\n";
+	system ("rm -rf $TGdir")==0||warn("For some reason, the temporary directory $TGdir could not be removed. Please check\n");
+}
+
+
+######################################################################################################################################################################
+sub check_arguments
+{
+	my @arguments=@ARGV;
+	for (my $i=0;$i<=$#ARGV;$i+=2)
+	{
+        	my $act=$ARGV[$i];
+        	my $val=$ARGV[$i+1];
+        	if (exists $arguments{$act})
+        	{
+                	$arguments{$act}=$val;
+        	}else{
+                	warn("$act: unknown argument\n");
+                	my @valid=keys %arguments;
+                	warn("Valid arguments are @valid\n");
+                	warn("All those moments will be lost in time, like tears in rain.\n Time to die!\n"); #HELP.txt
+        		print_help();
+		}	
+	}
+}
+
+
+sub download_ref
+{
+	print "Reference genome file, not in the current folder\n";
+        print "CorGAT will try to Download the reference genome from Genbank\n";
+        print "Please download this file manually, if this fails\n";
+        check_exists_command('wget') or die "$0 requires wget to download the genome\nHit <<which wget>> on the terminal to check if you have wget\n";
+        check_exists_command('gunzip') or die "$0 requires gunzip to unzip the genome\n";
+        system("wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0||die("Could not retrieve the reference genome\n");
+        system("gunzip GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0 ||die("Could not unzip the reference genome");
+
+}
+
+sub check_exists_command {
+    my $check = `sh -c 'command -v $_[0]'`;
+    return $check;
+}
+
+sub check_input_arg_valid
+{
+	if ($arguments{"--filelist"} eq "F" && $arguments{"--suffix"} eq "F" && $arguments{"--multi"} eq "F")
+	{
+		print_help();
+		die("No valid input mode provided. One of --filelist, --suffix or --multi needs to be provided. You set none!");
+	}
+	unless ($arguments{"--clean"} eq "T" || $arguments{"--clean"} eq "F")
+	{
+		print_help();
+		die("invalid value for --clean, valid options are either T or F\n");
+	}
+	if ($arguments{"--multi"} ne "F")
+	{
+		if ($arguments{"--filelist"} ne "F" || $arguments{"--suffix"} ne "F")
+		{
+			print_help();
+			print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n";
+		die ("Please check and revise\n");
+		}
+	}elsif ($arguments{"--filelist"} ne "F" && $arguments{"--suffix"} ne "F"){
+		print_help();
+		print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n"; 
+		die ("Please check and revise\n");
+	}
+}
+
+sub split_fasta
+{
+	my $multiF=$_[0];
+	die("multifasta input file does not exist $multiF\n") unless -e $multiF;
+	my $tgdir=$_[1];
+	my @list_files=();
+	open(IN,$multiF);
+	while(<IN>)
+	{
+		if ($_=~/^>(.*)/)
+		{
+			my $id=$1;
+			$id=(split(/\s+/,$id))[0];
+			$id=~s/\-//g;
+			open(OUT,">$tgdir/$id.fasta");
+			print OUT ">$id\n";
+			push(@list_files,"$tgdir/$id.fasta");
+		}else{
+			chomp();
+			print OUT;
+		}
+	}
+	return(\@list_files);
+}
+
+sub align
+{
+	my @target_files=@{$_[0]};
+	my $TGdir=$_[1];
+	die("Target directory does not exist\n") unless -e $TGdir;
+	check_exists_command('nucmer') or die "$0 requires nucmer to align genomes. Please check that nucmer is installed and can be executed. Hit <<which nucmer>> on\n your terminal to understand if the program is correctly installed";
+	check_exists_command('show-snps') or die "$0 requires show-snps from the mummer package to compute polymorphic sites. Please check that show-snps is installed and can be executed. Hit <<which show-snps>> on\n your terminal to understand if the program is correctly installed";
+	foreach my $tg (@target_files)
+	{
+        	my $name=$tg;
+		chomp($name);
+        	$name=~s/\.fasta//;
+        	$name=~s/\.fna//;
+        	$name=~s/\.fa//;
+		if (-e "$TGdir/$name\_ref_qry.snps")
+        	{
+                	print "output file $name\_ref_qry.snps already in folder. Alignment skipped\n"
+        	}else{
+                	system("nucmer --prefix=ref_qry $refile $tg")==0||die("no nucmer alignment\n");
+                	system("show-snps -Clr ref_qry.delta > $name\_ref_qry.snps")==0||warn("no nucmer snps $tg\n");
+        	}
+	}
+}
+
+sub consolidate
+{
+	my @files=@{$_[0]};
+	my $out_file=$_[1];
+	my $dir_prefix=$_[2];;	
+	my @genomes=();
+	my %dat_final=();
+	foreach my $f (@files)
+	{
+        	my $name=$f;
+        	$name=~s/_ref_qry.snps//;
+		$name=~s/$dir_prefix\///;
+        	push(@genomes,$name);
+        	open(IN,$f);
+        	my %ldata=();
+        	while(<IN>)
+        	{
+                	next unless $_=~/NC_045512.2/;
+                	my ($pos,$b1,$b2)=(split(/\s+/,$_))[1,2,3];
+                	$ldata{$pos}=[$b1,$b2];
+        	}
+        	my $prev_pos=0;
+		my $pos_append=0;
+        	my $prev_ref="na";
+        	my $prev_alt="na";
+        	foreach my $pos (sort{$a<=>$b} keys %ldata)
+        	{
+                	my $dist=$pos-$prev_pos;
+                	if ($dist>1)
+                	{
+                        	$pos_append=$prev_pos-length($prev_alt)+1;
+                        	$dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 unless $prev_ref eq "na";
+                        	$prev_ref=$ldata{$pos}[0];
+                        	$prev_alt=$ldata{$pos}[1];
+                	}else{
+                        	$prev_ref.=$ldata{$pos}[0];
+                        	$prev_alt.=$ldata{$pos}[1];
+                	}
+                	$prev_pos=$pos;
+        	}
+        	$pos_append=$prev_pos-length($prev_alt)+1;
+        	$dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 if $prev_ref ne "na";
+	}
+	open(OUT,">$out_file");
+	my $TOT=$#genomes+1;
+	my %AF=();
+	print OUT " @genomes\n";
+	foreach my $pos (sort{$a<=>$b} keys %dat_final)
+	{
+        	my $line="$pos ";
+        	my $sum=0;
+        	foreach my $g (@genomes)
+        	{
+                	my $val=$dat_final{$pos}{$g} ? 1 : 0;
+                	$sum+=$val;
+                	$line.="$val ";
+        	}
+        	chop($line);
+        	print OUT "$line\n";
+	}
+	close(OUT);
+}
+
+
+sub print_help
+{
+	print "This utility can be used to 1) download the reference SARS-CoV-2 genome from Genbank and 2) align it with a collection\n"; 
+	print "of SARS-CoV-2 genomes. And finally 3)Call/identify genomic variants.  On any *nix based system the script should be\n"; 
+	print "completely capable to download the reference genome by itself.  Please download the genome yourself if this fails.\n"; 
+	print "Input genomes, to be aligned to the reference, can be provided by means of 3 mutually exclusive (as in only one should be set)\n";
+	print "parameters:\n";
+	print "##INPUT PARAMETERS\n\n";
+	print "--multi <<filename>>\tprovides a multifasta of genome sequences\n";
+	print "--suffix <<text>>\tspecifies an extension. All the files with that extension in the current folder will be uses\n";
+	print "--listfile <<filename>>\tspecifies a file containing a list of file names. All files need to be in the current folder\n";
+	print "\nTo run the program you MUST provide one of the above options. Please notice that for --suffix and --listfile ,all\n";
+	print "files need to be in the current folder.\n\nAdditional (not strictly required) options are as follows:\n\n";
+	print "--tmpdir <<name>>\tname of a temporary directory. All intermediate files are saved there. Defaults to align.tmp\n";
+	print "--clean <<T/F>>\t\tif T, tmpdir is delete. Otherwise it is not.\n";
+	print "--refile <<file>>\tname of the reference genome file. Defaults to the name of the reference assembly of the SARS-CoV-2 genome\n";
+	print "		        in Genbank. Do not change uless you have a very valid reason\n";
+	print "\n##OUTPUT PARAMETERS\n\n";
+	print "--out <<name>>\tName of the output file. Defaults to ALIGN_out.tsv\n"; 
+	print "\n##EXAMPLES:\n\n";
+	print "1# input is multi-fasta (apollo.fa):\nperl align.pl --multi apollo.fa\n\n";
+	print "2# use all .fasta files from the current folder:\nperl align.pl --suffix fasta\n\n";
+	print "3# use a file of file names (lfile):\n perl align.pl --filelist lfile\n\n";
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multifasta/multi_fasta_process.xml	Fri Oct 30 13:32:35 2020 +0000
@@ -0,0 +1,43 @@
+<tool id="multiFasCorGAT" name="multiFC" version="1">
+  <description> Process multi-fasta files to derive a phenetic matrix of genetic variants.</description>
+  <requirements>
+    <requirement type="package" >perl</requirement>
+    <requirement type="package" >wget</requirement>
+    <requirement type="package" >mummer</requirement>
+  </requirements>
+  <command> <![CDATA[
+        ln -s $__tool_directory__/align.pl . 2>>$log &&
+	perl align.pl --multi $infile --out $ofile 2>>$log
+    ]]>
+  </command>
+<inputs>
+	<param format="fasta" name="infile" type="data" label="multifasta" help="Multifasta file of SARS-CoV-2 genomes"/>
+</inputs>
+
+<outputs>
+	<data format="txt" name="log" label="${tool.name} on ${on_string}: log file "/>
+	<data format="tsv" name="ofile" label="${tool.name} on ${on_string}: tsv "/>
+</outputs>
+
+<stdio>
+</stdio>
+
+<tests>
+	<test>
+	</test>
+</tests>
+
+<help>
+	        **What it does?**
+
+        This tool is used to align SARS-CoV-2 genes, in multifasta format. Genomes will be aligned to the reference SARS-CoV-2 genome using nucmer.
+	The output will consist in a single tabular file with as may columns as the number of genomes provided in input. And as many rows as the 
+	number of variants observed in the genomes. For every genome assembly and variant a simple binary code 1= present, 0=absent will be used to 
+	indicate whether that genome carries a specific variant. This table should be provided to the FunAnn tool to obtain the functional annotation 
+	of the variants.
+
+</help>
+  <citations>
+  </citations>
+</tool>
+