annotate multifasta/align.pl @ 1:2a5485758ae7 draft default tip

Uploaded
author elixir-it
date Tue, 16 Feb 2021 09:26:30 +0000
parents 3f6d4e4340e8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
1 use strict;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
2 use Cwd;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
3
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
4 my %arguments=
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
5
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
6 (
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
7 "--multi"=>"F", #F==FALSE, --multi <file> used to pass a multifasta input file
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
8 "--filelist"=>"F", #F==FALSE, --filelist <file> used to pass a file of file names.
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
9 "--suffix"=>"F", #F==FALSE, --suffix <value> specifies a file name suffix. All files with that suffix will be used
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
10 "--clean"=>"T", #BOLEAN: T: remove temporary directory of results. F: keep it. Defaule T
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
11 "--tmpdir"=>"align.tmp", #Name of the temporary directory. Defaults to align.tmp.
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
12 "--refile"=>"GCF_009858895.2_ASM985889v3_genomic.fna", #Name of the reference genome
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
13 #####OUTPUT file#############################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
14 "--out"=>"ALIGN_out.tsv" #file #OUTPUT #tabulare
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
15 );
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
16
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
17 my $dir=getcwd();#"/export/covid_wrapper/process_multifasta";#getcwd();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
18
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
19 ############################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
20 #Process input arguments and check if valid
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
21 check_arguments();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
22 check_input_arg_valid();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
23
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
24 ###########################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
25 # download the ref genome.
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
26 my $refile=$arguments{"--refile"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
27 unless (-e $refile)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
28 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
29 download_ref();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
30 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
31
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
32 ###########################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
33 #create temporary dir for storing intermediate files
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
34 check_exists_command('mkdir') or die "$0 requires mkdir to create a temporary directory\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
35 check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
36 my $TGdir=$arguments{"--tmpdir"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
37 if (-e $TGdir)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
38 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
39 warn ("Temporary directory $TGdir does already exist!. Please be aware that all the alignment files contained in that directory will be incorporated in the output of CorGAT!\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
40 }else{
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
41 system("mkdir $TGdir")==0||die ("can not create temporary directory $TGdir\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
42 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
43
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
44 ###########################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
45 # Compile the list of files to processed.
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
46 my @target_files=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
47 if ($arguments{"--filelist"} ne "F")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
48 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
49 # if filelist, read name. Copy files to tmpdir
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
50 my $lfile=$arguments{"--filelist"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
51 open(IN,$lfile);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
52 while(my $file=<IN>)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
53 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
54 chomp($file);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
55 push(@target_files,"$TGdir/$file");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
56 system("cp $file $TGdir")==0||die("could not copy file $file to $TGdir\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
57 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
58 }elsif ($arguments{"--suffix"} ne "F"){
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
59 # if suffix. use all files in the present folder with suffix. Copy files to tmpdir
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
60 my $suffix=$arguments{"--suffix"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
61 check_exists_command('cp') or die "$0 requires cp to copy files to temporary directory\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
62 system("cp *.$suffix $TGdir")==0||die "$! could not copy files with the .$suffix extension to the target dir $TGdir\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
63 @target_files=<$TGdir/*.$suffix>;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
64 }elsif ($arguments{"--multi"} ne "F"){
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
65 #if multifasta, split the files. Directly in the target folder. Compile arguments files.
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
66 my $multifile=$arguments{"--multi"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
67 @target_files=@{split_fasta($multifile,$TGdir)};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
68 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
69
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
70 ###########################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
71 # Align
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
72 align(\@target_files,$TGdir);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
73 my @alignments=<$TGdir/*_ref_qry.snps>;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
74 my $out_file=$arguments{"--out"};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
75
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
76 ############################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
77 # Consolidate the alignments and write output
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
78 consolidate(\@alignments,$out_file,$TGdir);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
79
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
80
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
81 ############################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
82 # check if temp_files need to be removed
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
83 if ($arguments{"--clean"} eq "T")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
84 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
85 print "--clean set to T=TRUE. I am going to delete the temporary file folder $TGdir\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
86 system ("rm -rf $TGdir")==0||warn("For some reason, the temporary directory $TGdir could not be removed. Please check\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
87 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
88
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
89
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
90 ######################################################################################################################################################################
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
91 sub check_arguments
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
92 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
93 my @arguments=@ARGV;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
94 for (my $i=0;$i<=$#ARGV;$i+=2)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
95 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
96 my $act=$ARGV[$i];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
97 my $val=$ARGV[$i+1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
98 if (exists $arguments{$act})
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
99 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
100 $arguments{$act}=$val;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
101 }else{
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
102 warn("$act: unknown argument\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
103 my @valid=keys %arguments;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
104 warn("Valid arguments are @valid\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
105 warn("All those moments will be lost in time, like tears in rain.\n Time to die!\n"); #HELP.txt
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
106 print_help();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
107 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
108 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
109 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
110
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
111
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
112 sub download_ref
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
113 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
114 print "Reference genome file, not in the current folder\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
115 print "CorGAT will try to Download the reference genome from Genbank\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
116 print "Please download this file manually, if this fails\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
117 check_exists_command('wget') or die "$0 requires wget to download the genome\nHit <<which wget>> on the terminal to check if you have wget\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
118 check_exists_command('gunzip') or die "$0 requires gunzip to unzip the genome\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
119 system("wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0||die("Could not retrieve the reference genome\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
120 system("gunzip GCF_009858895.2_ASM985889v3_genomic.fna.gz")==0 ||die("Could not unzip the reference genome");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
121
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
122 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
123
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
124 sub check_exists_command {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
125 my $check = `sh -c 'command -v $_[0]'`;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
126 return $check;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
127 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
128
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
129 sub check_input_arg_valid
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
130 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
131 if ($arguments{"--filelist"} eq "F" && $arguments{"--suffix"} eq "F" && $arguments{"--multi"} eq "F")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
132 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
133 print_help();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
134 die("No valid input mode provided. One of --filelist, --suffix or --multi needs to be provided. You set none!");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
135 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
136 unless ($arguments{"--clean"} eq "T" || $arguments{"--clean"} eq "F")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
137 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
138 print_help();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
139 die("invalid value for --clean, valid options are either T or F\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
140 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
141 if ($arguments{"--multi"} ne "F")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
142 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
143 if ($arguments{"--filelist"} ne "F" || $arguments{"--suffix"} ne "F")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
144 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
145 print_help();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
146 print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
147 die ("Please check and revise\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
148 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
149 }elsif ($arguments{"--filelist"} ne "F" && $arguments{"--suffix"} ne "F"){
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
150 print_help();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
151 print "Invalid options provided: --multi --filelist and --suffix are mutually exclusive\nOnly one can be T. You provided: multi->". $arguments{"--multi"} . "\tfilelist->". $arguments{"--filelist"}. "\tsuffix->". $arguments{"--suffix"}. "\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
152 die ("Please check and revise\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
153 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
154 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
155
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
156 sub split_fasta
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
157 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
158 my $multiF=$_[0];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
159 die("multifasta input file does not exist $multiF\n") unless -e $multiF;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
160 my $tgdir=$_[1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
161 my @list_files=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
162 open(IN,$multiF);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
163 while(<IN>)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
164 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
165 if ($_=~/^>(.*)/)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
166 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
167 my $id=$1;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
168 $id=(split(/\s+/,$id))[0];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
169 $id=~s/\-//g;
1
2a5485758ae7 Uploaded
elixir-it
parents: 0
diff changeset
170 if ($id=~/\|/)
2a5485758ae7 Uploaded
elixir-it
parents: 0
diff changeset
171 {
2a5485758ae7 Uploaded
elixir-it
parents: 0
diff changeset
172 $id=(split(/\|/,$id))[1];
2a5485758ae7 Uploaded
elixir-it
parents: 0
diff changeset
173 }
2a5485758ae7 Uploaded
elixir-it
parents: 0
diff changeset
174 $id=~s/\//\_/g;
0
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
175 open(OUT,">$tgdir/$id.fasta");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
176 print OUT ">$id\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
177 push(@list_files,"$tgdir/$id.fasta");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
178 }else{
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
179 chomp();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
180 print OUT;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
181 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
182 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
183 return(\@list_files);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
184 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
185
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
186 sub align
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
187 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
188 my @target_files=@{$_[0]};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
189 my $TGdir=$_[1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
190 die("Target directory does not exist\n") unless -e $TGdir;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
191 check_exists_command('nucmer') or die "$0 requires nucmer to align genomes. Please check that nucmer is installed and can be executed. Hit <<which nucmer>> on\n your terminal to understand if the program is correctly installed";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
192 check_exists_command('show-snps') or die "$0 requires show-snps from the mummer package to compute polymorphic sites. Please check that show-snps is installed and can be executed. Hit <<which show-snps>> on\n your terminal to understand if the program is correctly installed";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
193 foreach my $tg (@target_files)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
194 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
195 my $name=$tg;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
196 chomp($name);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
197 $name=~s/\.fasta//;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
198 $name=~s/\.fna//;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
199 $name=~s/\.fa//;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
200 if (-e "$TGdir/$name\_ref_qry.snps")
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
201 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
202 print "output file $name\_ref_qry.snps already in folder. Alignment skipped\n"
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
203 }else{
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
204 system("nucmer --prefix=ref_qry $refile $tg")==0||die("no nucmer alignment\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
205 system("show-snps -Clr ref_qry.delta > $name\_ref_qry.snps")==0||warn("no nucmer snps $tg\n");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
206 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
207 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
208 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
209
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
210 sub consolidate
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
211 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
212 my @files=@{$_[0]};
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
213 my $out_file=$_[1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
214 my $dir_prefix=$_[2];;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
215 my @genomes=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
216 my %dat_final=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
217 foreach my $f (@files)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
218 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
219 my $name=$f;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
220 $name=~s/_ref_qry.snps//;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
221 $name=~s/$dir_prefix\///;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
222 push(@genomes,$name);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
223 open(IN,$f);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
224 my %ldata=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
225 while(<IN>)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
226 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
227 next unless $_=~/NC_045512.2/;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
228 my ($pos,$b1,$b2)=(split(/\s+/,$_))[1,2,3];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
229 $ldata{$pos}=[$b1,$b2];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
230 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
231 my $prev_pos=0;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
232 my $pos_append=0;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
233 my $prev_ref="na";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
234 my $prev_alt="na";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
235 foreach my $pos (sort{$a<=>$b} keys %ldata)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
236 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
237 my $dist=$pos-$prev_pos;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
238 if ($dist>1)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
239 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
240 $pos_append=$prev_pos-length($prev_alt)+1;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
241 $dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 unless $prev_ref eq "na";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
242 $prev_ref=$ldata{$pos}[0];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
243 $prev_alt=$ldata{$pos}[1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
244 }else{
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
245 $prev_ref.=$ldata{$pos}[0];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
246 $prev_alt.=$ldata{$pos}[1];
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
247 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
248 $prev_pos=$pos;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
249 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
250 $pos_append=$prev_pos-length($prev_alt)+1;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
251 $dat_final{"$pos_append\_$prev_ref|$prev_alt"}{$name}=1 if $prev_ref ne "na";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
252 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
253 open(OUT,">$out_file");
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
254 my $TOT=$#genomes+1;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
255 my %AF=();
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
256 print OUT " @genomes\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
257 foreach my $pos (sort{$a<=>$b} keys %dat_final)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
258 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
259 my $line="$pos ";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
260 my $sum=0;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
261 foreach my $g (@genomes)
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
262 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
263 my $val=$dat_final{$pos}{$g} ? 1 : 0;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
264 $sum+=$val;
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
265 $line.="$val ";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
266 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
267 chop($line);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
268 print OUT "$line\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
269 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
270 close(OUT);
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
271 }
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
272
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
273
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
274 sub print_help
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
275 {
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
276 print "This utility can be used to 1) download the reference SARS-CoV-2 genome from Genbank and 2) align it with a collection\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
277 print "of SARS-CoV-2 genomes. And finally 3)Call/identify genomic variants. On any *nix based system the script should be\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
278 print "completely capable to download the reference genome by itself. Please download the genome yourself if this fails.\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
279 print "Input genomes, to be aligned to the reference, can be provided by means of 3 mutually exclusive (as in only one should be set)\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
280 print "parameters:\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
281 print "##INPUT PARAMETERS\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
282 print "--multi <<filename>>\tprovides a multifasta of genome sequences\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
283 print "--suffix <<text>>\tspecifies an extension. All the files with that extension in the current folder will be uses\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
284 print "--listfile <<filename>>\tspecifies a file containing a list of file names. All files need to be in the current folder\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
285 print "\nTo run the program you MUST provide one of the above options. Please notice that for --suffix and --listfile ,all\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
286 print "files need to be in the current folder.\n\nAdditional (not strictly required) options are as follows:\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
287 print "--tmpdir <<name>>\tname of a temporary directory. All intermediate files are saved there. Defaults to align.tmp\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
288 print "--clean <<T/F>>\t\tif T, tmpdir is delete. Otherwise it is not.\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
289 print "--refile <<file>>\tname of the reference genome file. Defaults to the name of the reference assembly of the SARS-CoV-2 genome\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
290 print " in Genbank. Do not change uless you have a very valid reason\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
291 print "\n##OUTPUT PARAMETERS\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
292 print "--out <<name>>\tName of the output file. Defaults to ALIGN_out.tsv\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
293 print "\n##EXAMPLES:\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
294 print "1# input is multi-fasta (apollo.fa):\nperl align.pl --multi apollo.fa\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
295 print "2# use all .fasta files from the current folder:\nperl align.pl --suffix fasta\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
296 print "3# use a file of file names (lfile):\n perl align.pl --filelist lfile\n\n";
3f6d4e4340e8 Uploaded
elixir-it
parents:
diff changeset
297 }