Mercurial > repos > ryotas > export2sam
comparison illumina_export2sam.pl @ 0:61f3089ddc74 default tip
commit
| author | ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp> |
|---|---|
| date | Tue, 30 Dec 2014 18:53:23 +0900 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:61f3089ddc74 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 # | |
| 3 # | |
| 4 # illumina_export2sam.pl converts GERALD export files to SAM format. | |
| 5 # | |
| 6 # | |
| 7 # | |
| 8 ########## License: | |
| 9 # | |
| 10 # The MIT License | |
| 11 # | |
| 12 # Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. | |
| 13 # Modified SAMtools work copyright (c) 2010 Illumina, Inc. | |
| 14 # | |
| 15 # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 16 # of this software and associated documentation files (the "Software"), to deal | |
| 17 # in the Software without restriction, including without limitation the rights | |
| 18 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 19 # copies of the Software, and to permit persons to whom the Software is | |
| 20 # furnished to do so, subject to the following conditions: | |
| 21 # | |
| 22 # The above copyright notice and this permission notice shall be included in | |
| 23 # all copies or substantial portions of the Software. | |
| 24 # | |
| 25 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 26 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 27 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 28 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 29 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 30 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 31 # THE SOFTWARE. | |
| 32 # | |
| 33 # | |
| 34 # | |
| 35 ########## Additional notice for CASAVA installation: | |
| 36 # | |
| 37 # This file [illumina_export2sam.pl] in the CASAVA installation has | |
| 38 # been copied from the file [export2sam.pl] in the SAMtools package | |
| 39 # and modified by Illumina as permitted under the MIT license that | |
| 40 # governs SAMtools. Illumina recommends the use of the modified | |
| 41 # version to convert data from the Illumina export format to the SAM | |
| 42 # file format. The terms of the MIT license specify your right to | |
| 43 # further modify and distribute the SAMtools code. For the avoidance | |
| 44 # of doubt, your rights with respect to copying, modifying, using and | |
| 45 # distributing CASAVA are more restricted than the rights in the MIT | |
| 46 # license, and are set forth in the Illumina Genome Analyzer Software | |
| 47 # License Agreement and the Illumina Source Code License Agreement. | |
| 48 # | |
| 49 ########## ChangeLog: | |
| 50 # | |
| 51 # Version: 2.3.1 (18MAR2011) | |
| 52 # | |
| 53 # - Restore file '-' as stdin input. | |
| 54 # | |
| 55 # Version: 2.3.0 (24JAN2011) | |
| 56 # | |
| 57 # - Add support for export reserved chromosome name "CONTROL", | |
| 58 # which is translated to optional field "XC:Z:CONTROL". | |
| 59 # - Check for ".gz" file extension on export files and open | |
| 60 # these as gzip pipes when the extension is found. | |
| 61 # | |
| 62 # Version: 2.2.0 (16NOV2010) | |
| 63 # | |
| 64 # - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y | |
| 65 # - For export records with reserved chromosome name identifiers | |
| 66 # "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" | |
| 67 # to the SAM record, so that these cases can be distinguished | |
| 68 # from other unmatched reads. | |
| 69 # | |
| 70 # Version: 2.1.0 (21SEP2010) | |
| 71 # | |
| 72 # - Additional export record error checking. | |
| 73 # - Convert export records with chromosome value of "RM" to unmapped | |
| 74 # SAM records. | |
| 75 # | |
| 76 # Version: 2.0.0 (15FEB2010) | |
| 77 # | |
| 78 # Script updated by Illumina in conjunction with CASAVA 1.7.0 | |
| 79 # release. | |
| 80 # | |
| 81 # Major changes are as follows: | |
| 82 # - The CIGAR string has been updated to include all gaps from | |
| 83 # ELANDv2 alignments. | |
| 84 # - The ELAND single read alignment score is always stored in the | |
| 85 # optional "SM" field and the ELAND paired read alignment score | |
| 86 # is stored in the optional "AS" field when it exists. | |
| 87 # - The MAPQ value is set to the higher of the two alignment scores, | |
| 88 # but no greater than 254, i.e. min(254,max(SM,AS)) | |
| 89 # - The SAM "proper pair" bit (0x0002) is now set for read pairs | |
| 90 # meeting ELAND's expected orientation and insert size criteria. | |
| 91 # - The default quality score translation is set for export files | |
| 92 # which contain Phread+64 quality values. An option, | |
| 93 # "--qlogodds", has been added to translate quality values from | |
| 94 # the Solexa+64 format used in export files prior to Pipeline | |
| 95 # 1.3 | |
| 96 # - The export match descriptor is now reverse-complemented when | |
| 97 # necessary such that it always corresponds to the forward | |
| 98 # strand of the reference, to be consistent with other | |
| 99 # information in the SAM record. It is now written to the | |
| 100 # optional 'XD' field (rather than 'MD') to acknowledge its | |
| 101 # minor differences from the samtools match descriptor (see | |
| 102 # additional detail below). | |
| 103 # - An option, "--nofilter", has been added to include reads which | |
| 104 # have failed primary analysis quality filtration. Such reads | |
| 105 # will have the corresponding SAM flag bit (0x0200) set. | |
| 106 # - Labels in the export 'contig' field are preserved by setting | |
| 107 # RNAME to "$export_chromosome/$export_contig" when the contig | |
| 108 # label exists. | |
| 109 # | |
| 110 # | |
| 111 # Contact: lh3 | |
| 112 # Version: 0.1.2 (03JAN2009) | |
| 113 # | |
| 114 # | |
| 115 # | |
| 116 ########## Known Conversion Limitations: | |
| 117 # | |
| 118 # - Export records for reads that map to a position < 1 (allowed | |
| 119 # in export format), are converted to unmapped reads in the SAM | |
| 120 # record. | |
| 121 # - Export records contain the reserved chromosome names: "NM", | |
| 122 # "QC","RM" and "CONTROL". "NM" indicates that the aligner could | |
| 123 # not map the read to the reference sequence set. "QC" means that | |
| 124 # the aligner did not attempt to map the read due to some | |
| 125 # technical limitation. "RM" means that the read mapped to a set | |
| 126 # of 'contaminant' sequences specified in GERALD's RNA-seq | |
| 127 # workflow. "CONTROL" means that the read is a control. All of | |
| 128 # these alignment types are collapsed to the single unmapped | |
| 129 # alignment state in the SAM record, but the optional SAM "XC" | |
| 130 # field is used to record the original reserved chromosome name of | |
| 131 # the read for all but the "NM" case. | |
| 132 # - The export match descriptor is slightly different than the | |
| 133 # samtools match descriptor. For this reason it is stored in the | |
| 134 # optional SAM field 'XD' (and not 'MD'). Note that the export | |
| 135 # match descriptor differs from the samtools version in two | |
| 136 # respects: (1) indels are explicitly closed with the '$' | |
| 137 # character and (2) insertions must be enumerated in the match | |
| 138 # descriptor. For example a 35-base read with a two-base insertion | |
| 139 # is described as: 20^2$14 | |
| 140 # | |
| 141 # | |
| 142 # | |
| 143 | |
| 144 my $version = "2.3.1"; | |
| 145 | |
| 146 use strict; | |
| 147 use warnings; | |
| 148 | |
| 149 use Getopt::Long; | |
| 150 use File::Spec; | |
| 151 use List::Util qw(min max); | |
| 152 | |
| 153 | |
| 154 use constant { | |
| 155 EXPORT_MACHINE => 0, | |
| 156 EXPORT_RUNNO => 1, | |
| 157 EXPORT_LANE => 2, | |
| 158 EXPORT_TILE => 3, | |
| 159 EXPORT_X => 4, | |
| 160 EXPORT_Y => 5, | |
| 161 EXPORT_INDEX => 6, | |
| 162 EXPORT_READNO => 7, | |
| 163 EXPORT_READ => 8, | |
| 164 EXPORT_QUAL => 9, | |
| 165 EXPORT_CHROM => 10, | |
| 166 EXPORT_CONTIG => 11, | |
| 167 EXPORT_POS => 12, | |
| 168 EXPORT_STRAND => 13, | |
| 169 EXPORT_MD => 14, | |
| 170 EXPORT_SEMAP => 15, | |
| 171 EXPORT_PEMAP => 16, | |
| 172 EXPORT_PASSFILT => 21, | |
| 173 EXPORT_SIZE => 22, | |
| 174 }; | |
| 175 | |
| 176 | |
| 177 use constant { | |
| 178 SAM_QNAME => 0, | |
| 179 SAM_FLAG => 1, | |
| 180 SAM_RNAME => 2, | |
| 181 SAM_POS => 3, | |
| 182 SAM_MAPQ => 4, | |
| 183 SAM_CIGAR => 5, | |
| 184 SAM_MRNM => 6, | |
| 185 SAM_MPOS => 7, | |
| 186 SAM_ISIZE => 8, | |
| 187 SAM_SEQ => 9, | |
| 188 SAM_QUAL => 10, | |
| 189 }; | |
| 190 | |
| 191 | |
| 192 # function prototypes for Richard's code | |
| 193 sub match_desc_to_cigar($); | |
| 194 sub match_desc_frag_length($); | |
| 195 sub reverse_compl_match_descriptor($); | |
| 196 sub write_header($;$;$); | |
| 197 | |
| 198 | |
| 199 &export2sam; | |
| 200 exit; | |
| 201 | |
| 202 | |
| 203 | |
| 204 | |
| 205 sub export2sam { | |
| 206 | |
| 207 my $cmdline = $0 . " " . join(" ",@ARGV); | |
| 208 my $arg_count = scalar @ARGV; | |
| 209 my $progname = "illumina_export2sam.pl"; | |
| 210 | |
| 211 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values | |
| 212 my $is_nofilter = 0; | |
| 213 my $read1file; | |
| 214 my $read2file; | |
| 215 my $print_version = 0; | |
| 216 my $help = 0; | |
| 217 | |
| 218 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, | |
| 219 "nofilter" => \$is_nofilter, | |
| 220 "read1=s" => \$read1file, | |
| 221 "read2=s" => \$read2file, | |
| 222 "version" => \$print_version, | |
| 223 "help" => \$help ); | |
| 224 | |
| 225 my $usage = <<END; | |
| 226 | |
| 227 $progname converts GERALD export files to SAM format. | |
| 228 | |
| 229 Usage: $progname --read1=FILENAME [ options ] | --version | --help | |
| 230 | |
| 231 --read1=FILENAME read1 export file or '-' for stdin (mandatory) | |
| 232 (file may be gzipped with ".gz" extension) | |
| 233 --read2=FILENAME read2 export file or '-' for stdin | |
| 234 (file may be gzipped with ".gz" extension) | |
| 235 --nofilter include reads that failed the basecaller | |
| 236 purity filter | |
| 237 --qlogodds assume export file(s) use logodds quality values | |
| 238 as reported by OLB (Pipeline) prior to v1.3 | |
| 239 (default: phred quality values) | |
| 240 | |
| 241 END | |
| 242 | |
| 243 my $version_msg = <<END; | |
| 244 | |
| 245 $progname version: $version | |
| 246 | |
| 247 END | |
| 248 | |
| 249 if((not $result) or $help or ($arg_count==0)) { | |
| 250 die($usage); | |
| 251 } | |
| 252 | |
| 253 if(@ARGV) { | |
| 254 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n"; | |
| 255 die($usage); | |
| 256 } | |
| 257 | |
| 258 if($print_version) { | |
| 259 die($version_msg); | |
| 260 } | |
| 261 | |
| 262 if(not defined($read1file)) { | |
| 263 print STDERR "\nERROR: read1 export file must be specified\n\n"; | |
| 264 die($usage); | |
| 265 } | |
| 266 | |
| 267 unless((-f $read1file) or ($read1file eq '-')) { | |
| 268 die("\nERROR: Can't find read1 export file: '$read1file'\n\n"); | |
| 269 } | |
| 270 | |
| 271 if (defined $read2file) { | |
| 272 unless((-f $read2file) or ($read2file eq '-')) { | |
| 273 die("\nERROR: Can't find read2 export file: '$read2file'\n\n"); | |
| 274 } | |
| 275 if($read1file eq $read2file) { | |
| 276 die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n"); | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 my ($fh1, $fh2, $is_paired); | |
| 281 | |
| 282 my $read1cmd="$read1file"; | |
| 283 $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/); | |
| 284 open($fh1, $read1cmd) | |
| 285 or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n"); | |
| 286 $is_paired = defined $read2file; | |
| 287 if ($is_paired) { | |
| 288 my $read2cmd="$read2file"; | |
| 289 $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/); | |
| 290 open($fh2, $read2cmd) | |
| 291 or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n"); | |
| 292 } | |
| 293 # quality value conversion table | |
| 294 my @conv_table; | |
| 295 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): | |
| 296 for (-64..64) { | |
| 297 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); | |
| 298 } | |
| 299 } else { # convert from phred+64 quality values (pipeline v1.3+): | |
| 300 for (-64..-1) { | |
| 301 $conv_table[$_+64] = undef; | |
| 302 } | |
| 303 for (0..64) { | |
| 304 $conv_table[$_+64] = int(33 + $_); | |
| 305 } | |
| 306 } | |
| 307 # write the header | |
| 308 print write_header( $progname, $version, $cmdline ); | |
| 309 # core loop | |
| 310 my $export_line_count = 0; | |
| 311 while (<$fh1>) { | |
| 312 $export_line_count++; | |
| 313 my (@s1, @s2); | |
| 314 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); | |
| 315 if ($is_paired) { | |
| 316 my $read2line = <$fh2>; | |
| 317 if(not $read2line){ | |
| 318 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); | |
| 319 } | |
| 320 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); | |
| 321 | |
| 322 if (@s1 && @s2) { # then set mate coordinate | |
| 323 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ | |
| 324 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); | |
| 325 } | |
| 326 | |
| 327 my $isize = 0; | |
| 328 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize | |
| 329 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; | |
| 330 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; | |
| 331 $isize = $x2 - $x1; | |
| 332 } | |
| 333 | |
| 334 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ | |
| 335 my ($sa,$sb,$is) = @{$_}; | |
| 336 if ($sb->[SAM_RNAME] ne '*') { | |
| 337 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; | |
| 338 $sa->[SAM_MPOS] = $sb->[SAM_POS]; | |
| 339 $sa->[SAM_ISIZE] = $is; | |
| 340 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); | |
| 341 } else { | |
| 342 $sa->[SAM_FLAG] |= 0x8; | |
| 343 } | |
| 344 } | |
| 345 } | |
| 346 } | |
| 347 print join("\t", @s1), "\n" if (@s1); | |
| 348 print join("\t", @s2), "\n" if (@s2 && $is_paired); | |
| 349 } | |
| 350 close($fh1); | |
| 351 if($is_paired) { | |
| 352 while(my $read2line = <$fh2>){ | |
| 353 $export_line_count++; | |
| 354 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); | |
| 355 } | |
| 356 close($fh2); | |
| 357 } | |
| 358 } | |
| 359 | |
| 360 sub export2sam_aux { | |
| 361 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; | |
| 362 chomp($line); | |
| 363 my @t = split(/\t/, $line, -1); | |
| 364 my $isPassFilt = 1; | |
| 365 # Sorted files do not have passfilter column, so the number of columns can be total-1. | |
| 366 if(scalar(@t) < (EXPORT_SIZE - 1)) { | |
| 367 my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n"; | |
| 368 $msg.="\t...erroneous export record:\n" . $line . "\n\n"; | |
| 369 die($msg); | |
| 370 } | |
| 371 elsif(scalar(@t) == EXPORT_SIZE) { | |
| 372 $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); | |
| 373 return if(not ($isPassFilt or $is_nofilter)); | |
| 374 } | |
| 375 @$s = (); | |
| 376 # read name | |
| 377 my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); | |
| 378 $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), | |
| 379 int($t[EXPORT_X]), int($t[EXPORT_Y])); | |
| 380 # initial flag (will be updated later) | |
| 381 $s->[SAM_FLAG] = 0; | |
| 382 if($is_paired) { | |
| 383 if($t[EXPORT_READNO] != $read_no){ | |
| 384 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); | |
| 385 } | |
| 386 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); | |
| 387 } | |
| 388 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); | |
| 389 | |
| 390 # read & quality | |
| 391 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); | |
| 392 if ($is_export_rev) { # then reverse the sequence and quality | |
| 393 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); | |
| 394 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; | |
| 395 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); | |
| 396 } else { | |
| 397 $s->[SAM_SEQ] = $t[EXPORT_READ]; | |
| 398 $s->[SAM_QUAL] = $t[EXPORT_QUAL]; | |
| 399 } | |
| 400 my @convqual = (); | |
| 401 foreach (unpack('C*', $s->[SAM_QUAL])){ | |
| 402 my $val=$ct->[$_]; | |
| 403 if(not defined $val){ | |
| 404 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; | |
| 405 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } | |
| 406 die($msg . "\n"); | |
| 407 } | |
| 408 push @convqual,$val; | |
| 409 } | |
| 410 | |
| 411 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding | |
| 412 | |
| 413 | |
| 414 # coor | |
| 415 my $has_coor = 0; | |
| 416 $s->[SAM_RNAME] = "*"; | |
| 417 if (($t[EXPORT_CHROM] eq 'NM') or | |
| 418 ($t[EXPORT_CHROM] eq 'QC') or | |
| 419 ($t[EXPORT_CHROM] eq 'RM') or | |
| 420 ($t[EXPORT_CHROM] eq 'CONTROL')) { | |
| 421 $s->[SAM_FLAG] |= 0x4; # unmapped | |
| 422 push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); | |
| 423 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { | |
| 424 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? | |
| 425 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") | |
| 426 } elsif ($t[EXPORT_POS] < 1) { | |
| 427 $s->[SAM_FLAG] |= 0x4; # unmapped | |
| 428 } else { | |
| 429 $s->[SAM_RNAME] = $t[EXPORT_CHROM]; | |
| 430 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); | |
| 431 $has_coor = 1; | |
| 432 } | |
| 433 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; | |
| 434 | |
| 435 # print STDERR "t[14] = " . $t[14] . "\n"; | |
| 436 my $matchDesc = ''; | |
| 437 $s->[SAM_CIGAR] = "*"; | |
| 438 if($has_coor){ | |
| 439 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; | |
| 440 | |
| 441 if($matchDesc =~ /\^/){ | |
| 442 # construct CIGAR string using Richard's function | |
| 443 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing | |
| 444 } else { | |
| 445 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; | |
| 446 } | |
| 447 } | |
| 448 | |
| 449 # print STDERR "cigar_string = $cigar_string\n"; | |
| 450 | |
| 451 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); | |
| 452 if($has_coor){ | |
| 453 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; | |
| 454 my $pemap = 0; | |
| 455 if($is_paired) { | |
| 456 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; | |
| 457 | |
| 458 # set `proper pair' bit if non-blank, non-zero PE alignment score: | |
| 459 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); | |
| 460 } | |
| 461 $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); | |
| 462 } else { | |
| 463 $s->[SAM_MAPQ] = 0; | |
| 464 } | |
| 465 # mate coordinate | |
| 466 $s->[SAM_MRNM] = '*'; | |
| 467 $s->[SAM_MPOS] = 0; | |
| 468 $s->[SAM_ISIZE] = 0; | |
| 469 # aux | |
| 470 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); | |
| 471 if($has_coor){ | |
| 472 # The export match descriptor differs slightly from the samtools match descriptor. | |
| 473 # In order for the converted SAM files to be as compliant as possible, | |
| 474 # we put the export match descriptor in optional field 'XD' rather than 'MD': | |
| 475 push(@$s, "XD:Z:$matchDesc"); | |
| 476 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); | |
| 477 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); | |
| 478 } | |
| 479 } | |
| 480 | |
| 481 | |
| 482 | |
| 483 # | |
| 484 # the following code is taken from Richard Shaw's sorted2sam.pl file | |
| 485 # | |
| 486 sub reverse_compl_match_descriptor($) | |
| 487 { | |
| 488 # print "\nREVERSING THE MATCH DESCRIPTOR!\n"; | |
| 489 my ($match_desc) = @_; | |
| 490 my $rev_compl_match_desc = reverse($match_desc); | |
| 491 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; | |
| 492 | |
| 493 # Unreverse the digits of numbers. | |
| 494 $rev_compl_match_desc = join('', | |
| 495 map {($_ =~ /\d+/) | |
| 496 ? join('', reverse(split('', $_))) | |
| 497 : $_} split(/(\d+)/, | |
| 498 $rev_compl_match_desc)); | |
| 499 | |
| 500 return $rev_compl_match_desc; | |
| 501 } | |
| 502 | |
| 503 | |
| 504 | |
| 505 sub match_desc_to_cigar($) | |
| 506 { | |
| 507 my ($match_desc) = @_; | |
| 508 | |
| 509 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); | |
| 510 my $cigar_str = ''; | |
| 511 my $cigar_del_ch = 'D'; | |
| 512 my $cigar_ins_ch = 'I'; | |
| 513 my $cigar_match_ch = 'M'; | |
| 514 | |
| 515 foreach my $match_desc_part (@match_desc_parts) { | |
| 516 next if (!$match_desc_part); | |
| 517 | |
| 518 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { | |
| 519 # Deletion | |
| 520 $cigar_str .= (length($1) . $cigar_del_ch); | |
| 521 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { | |
| 522 # Insertion | |
| 523 $cigar_str .= ($1 . $cigar_ins_ch); | |
| 524 } else { | |
| 525 $cigar_str .= (match_desc_frag_length($match_desc_part) | |
| 526 . $cigar_match_ch); | |
| 527 } | |
| 528 } | |
| 529 | |
| 530 return $cigar_str; | |
| 531 } | |
| 532 | |
| 533 | |
| 534 #------------------------------------------------------------------------------ | |
| 535 | |
| 536 sub match_desc_frag_length($) | |
| 537 { | |
| 538 my ($match_desc_str) = @_; | |
| 539 my $len = 0; | |
| 540 | |
| 541 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); | |
| 542 | |
| 543 foreach my $match_desc_field (@match_desc_fields) { | |
| 544 next if ($match_desc_field eq ''); | |
| 545 | |
| 546 $len += (($match_desc_field =~ /(\d+)/) | |
| 547 ? $1 : length($match_desc_field)); | |
| 548 } | |
| 549 | |
| 550 return $len; | |
| 551 } | |
| 552 | |
| 553 | |
| 554 # argument holds the command line | |
| 555 sub write_header($;$;$) | |
| 556 { | |
| 557 my ($progname,$version,$cl) = @_; | |
| 558 my $complete_header = ""; | |
| 559 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; | |
| 560 | |
| 561 return $complete_header; | |
| 562 } |
