# HG changeset patch
# User pitagora
# Date 1410504154 14400
# Node ID 4ebcbb69142627778cf8a27a11d25a84c75d0dca
Uploaded
diff -r 000000000000 -r 4ebcbb691426 export2sam/export2sam.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export2sam/export2sam.sh Fri Sep 12 02:42:34 2014 -0400
@@ -0,0 +1,42 @@
+#!/bin/sh
+# SET TOOL DIRECTORY
+
+CMDNAME=`basename $0`
+
+while getopts a:b:o:gr OPT
+do
+ case $OPT in
+ "a" ) FLG_A="TRUE" ; VALUE_A="$OPTARG" ;;
+ "b" ) FLG_B="TRUE" ; VALUE_B="$OPTARG" ;;
+ "o" ) FLG_O="TRUE" ; VALUE_O="$OPTARG" ;;
+ "g" ) FLG_G="TRUE" ;;
+ "r" ) FLG_R="TRUE" ;;
+ * ) echo "Usage: $CMDNAME [-a VALUE] [-b] [-o VALUE] [-gr]" 1>&2
+ exit 1 ;;
+ esac
+done
+
+# SINGLE END OR PAIR END
+if [ ! $FLG_B ] ; then
+ ARGS="--read1=$VALUE_A"
+else
+ ARGS="--read1=$VALUE_A --read2=$VALUE_B"
+fi
+
+# GZIPPED OR NOT
+if [ $FLG_G ] ; then
+ CMD1="./illumina_export2sam_gz.pl $ARGS"
+else
+ CMD1="./illumina_export2sam.pl $ARGS"
+fi
+
+# MODIFY ACCORDING TO THE REFERENCE
+if [ $FLG_R ] ; then
+ CMD2='$tmp=$_;$tmp=~s/chr_fragments\.fa\///g;$tmp=~s/(chr\w+)\.fa/$1/g;print $tmp;'
+ CMD="$CMD1 | perl -ne '$CMD2'"
+else
+ CMD=$CMD1
+fi
+
+echo $CMD
+eval $CMD > $VALUE_O
diff -r 000000000000 -r 4ebcbb691426 export2sam/export2sam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export2sam/export2sam.xml Fri Sep 12 02:42:34 2014 -0400
@@ -0,0 +1,33 @@
+
+
+
+
+ export2sam.sh
+ -a $read1
+ #if $pairCondition.pairSelect == "yes"
+ -b $read2
+ #end if
+ -o $output
+ $gunzip
+ $remove
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If thou thyself canst do it, attend no other's help or hand.
+
diff -r 000000000000 -r 4ebcbb691426 export2sam/illumina_export2sam.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export2sam/illumina_export2sam.pl Fri Sep 12 02:42:34 2014 -0400
@@ -0,0 +1,562 @@
+#!/usr/bin/env perl
+#
+#
+# illumina_export2sam.pl converts GERALD export files to SAM format.
+#
+#
+#
+########## License:
+#
+# The MIT License
+#
+# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+# Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#
+#
+########## Additional notice for CASAVA installation:
+#
+# This file [illumina_export2sam.pl] in the CASAVA installation has
+# been copied from the file [export2sam.pl] in the SAMtools package
+# and modified by Illumina as permitted under the MIT license that
+# governs SAMtools. Illumina recommends the use of the modified
+# version to convert data from the Illumina export format to the SAM
+# file format. The terms of the MIT license specify your right to
+# further modify and distribute the SAMtools code. For the avoidance
+# of doubt, your rights with respect to copying, modifying, using and
+# distributing CASAVA are more restricted than the rights in the MIT
+# license, and are set forth in the Illumina Genome Analyzer Software
+# License Agreement and the Illumina Source Code License Agreement.
+#
+########## ChangeLog:
+#
+# Version: 2.3.1 (18MAR2011)
+#
+# - Restore file '-' as stdin input.
+#
+# Version: 2.3.0 (24JAN2011)
+#
+# - Add support for export reserved chromosome name "CONTROL",
+# which is translated to optional field "XC:Z:CONTROL".
+# - Check for ".gz" file extension on export files and open
+# these as gzip pipes when the extension is found.
+#
+# Version: 2.2.0 (16NOV2010)
+#
+# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
+# - For export records with reserved chromosome name identifiers
+# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
+# to the SAM record, so that these cases can be distinguished
+# from other unmatched reads.
+#
+# Version: 2.1.0 (21SEP2010)
+#
+# - Additional export record error checking.
+# - Convert export records with chromosome value of "RM" to unmapped
+# SAM records.
+#
+# Version: 2.0.0 (15FEB2010)
+#
+# Script updated by Illumina in conjunction with CASAVA 1.7.0
+# release.
+#
+# Major changes are as follows:
+# - The CIGAR string has been updated to include all gaps from
+# ELANDv2 alignments.
+# - The ELAND single read alignment score is always stored in the
+# optional "SM" field and the ELAND paired read alignment score
+# is stored in the optional "AS" field when it exists.
+# - The MAPQ value is set to the higher of the two alignment scores,
+# but no greater than 254, i.e. min(254,max(SM,AS))
+# - The SAM "proper pair" bit (0x0002) is now set for read pairs
+# meeting ELAND's expected orientation and insert size criteria.
+# - The default quality score translation is set for export files
+# which contain Phread+64 quality values. An option,
+# "--qlogodds", has been added to translate quality values from
+# the Solexa+64 format used in export files prior to Pipeline
+# 1.3
+# - The export match descriptor is now reverse-complemented when
+# necessary such that it always corresponds to the forward
+# strand of the reference, to be consistent with other
+# information in the SAM record. It is now written to the
+# optional 'XD' field (rather than 'MD') to acknowledge its
+# minor differences from the samtools match descriptor (see
+# additional detail below).
+# - An option, "--nofilter", has been added to include reads which
+# have failed primary analysis quality filtration. Such reads
+# will have the corresponding SAM flag bit (0x0200) set.
+# - Labels in the export 'contig' field are preserved by setting
+# RNAME to "$export_chromosome/$export_contig" when the contig
+# label exists.
+#
+#
+# Contact: lh3
+# Version: 0.1.2 (03JAN2009)
+#
+#
+#
+########## Known Conversion Limitations:
+#
+# - Export records for reads that map to a position < 1 (allowed
+# in export format), are converted to unmapped reads in the SAM
+# record.
+# - Export records contain the reserved chromosome names: "NM",
+# "QC","RM" and "CONTROL". "NM" indicates that the aligner could
+# not map the read to the reference sequence set. "QC" means that
+# the aligner did not attempt to map the read due to some
+# technical limitation. "RM" means that the read mapped to a set
+# of 'contaminant' sequences specified in GERALD's RNA-seq
+# workflow. "CONTROL" means that the read is a control. All of
+# these alignment types are collapsed to the single unmapped
+# alignment state in the SAM record, but the optional SAM "XC"
+# field is used to record the original reserved chromosome name of
+# the read for all but the "NM" case.
+# - The export match descriptor is slightly different than the
+# samtools match descriptor. For this reason it is stored in the
+# optional SAM field 'XD' (and not 'MD'). Note that the export
+# match descriptor differs from the samtools version in two
+# respects: (1) indels are explicitly closed with the '$'
+# character and (2) insertions must be enumerated in the match
+# descriptor. For example a 35-base read with a two-base insertion
+# is described as: 20^2$14
+#
+#
+#
+
+my $version = "2.3.1";
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+use File::Spec;
+use List::Util qw(min max);
+
+
+use constant {
+ EXPORT_MACHINE => 0,
+ EXPORT_RUNNO => 1,
+ EXPORT_LANE => 2,
+ EXPORT_TILE => 3,
+ EXPORT_X => 4,
+ EXPORT_Y => 5,
+ EXPORT_INDEX => 6,
+ EXPORT_READNO => 7,
+ EXPORT_READ => 8,
+ EXPORT_QUAL => 9,
+ EXPORT_CHROM => 10,
+ EXPORT_CONTIG => 11,
+ EXPORT_POS => 12,
+ EXPORT_STRAND => 13,
+ EXPORT_MD => 14,
+ EXPORT_SEMAP => 15,
+ EXPORT_PEMAP => 16,
+ EXPORT_PASSFILT => 21,
+ EXPORT_SIZE => 22,
+};
+
+
+use constant {
+ SAM_QNAME => 0,
+ SAM_FLAG => 1,
+ SAM_RNAME => 2,
+ SAM_POS => 3,
+ SAM_MAPQ => 4,
+ SAM_CIGAR => 5,
+ SAM_MRNM => 6,
+ SAM_MPOS => 7,
+ SAM_ISIZE => 8,
+ SAM_SEQ => 9,
+ SAM_QUAL => 10,
+};
+
+
+# function prototypes for Richard's code
+sub match_desc_to_cigar($);
+sub match_desc_frag_length($);
+sub reverse_compl_match_descriptor($);
+sub write_header($;$;$);
+
+
+&export2sam;
+exit;
+
+
+
+
+sub export2sam {
+
+ my $cmdline = $0 . " " . join(" ",@ARGV);
+ my $arg_count = scalar @ARGV;
+ my $progname = "illumina_export2sam.pl";
+
+ my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
+ my $is_nofilter = 0;
+ my $read1file;
+ my $read2file;
+ my $print_version = 0;
+ my $help = 0;
+
+ my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
+ "nofilter" => \$is_nofilter,
+ "read1=s" => \$read1file,
+ "read2=s" => \$read2file,
+ "version" => \$print_version,
+ "help" => \$help );
+
+ my $usage = <) {
+ $export_line_count++;
+ my (@s1, @s2);
+ &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
+ if ($is_paired) {
+ my $read2line = <$fh2>;
+ if(not $read2line){
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
+ }
+ &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
+
+ if (@s1 && @s2) { # then set mate coordinate
+ if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
+ die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
+ }
+
+ my $isize = 0;
+ if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
+ my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
+ my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
+ $isize = $x2 - $x1;
+ }
+
+ foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
+ my ($sa,$sb,$is) = @{$_};
+ if ($sb->[SAM_RNAME] ne '*') {
+ $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
+ $sa->[SAM_MPOS] = $sb->[SAM_POS];
+ $sa->[SAM_ISIZE] = $is;
+ $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
+ } else {
+ $sa->[SAM_FLAG] |= 0x8;
+ }
+ }
+ }
+ }
+ print join("\t", @s1), "\n" if (@s1);
+ print join("\t", @s2), "\n" if (@s2 && $is_paired);
+ }
+ close($fh1);
+ if($is_paired) {
+ while(my $read2line = <$fh2>){
+ $export_line_count++;
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
+ }
+ close($fh2);
+ }
+}
+
+sub export2sam_aux {
+ my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
+ chomp($line);
+ my @t = split(/\t/, $line, -1);
+ my $isPassFilt = 1;
+ # Sorted files do not have passfilter column, so the number of columns can be total-1.
+ if(scalar(@t) < (EXPORT_SIZE - 1)) {
+ my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n";
+ $msg.="\t...erroneous export record:\n" . $line . "\n\n";
+ die($msg);
+ }
+ elsif(scalar(@t) == EXPORT_SIZE) {
+ $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
+ return if(not ($isPassFilt or $is_nofilter));
+ }
+ @$s = ();
+ # read name
+ my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
+ $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
+ int($t[EXPORT_X]), int($t[EXPORT_Y]));
+ # initial flag (will be updated later)
+ $s->[SAM_FLAG] = 0;
+ if($is_paired) {
+ if($t[EXPORT_READNO] != $read_no){
+ die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
+ }
+ $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
+ }
+ $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
+
+ # read & quality
+ my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
+ if ($is_export_rev) { # then reverse the sequence and quality
+ $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
+ $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
+ $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
+ } else {
+ $s->[SAM_SEQ] = $t[EXPORT_READ];
+ $s->[SAM_QUAL] = $t[EXPORT_QUAL];
+ }
+ my @convqual = ();
+ foreach (unpack('C*', $s->[SAM_QUAL])){
+ my $val=$ct->[$_];
+ if(not defined $val){
+ my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
+ if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
+ die($msg . "\n");
+ }
+ push @convqual,$val;
+ }
+
+ $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
+
+
+ # coor
+ my $has_coor = 0;
+ $s->[SAM_RNAME] = "*";
+ if (($t[EXPORT_CHROM] eq 'NM') or
+ ($t[EXPORT_CHROM] eq 'QC') or
+ ($t[EXPORT_CHROM] eq 'RM') or
+ ($t[EXPORT_CHROM] eq 'CONTROL')) {
+ $s->[SAM_FLAG] |= 0x4; # unmapped
+ push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
+ } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
+ $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
+ push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+ } elsif ($t[EXPORT_POS] < 1) {
+ $s->[SAM_FLAG] |= 0x4; # unmapped
+ } else {
+ $s->[SAM_RNAME] = $t[EXPORT_CHROM];
+ $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
+ $has_coor = 1;
+ }
+ $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
+
+# print STDERR "t[14] = " . $t[14] . "\n";
+ my $matchDesc = '';
+ $s->[SAM_CIGAR] = "*";
+ if($has_coor){
+ $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
+
+ if($matchDesc =~ /\^/){
+ # construct CIGAR string using Richard's function
+ $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
+ } else {
+ $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
+ }
+ }
+
+# print STDERR "cigar_string = $cigar_string\n";
+
+ $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
+ if($has_coor){
+ my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
+ my $pemap = 0;
+ if($is_paired) {
+ $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
+
+ # set `proper pair' bit if non-blank, non-zero PE alignment score:
+ $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
+ }
+ $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
+ } else {
+ $s->[SAM_MAPQ] = 0;
+ }
+ # mate coordinate
+ $s->[SAM_MRNM] = '*';
+ $s->[SAM_MPOS] = 0;
+ $s->[SAM_ISIZE] = 0;
+ # aux
+ push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
+ if($has_coor){
+ # The export match descriptor differs slightly from the samtools match descriptor.
+ # In order for the converted SAM files to be as compliant as possible,
+ # we put the export match descriptor in optional field 'XD' rather than 'MD':
+ push(@$s, "XD:Z:$matchDesc");
+ push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
+ push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
+ }
+}
+
+
+
+#
+# the following code is taken from Richard Shaw's sorted2sam.pl file
+#
+sub reverse_compl_match_descriptor($)
+{
+# print "\nREVERSING THE MATCH DESCRIPTOR!\n";
+ my ($match_desc) = @_;
+ my $rev_compl_match_desc = reverse($match_desc);
+ $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
+
+ # Unreverse the digits of numbers.
+ $rev_compl_match_desc = join('',
+ map {($_ =~ /\d+/)
+ ? join('', reverse(split('', $_)))
+ : $_} split(/(\d+)/,
+ $rev_compl_match_desc));
+
+ return $rev_compl_match_desc;
+}
+
+
+
+sub match_desc_to_cigar($)
+{
+ my ($match_desc) = @_;
+
+ my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
+ my $cigar_str = '';
+ my $cigar_del_ch = 'D';
+ my $cigar_ins_ch = 'I';
+ my $cigar_match_ch = 'M';
+
+ foreach my $match_desc_part (@match_desc_parts) {
+ next if (!$match_desc_part);
+
+ if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
+ # Deletion
+ $cigar_str .= (length($1) . $cigar_del_ch);
+ } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
+ # Insertion
+ $cigar_str .= ($1 . $cigar_ins_ch);
+ } else {
+ $cigar_str .= (match_desc_frag_length($match_desc_part)
+ . $cigar_match_ch);
+ }
+ }
+
+ return $cigar_str;
+}
+
+
+#------------------------------------------------------------------------------
+
+sub match_desc_frag_length($)
+ {
+ my ($match_desc_str) = @_;
+ my $len = 0;
+
+ my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
+
+ foreach my $match_desc_field (@match_desc_fields) {
+ next if ($match_desc_field eq '');
+
+ $len += (($match_desc_field =~ /(\d+)/)
+ ? $1 : length($match_desc_field));
+ }
+
+ return $len;
+}
+
+
+# argument holds the command line
+sub write_header($;$;$)
+{
+ my ($progname,$version,$cl) = @_;
+ my $complete_header = "";
+ $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
+
+ return $complete_header;
+}
diff -r 000000000000 -r 4ebcbb691426 export2sam/illumina_export2sam_gz.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export2sam/illumina_export2sam_gz.pl Fri Sep 12 02:42:34 2014 -0400
@@ -0,0 +1,562 @@
+#!/usr/bin/env perl
+#
+#
+# illumina_export2sam.pl converts GERALD export files to SAM format.
+#
+#
+#
+########## License:
+#
+# The MIT License
+#
+# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
+# Modified SAMtools work copyright (c) 2010 Illumina, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+#
+#
+########## Additional notice for CASAVA installation:
+#
+# This file [illumina_export2sam.pl] in the CASAVA installation has
+# been copied from the file [export2sam.pl] in the SAMtools package
+# and modified by Illumina as permitted under the MIT license that
+# governs SAMtools. Illumina recommends the use of the modified
+# version to convert data from the Illumina export format to the SAM
+# file format. The terms of the MIT license specify your right to
+# further modify and distribute the SAMtools code. For the avoidance
+# of doubt, your rights with respect to copying, modifying, using and
+# distributing CASAVA are more restricted than the rights in the MIT
+# license, and are set forth in the Illumina Genome Analyzer Software
+# License Agreement and the Illumina Source Code License Agreement.
+#
+########## ChangeLog:
+#
+# Version: 2.3.1 (18MAR2011)
+#
+# - Restore file '-' as stdin input.
+#
+# Version: 2.3.0 (24JAN2011)
+#
+# - Add support for export reserved chromosome name "CONTROL",
+# which is translated to optional field "XC:Z:CONTROL".
+# - Check for ".gz" file extension on export files and open
+# these as gzip pipes when the extension is found.
+#
+# Version: 2.2.0 (16NOV2010)
+#
+# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
+# - For export records with reserved chromosome name identifiers
+# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
+# to the SAM record, so that these cases can be distinguished
+# from other unmatched reads.
+#
+# Version: 2.1.0 (21SEP2010)
+#
+# - Additional export record error checking.
+# - Convert export records with chromosome value of "RM" to unmapped
+# SAM records.
+#
+# Version: 2.0.0 (15FEB2010)
+#
+# Script updated by Illumina in conjunction with CASAVA 1.7.0
+# release.
+#
+# Major changes are as follows:
+# - The CIGAR string has been updated to include all gaps from
+# ELANDv2 alignments.
+# - The ELAND single read alignment score is always stored in the
+# optional "SM" field and the ELAND paired read alignment score
+# is stored in the optional "AS" field when it exists.
+# - The MAPQ value is set to the higher of the two alignment scores,
+# but no greater than 254, i.e. min(254,max(SM,AS))
+# - The SAM "proper pair" bit (0x0002) is now set for read pairs
+# meeting ELAND's expected orientation and insert size criteria.
+# - The default quality score translation is set for export files
+# which contain Phread+64 quality values. An option,
+# "--qlogodds", has been added to translate quality values from
+# the Solexa+64 format used in export files prior to Pipeline
+# 1.3
+# - The export match descriptor is now reverse-complemented when
+# necessary such that it always corresponds to the forward
+# strand of the reference, to be consistent with other
+# information in the SAM record. It is now written to the
+# optional 'XD' field (rather than 'MD') to acknowledge its
+# minor differences from the samtools match descriptor (see
+# additional detail below).
+# - An option, "--nofilter", has been added to include reads which
+# have failed primary analysis quality filtration. Such reads
+# will have the corresponding SAM flag bit (0x0200) set.
+# - Labels in the export 'contig' field are preserved by setting
+# RNAME to "$export_chromosome/$export_contig" when the contig
+# label exists.
+#
+#
+# Contact: lh3
+# Version: 0.1.2 (03JAN2009)
+#
+#
+#
+########## Known Conversion Limitations:
+#
+# - Export records for reads that map to a position < 1 (allowed
+# in export format), are converted to unmapped reads in the SAM
+# record.
+# - Export records contain the reserved chromosome names: "NM",
+# "QC","RM" and "CONTROL". "NM" indicates that the aligner could
+# not map the read to the reference sequence set. "QC" means that
+# the aligner did not attempt to map the read due to some
+# technical limitation. "RM" means that the read mapped to a set
+# of 'contaminant' sequences specified in GERALD's RNA-seq
+# workflow. "CONTROL" means that the read is a control. All of
+# these alignment types are collapsed to the single unmapped
+# alignment state in the SAM record, but the optional SAM "XC"
+# field is used to record the original reserved chromosome name of
+# the read for all but the "NM" case.
+# - The export match descriptor is slightly different than the
+# samtools match descriptor. For this reason it is stored in the
+# optional SAM field 'XD' (and not 'MD'). Note that the export
+# match descriptor differs from the samtools version in two
+# respects: (1) indels are explicitly closed with the '$'
+# character and (2) insertions must be enumerated in the match
+# descriptor. For example a 35-base read with a two-base insertion
+# is described as: 20^2$14
+#
+#
+#
+
+my $version = "2.3.1";
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+use File::Spec;
+use List::Util qw(min max);
+
+
+use constant {
+ EXPORT_MACHINE => 0,
+ EXPORT_RUNNO => 1,
+ EXPORT_LANE => 2,
+ EXPORT_TILE => 3,
+ EXPORT_X => 4,
+ EXPORT_Y => 5,
+ EXPORT_INDEX => 6,
+ EXPORT_READNO => 7,
+ EXPORT_READ => 8,
+ EXPORT_QUAL => 9,
+ EXPORT_CHROM => 10,
+ EXPORT_CONTIG => 11,
+ EXPORT_POS => 12,
+ EXPORT_STRAND => 13,
+ EXPORT_MD => 14,
+ EXPORT_SEMAP => 15,
+ EXPORT_PEMAP => 16,
+ EXPORT_PASSFILT => 21,
+ EXPORT_SIZE => 22,
+};
+
+
+use constant {
+ SAM_QNAME => 0,
+ SAM_FLAG => 1,
+ SAM_RNAME => 2,
+ SAM_POS => 3,
+ SAM_MAPQ => 4,
+ SAM_CIGAR => 5,
+ SAM_MRNM => 6,
+ SAM_MPOS => 7,
+ SAM_ISIZE => 8,
+ SAM_SEQ => 9,
+ SAM_QUAL => 10,
+};
+
+
+# function prototypes for Richard's code
+sub match_desc_to_cigar($);
+sub match_desc_frag_length($);
+sub reverse_compl_match_descriptor($);
+sub write_header($;$;$);
+
+
+&export2sam;
+exit;
+
+
+
+
+sub export2sam {
+
+ my $cmdline = $0 . " " . join(" ",@ARGV);
+ my $arg_count = scalar @ARGV;
+ my $progname = "illumina_export2sam.pl";
+
+ my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
+ my $is_nofilter = 0;
+ my $read1file;
+ my $read2file;
+ my $print_version = 0;
+ my $help = 0;
+
+ my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
+ "nofilter" => \$is_nofilter,
+ "read1=s" => \$read1file,
+ "read2=s" => \$read2file,
+ "version" => \$print_version,
+ "help" => \$help );
+
+ my $usage = <) {
+ $export_line_count++;
+ my (@s1, @s2);
+ &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
+ if ($is_paired) {
+ my $read2line = <$fh2>;
+ if(not $read2line){
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
+ }
+ &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
+
+ if (@s1 && @s2) { # then set mate coordinate
+ if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
+ die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
+ }
+
+ my $isize = 0;
+ if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
+ my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
+ my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
+ $isize = $x2 - $x1;
+ }
+
+ foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
+ my ($sa,$sb,$is) = @{$_};
+ if ($sb->[SAM_RNAME] ne '*') {
+ $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
+ $sa->[SAM_MPOS] = $sb->[SAM_POS];
+ $sa->[SAM_ISIZE] = $is;
+ $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
+ } else {
+ $sa->[SAM_FLAG] |= 0x8;
+ }
+ }
+ }
+ }
+ print join("\t", @s1), "\n" if (@s1);
+ print join("\t", @s2), "\n" if (@s2 && $is_paired);
+ }
+ close($fh1);
+ if($is_paired) {
+ while(my $read2line = <$fh2>){
+ $export_line_count++;
+ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
+ }
+ close($fh2);
+ }
+}
+
+sub export2sam_aux {
+ my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
+ chomp($line);
+ my @t = split(/\t/, $line, -1);
+ my $isPassFilt = 1;
+ # Sorted files do not have passfilter column, so the number of columns can be total-1.
+ if(scalar(@t) < (EXPORT_SIZE - 1)) {
+ my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n";
+ $msg.="\t...erroneous export record:\n" . $line . "\n\n";
+ die($msg);
+ }
+ elsif(scalar(@t) == EXPORT_SIZE) {
+ $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
+ return if(not ($isPassFilt or $is_nofilter));
+ }
+ @$s = ();
+ # read name
+ my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
+ $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
+ int($t[EXPORT_X]), int($t[EXPORT_Y]));
+ # initial flag (will be updated later)
+ $s->[SAM_FLAG] = 0;
+ if($is_paired) {
+ if($t[EXPORT_READNO] != $read_no){
+ die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
+ }
+ $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
+ }
+ $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
+
+ # read & quality
+ my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
+ if ($is_export_rev) { # then reverse the sequence and quality
+ $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
+ $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
+ $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
+ } else {
+ $s->[SAM_SEQ] = $t[EXPORT_READ];
+ $s->[SAM_QUAL] = $t[EXPORT_QUAL];
+ }
+ my @convqual = ();
+ foreach (unpack('C*', $s->[SAM_QUAL])){
+ my $val=$ct->[$_];
+ if(not defined $val){
+ my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
+ if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
+ die($msg . "\n");
+ }
+ push @convqual,$val;
+ }
+
+ $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
+
+
+ # coor
+ my $has_coor = 0;
+ $s->[SAM_RNAME] = "*";
+ if (($t[EXPORT_CHROM] eq 'NM') or
+ ($t[EXPORT_CHROM] eq 'QC') or
+ ($t[EXPORT_CHROM] eq 'RM') or
+ ($t[EXPORT_CHROM] eq 'CONTROL')) {
+ $s->[SAM_FLAG] |= 0x4; # unmapped
+ push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
+ } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
+ $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
+ push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+ } elsif ($t[EXPORT_POS] < 1) {
+ $s->[SAM_FLAG] |= 0x4; # unmapped
+ } else {
+ $s->[SAM_RNAME] = $t[EXPORT_CHROM];
+ $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
+ $has_coor = 1;
+ }
+ $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
+
+# print STDERR "t[14] = " . $t[14] . "\n";
+ my $matchDesc = '';
+ $s->[SAM_CIGAR] = "*";
+ if($has_coor){
+ $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
+
+ if($matchDesc =~ /\^/){
+ # construct CIGAR string using Richard's function
+ $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
+ } else {
+ $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
+ }
+ }
+
+# print STDERR "cigar_string = $cigar_string\n";
+
+ $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
+ if($has_coor){
+ my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
+ my $pemap = 0;
+ if($is_paired) {
+ $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
+
+ # set `proper pair' bit if non-blank, non-zero PE alignment score:
+ $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
+ }
+ $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
+ } else {
+ $s->[SAM_MAPQ] = 0;
+ }
+ # mate coordinate
+ $s->[SAM_MRNM] = '*';
+ $s->[SAM_MPOS] = 0;
+ $s->[SAM_ISIZE] = 0;
+ # aux
+ push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
+ if($has_coor){
+ # The export match descriptor differs slightly from the samtools match descriptor.
+ # In order for the converted SAM files to be as compliant as possible,
+ # we put the export match descriptor in optional field 'XD' rather than 'MD':
+ push(@$s, "XD:Z:$matchDesc");
+ push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
+ push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
+ }
+}
+
+
+
+#
+# the following code is taken from Richard Shaw's sorted2sam.pl file
+#
+sub reverse_compl_match_descriptor($)
+{
+# print "\nREVERSING THE MATCH DESCRIPTOR!\n";
+ my ($match_desc) = @_;
+ my $rev_compl_match_desc = reverse($match_desc);
+ $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
+
+ # Unreverse the digits of numbers.
+ $rev_compl_match_desc = join('',
+ map {($_ =~ /\d+/)
+ ? join('', reverse(split('', $_)))
+ : $_} split(/(\d+)/,
+ $rev_compl_match_desc));
+
+ return $rev_compl_match_desc;
+}
+
+
+
+sub match_desc_to_cigar($)
+{
+ my ($match_desc) = @_;
+
+ my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
+ my $cigar_str = '';
+ my $cigar_del_ch = 'D';
+ my $cigar_ins_ch = 'I';
+ my $cigar_match_ch = 'M';
+
+ foreach my $match_desc_part (@match_desc_parts) {
+ next if (!$match_desc_part);
+
+ if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
+ # Deletion
+ $cigar_str .= (length($1) . $cigar_del_ch);
+ } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
+ # Insertion
+ $cigar_str .= ($1 . $cigar_ins_ch);
+ } else {
+ $cigar_str .= (match_desc_frag_length($match_desc_part)
+ . $cigar_match_ch);
+ }
+ }
+
+ return $cigar_str;
+}
+
+
+#------------------------------------------------------------------------------
+
+sub match_desc_frag_length($)
+ {
+ my ($match_desc_str) = @_;
+ my $len = 0;
+
+ my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
+
+ foreach my $match_desc_field (@match_desc_fields) {
+ next if ($match_desc_field eq '');
+
+ $len += (($match_desc_field =~ /(\d+)/)
+ ? $1 : length($match_desc_field));
+ }
+
+ return $len;
+}
+
+
+# argument holds the command line
+sub write_header($;$;$)
+{
+ my ($progname,$version,$cl) = @_;
+ my $complete_header = "";
+ $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
+
+ return $complete_header;
+}