Mercurial > repos > pitagora > export2sam
changeset 3:670f78e0d8db draft
Uploaded
author | pitagora |
---|---|
date | Sat, 13 Sep 2014 02:31:55 -0400 |
parents | 939c7209f044 |
children | 8f4a43e0212c |
files | export2sam.sh export2sam.xml illumina_export2sam.pl illumina_export2sam_gz.pl |
diffstat | 4 files changed, 0 insertions(+), 1199 deletions(-) [+] |
line wrap: on
line diff
--- a/export2sam.sh Fri Sep 12 02:43:30 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/bin/sh -# SET TOOL DIRECTORY - -CMDNAME=`basename $0` - -while getopts a:b:o:gr OPT -do - case $OPT in - "a" ) FLG_A="TRUE" ; VALUE_A="$OPTARG" ;; - "b" ) FLG_B="TRUE" ; VALUE_B="$OPTARG" ;; - "o" ) FLG_O="TRUE" ; VALUE_O="$OPTARG" ;; - "g" ) FLG_G="TRUE" ;; - "r" ) FLG_R="TRUE" ;; - * ) echo "Usage: $CMDNAME [-a VALUE] [-b] [-o VALUE] [-gr]" 1>&2 - exit 1 ;; - esac -done - -# SINGLE END OR PAIR END -if [ ! $FLG_B ] ; then - ARGS="--read1=$VALUE_A" -else - ARGS="--read1=$VALUE_A --read2=$VALUE_B" -fi - -# GZIPPED OR NOT -if [ $FLG_G ] ; then - CMD1="./illumina_export2sam_gz.pl $ARGS" -else - CMD1="./illumina_export2sam.pl $ARGS" -fi - -# MODIFY ACCORDING TO THE REFERENCE -if [ $FLG_R ] ; then - CMD2='$tmp=$_;$tmp=~s/chr_fragments\.fa\///g;$tmp=~s/(chr\w+)\.fa/$1/g;print $tmp;' - CMD="$CMD1 | perl -ne '$CMD2'" -else - CMD=$CMD1 -fi - -echo $CMD -eval $CMD > $VALUE_O
--- a/export2sam.xml Fri Sep 12 02:43:30 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -<tool id="export2sam" name="Export to SAM" version="0.1"> - <description></description> - <parallelism method="basic"></parallelism> - <command interpreter="sh"> - export2sam.sh - -a $read1 - #if $pairCondition.pairSelect == "yes" - -b $read2 - #end if - -o $output - $gunzip - $remove - </command> - <inputs> - <param name="read1" type="data" format="eland_export" label="ELAND export file" help="" /> - <param name="gunzip" type="boolean" truevalue="-g" falsevalue="" checked="True" label="Input file is gzipped" help="" /> - <param name="remove" type="boolean" truevalue="-r" falsevalue="" checked="True" label="Remove (chr).fa and chr_fragments.fa/" help="" /> - <conditional name="pairCondition"> - <param name="pairSelect" type="select" label="Pair end?"> - <option value="yes">Yes</option> - <option value="no" selected="True">No</option> - </param> - <when value="yes"> - <param name="read2" type="data" format="eland_export" label="ELAND export file" help="" /> - </when> - <when value="no"></when> - </conditional> - </inputs> - <outputs> - <data format="sam" name="output" label="${tool.name} on ${on_string}"/> - </outputs> - <help>If thou thyself canst do it, attend no other's help or hand.</help> -</tool>
--- a/illumina_export2sam.pl Fri Sep 12 02:43:30 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,562 +0,0 @@ -#!/usr/bin/env perl -# -# -# illumina_export2sam.pl converts GERALD export files to SAM format. -# -# -# -########## License: -# -# The MIT License -# -# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. -# Modified SAMtools work copyright (c) 2010 Illumina, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# -# -########## Additional notice for CASAVA installation: -# -# This file [illumina_export2sam.pl] in the CASAVA installation has -# been copied from the file [export2sam.pl] in the SAMtools package -# and modified by Illumina as permitted under the MIT license that -# governs SAMtools. Illumina recommends the use of the modified -# version to convert data from the Illumina export format to the SAM -# file format. The terms of the MIT license specify your right to -# further modify and distribute the SAMtools code. For the avoidance -# of doubt, your rights with respect to copying, modifying, using and -# distributing CASAVA are more restricted than the rights in the MIT -# license, and are set forth in the Illumina Genome Analyzer Software -# License Agreement and the Illumina Source Code License Agreement. -# -########## ChangeLog: -# -# Version: 2.3.1 (18MAR2011) -# -# - Restore file '-' as stdin input. -# -# Version: 2.3.0 (24JAN2011) -# -# - Add support for export reserved chromosome name "CONTROL", -# which is translated to optional field "XC:Z:CONTROL". -# - Check for ".gz" file extension on export files and open -# these as gzip pipes when the extension is found. -# -# Version: 2.2.0 (16NOV2010) -# -# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y -# - For export records with reserved chromosome name identifiers -# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" -# to the SAM record, so that these cases can be distinguished -# from other unmatched reads. -# -# Version: 2.1.0 (21SEP2010) -# -# - Additional export record error checking. -# - Convert export records with chromosome value of "RM" to unmapped -# SAM records. -# -# Version: 2.0.0 (15FEB2010) -# -# Script updated by Illumina in conjunction with CASAVA 1.7.0 -# release. -# -# Major changes are as follows: -# - The CIGAR string has been updated to include all gaps from -# ELANDv2 alignments. -# - The ELAND single read alignment score is always stored in the -# optional "SM" field and the ELAND paired read alignment score -# is stored in the optional "AS" field when it exists. -# - The MAPQ value is set to the higher of the two alignment scores, -# but no greater than 254, i.e. min(254,max(SM,AS)) -# - The SAM "proper pair" bit (0x0002) is now set for read pairs -# meeting ELAND's expected orientation and insert size criteria. -# - The default quality score translation is set for export files -# which contain Phread+64 quality values. An option, -# "--qlogodds", has been added to translate quality values from -# the Solexa+64 format used in export files prior to Pipeline -# 1.3 -# - The export match descriptor is now reverse-complemented when -# necessary such that it always corresponds to the forward -# strand of the reference, to be consistent with other -# information in the SAM record. It is now written to the -# optional 'XD' field (rather than 'MD') to acknowledge its -# minor differences from the samtools match descriptor (see -# additional detail below). -# - An option, "--nofilter", has been added to include reads which -# have failed primary analysis quality filtration. Such reads -# will have the corresponding SAM flag bit (0x0200) set. -# - Labels in the export 'contig' field are preserved by setting -# RNAME to "$export_chromosome/$export_contig" when the contig -# label exists. -# -# -# Contact: lh3 -# Version: 0.1.2 (03JAN2009) -# -# -# -########## Known Conversion Limitations: -# -# - Export records for reads that map to a position < 1 (allowed -# in export format), are converted to unmapped reads in the SAM -# record. -# - Export records contain the reserved chromosome names: "NM", -# "QC","RM" and "CONTROL". "NM" indicates that the aligner could -# not map the read to the reference sequence set. "QC" means that -# the aligner did not attempt to map the read due to some -# technical limitation. "RM" means that the read mapped to a set -# of 'contaminant' sequences specified in GERALD's RNA-seq -# workflow. "CONTROL" means that the read is a control. All of -# these alignment types are collapsed to the single unmapped -# alignment state in the SAM record, but the optional SAM "XC" -# field is used to record the original reserved chromosome name of -# the read for all but the "NM" case. -# - The export match descriptor is slightly different than the -# samtools match descriptor. For this reason it is stored in the -# optional SAM field 'XD' (and not 'MD'). Note that the export -# match descriptor differs from the samtools version in two -# respects: (1) indels are explicitly closed with the '$' -# character and (2) insertions must be enumerated in the match -# descriptor. For example a 35-base read with a two-base insertion -# is described as: 20^2$14 -# -# -# - -my $version = "2.3.1"; - -use strict; -use warnings; - -use Getopt::Long; -use File::Spec; -use List::Util qw(min max); - - -use constant { - EXPORT_MACHINE => 0, - EXPORT_RUNNO => 1, - EXPORT_LANE => 2, - EXPORT_TILE => 3, - EXPORT_X => 4, - EXPORT_Y => 5, - EXPORT_INDEX => 6, - EXPORT_READNO => 7, - EXPORT_READ => 8, - EXPORT_QUAL => 9, - EXPORT_CHROM => 10, - EXPORT_CONTIG => 11, - EXPORT_POS => 12, - EXPORT_STRAND => 13, - EXPORT_MD => 14, - EXPORT_SEMAP => 15, - EXPORT_PEMAP => 16, - EXPORT_PASSFILT => 21, - EXPORT_SIZE => 22, -}; - - -use constant { - SAM_QNAME => 0, - SAM_FLAG => 1, - SAM_RNAME => 2, - SAM_POS => 3, - SAM_MAPQ => 4, - SAM_CIGAR => 5, - SAM_MRNM => 6, - SAM_MPOS => 7, - SAM_ISIZE => 8, - SAM_SEQ => 9, - SAM_QUAL => 10, -}; - - -# function prototypes for Richard's code -sub match_desc_to_cigar($); -sub match_desc_frag_length($); -sub reverse_compl_match_descriptor($); -sub write_header($;$;$); - - -&export2sam; -exit; - - - - -sub export2sam { - - my $cmdline = $0 . " " . join(" ",@ARGV); - my $arg_count = scalar @ARGV; - my $progname = "illumina_export2sam.pl"; - - my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values - my $is_nofilter = 0; - my $read1file; - my $read2file; - my $print_version = 0; - my $help = 0; - - my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, - "nofilter" => \$is_nofilter, - "read1=s" => \$read1file, - "read2=s" => \$read2file, - "version" => \$print_version, - "help" => \$help ); - - my $usage = <<END; - -$progname converts GERALD export files to SAM format. - -Usage: $progname --read1=FILENAME [ options ] | --version | --help - - --read1=FILENAME read1 export file or '-' for stdin (mandatory) - (file may be gzipped with ".gz" extension) - --read2=FILENAME read2 export file or '-' for stdin - (file may be gzipped with ".gz" extension) - --nofilter include reads that failed the basecaller - purity filter - --qlogodds assume export file(s) use logodds quality values - as reported by OLB (Pipeline) prior to v1.3 - (default: phred quality values) - -END - - my $version_msg = <<END; - -$progname version: $version - -END - - if((not $result) or $help or ($arg_count==0)) { - die($usage); - } - - if(@ARGV) { - print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n"; - die($usage); - } - - if($print_version) { - die($version_msg); - } - - if(not defined($read1file)) { - print STDERR "\nERROR: read1 export file must be specified\n\n"; - die($usage); - } - - unless((-f $read1file) or ($read1file eq '-')) { - die("\nERROR: Can't find read1 export file: '$read1file'\n\n"); - } - - if (defined $read2file) { - unless((-f $read2file) or ($read2file eq '-')) { - die("\nERROR: Can't find read2 export file: '$read2file'\n\n"); - } - if($read1file eq $read2file) { - die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n"); - } - } - - my ($fh1, $fh2, $is_paired); - - my $read1cmd="$read1file"; - $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/); - open($fh1, $read1cmd) - or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n"); - $is_paired = defined $read2file; - if ($is_paired) { - my $read2cmd="$read2file"; - $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/); - open($fh2, $read2cmd) - or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n"); - } - # quality value conversion table - my @conv_table; - if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): - for (-64..64) { - $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); - } - } else { # convert from phred+64 quality values (pipeline v1.3+): - for (-64..-1) { - $conv_table[$_+64] = undef; - } - for (0..64) { - $conv_table[$_+64] = int(33 + $_); - } - } - # write the header - print write_header( $progname, $version, $cmdline ); - # core loop - my $export_line_count = 0; - while (<$fh1>) { - $export_line_count++; - my (@s1, @s2); - &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); - if ($is_paired) { - my $read2line = <$fh2>; - if(not $read2line){ - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); - } - &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); - - if (@s1 && @s2) { # then set mate coordinate - if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ - die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); - } - - my $isize = 0; - if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize - my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; - my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; - $isize = $x2 - $x1; - } - - foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ - my ($sa,$sb,$is) = @{$_}; - if ($sb->[SAM_RNAME] ne '*') { - $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; - $sa->[SAM_MPOS] = $sb->[SAM_POS]; - $sa->[SAM_ISIZE] = $is; - $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); - } else { - $sa->[SAM_FLAG] |= 0x8; - } - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); - } - close($fh1); - if($is_paired) { - while(my $read2line = <$fh2>){ - $export_line_count++; - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); - } - close($fh2); - } -} - -sub export2sam_aux { - my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; - chomp($line); - my @t = split(/\t/, $line, -1); - my $isPassFilt = 1; - # Sorted files do not have passfilter column, so the number of columns can be total-1. - if(scalar(@t) < (EXPORT_SIZE - 1)) { - my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n"; - $msg.="\t...erroneous export record:\n" . $line . "\n\n"; - die($msg); - } - elsif(scalar(@t) == EXPORT_SIZE) { - $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); - return if(not ($isPassFilt or $is_nofilter)); - } - @$s = (); - # read name - my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); - $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), - int($t[EXPORT_X]), int($t[EXPORT_Y])); - # initial flag (will be updated later) - $s->[SAM_FLAG] = 0; - if($is_paired) { - if($t[EXPORT_READNO] != $read_no){ - die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); - } - $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); - } - $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); - - # read & quality - my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); - if ($is_export_rev) { # then reverse the sequence and quality - $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); - $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; - $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); - } else { - $s->[SAM_SEQ] = $t[EXPORT_READ]; - $s->[SAM_QUAL] = $t[EXPORT_QUAL]; - } - my @convqual = (); - foreach (unpack('C*', $s->[SAM_QUAL])){ - my $val=$ct->[$_]; - if(not defined $val){ - my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; - if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } - die($msg . "\n"); - } - push @convqual,$val; - } - - $s->[SAM_QUAL] = pack('C*',@convqual); # change coding - - - # coor - my $has_coor = 0; - $s->[SAM_RNAME] = "*"; - if (($t[EXPORT_CHROM] eq 'NM') or - ($t[EXPORT_CHROM] eq 'QC') or - ($t[EXPORT_CHROM] eq 'RM') or - ($t[EXPORT_CHROM] eq 'CONTROL')) { - $s->[SAM_FLAG] |= 0x4; # unmapped - push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); - } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { - $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") - } elsif ($t[EXPORT_POS] < 1) { - $s->[SAM_FLAG] |= 0x4; # unmapped - } else { - $s->[SAM_RNAME] = $t[EXPORT_CHROM]; - $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); - $has_coor = 1; - } - $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; - -# print STDERR "t[14] = " . $t[14] . "\n"; - my $matchDesc = ''; - $s->[SAM_CIGAR] = "*"; - if($has_coor){ - $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; - - if($matchDesc =~ /\^/){ - # construct CIGAR string using Richard's function - $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing - } else { - $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; - } - } - -# print STDERR "cigar_string = $cigar_string\n"; - - $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); - if($has_coor){ - my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; - my $pemap = 0; - if($is_paired) { - $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; - - # set `proper pair' bit if non-blank, non-zero PE alignment score: - $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); - } - $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); - } else { - $s->[SAM_MAPQ] = 0; - } - # mate coordinate - $s->[SAM_MRNM] = '*'; - $s->[SAM_MPOS] = 0; - $s->[SAM_ISIZE] = 0; - # aux - push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); - if($has_coor){ - # The export match descriptor differs slightly from the samtools match descriptor. - # In order for the converted SAM files to be as compliant as possible, - # we put the export match descriptor in optional field 'XD' rather than 'MD': - push(@$s, "XD:Z:$matchDesc"); - push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); - push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); - } -} - - - -# -# the following code is taken from Richard Shaw's sorted2sam.pl file -# -sub reverse_compl_match_descriptor($) -{ -# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; - my ($match_desc) = @_; - my $rev_compl_match_desc = reverse($match_desc); - $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; - - # Unreverse the digits of numbers. - $rev_compl_match_desc = join('', - map {($_ =~ /\d+/) - ? join('', reverse(split('', $_))) - : $_} split(/(\d+)/, - $rev_compl_match_desc)); - - return $rev_compl_match_desc; -} - - - -sub match_desc_to_cigar($) -{ - my ($match_desc) = @_; - - my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); - my $cigar_str = ''; - my $cigar_del_ch = 'D'; - my $cigar_ins_ch = 'I'; - my $cigar_match_ch = 'M'; - - foreach my $match_desc_part (@match_desc_parts) { - next if (!$match_desc_part); - - if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { - # Deletion - $cigar_str .= (length($1) . $cigar_del_ch); - } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { - # Insertion - $cigar_str .= ($1 . $cigar_ins_ch); - } else { - $cigar_str .= (match_desc_frag_length($match_desc_part) - . $cigar_match_ch); - } - } - - return $cigar_str; -} - - -#------------------------------------------------------------------------------ - -sub match_desc_frag_length($) - { - my ($match_desc_str) = @_; - my $len = 0; - - my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); - - foreach my $match_desc_field (@match_desc_fields) { - next if ($match_desc_field eq ''); - - $len += (($match_desc_field =~ /(\d+)/) - ? $1 : length($match_desc_field)); - } - - return $len; -} - - -# argument holds the command line -sub write_header($;$;$) -{ - my ($progname,$version,$cl) = @_; - my $complete_header = ""; - $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; - - return $complete_header; -}
--- a/illumina_export2sam_gz.pl Fri Sep 12 02:43:30 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,562 +0,0 @@ -#!/usr/bin/env perl -# -# -# illumina_export2sam.pl converts GERALD export files to SAM format. -# -# -# -########## License: -# -# The MIT License -# -# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. -# Modified SAMtools work copyright (c) 2010 Illumina, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# -# -########## Additional notice for CASAVA installation: -# -# This file [illumina_export2sam.pl] in the CASAVA installation has -# been copied from the file [export2sam.pl] in the SAMtools package -# and modified by Illumina as permitted under the MIT license that -# governs SAMtools. Illumina recommends the use of the modified -# version to convert data from the Illumina export format to the SAM -# file format. The terms of the MIT license specify your right to -# further modify and distribute the SAMtools code. For the avoidance -# of doubt, your rights with respect to copying, modifying, using and -# distributing CASAVA are more restricted than the rights in the MIT -# license, and are set forth in the Illumina Genome Analyzer Software -# License Agreement and the Illumina Source Code License Agreement. -# -########## ChangeLog: -# -# Version: 2.3.1 (18MAR2011) -# -# - Restore file '-' as stdin input. -# -# Version: 2.3.0 (24JAN2011) -# -# - Add support for export reserved chromosome name "CONTROL", -# which is translated to optional field "XC:Z:CONTROL". -# - Check for ".gz" file extension on export files and open -# these as gzip pipes when the extension is found. -# -# Version: 2.2.0 (16NOV2010) -# -# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y -# - For export records with reserved chromosome name identifiers -# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" -# to the SAM record, so that these cases can be distinguished -# from other unmatched reads. -# -# Version: 2.1.0 (21SEP2010) -# -# - Additional export record error checking. -# - Convert export records with chromosome value of "RM" to unmapped -# SAM records. -# -# Version: 2.0.0 (15FEB2010) -# -# Script updated by Illumina in conjunction with CASAVA 1.7.0 -# release. -# -# Major changes are as follows: -# - The CIGAR string has been updated to include all gaps from -# ELANDv2 alignments. -# - The ELAND single read alignment score is always stored in the -# optional "SM" field and the ELAND paired read alignment score -# is stored in the optional "AS" field when it exists. -# - The MAPQ value is set to the higher of the two alignment scores, -# but no greater than 254, i.e. min(254,max(SM,AS)) -# - The SAM "proper pair" bit (0x0002) is now set for read pairs -# meeting ELAND's expected orientation and insert size criteria. -# - The default quality score translation is set for export files -# which contain Phread+64 quality values. An option, -# "--qlogodds", has been added to translate quality values from -# the Solexa+64 format used in export files prior to Pipeline -# 1.3 -# - The export match descriptor is now reverse-complemented when -# necessary such that it always corresponds to the forward -# strand of the reference, to be consistent with other -# information in the SAM record. It is now written to the -# optional 'XD' field (rather than 'MD') to acknowledge its -# minor differences from the samtools match descriptor (see -# additional detail below). -# - An option, "--nofilter", has been added to include reads which -# have failed primary analysis quality filtration. Such reads -# will have the corresponding SAM flag bit (0x0200) set. -# - Labels in the export 'contig' field are preserved by setting -# RNAME to "$export_chromosome/$export_contig" when the contig -# label exists. -# -# -# Contact: lh3 -# Version: 0.1.2 (03JAN2009) -# -# -# -########## Known Conversion Limitations: -# -# - Export records for reads that map to a position < 1 (allowed -# in export format), are converted to unmapped reads in the SAM -# record. -# - Export records contain the reserved chromosome names: "NM", -# "QC","RM" and "CONTROL". "NM" indicates that the aligner could -# not map the read to the reference sequence set. "QC" means that -# the aligner did not attempt to map the read due to some -# technical limitation. "RM" means that the read mapped to a set -# of 'contaminant' sequences specified in GERALD's RNA-seq -# workflow. "CONTROL" means that the read is a control. All of -# these alignment types are collapsed to the single unmapped -# alignment state in the SAM record, but the optional SAM "XC" -# field is used to record the original reserved chromosome name of -# the read for all but the "NM" case. -# - The export match descriptor is slightly different than the -# samtools match descriptor. For this reason it is stored in the -# optional SAM field 'XD' (and not 'MD'). Note that the export -# match descriptor differs from the samtools version in two -# respects: (1) indels are explicitly closed with the '$' -# character and (2) insertions must be enumerated in the match -# descriptor. For example a 35-base read with a two-base insertion -# is described as: 20^2$14 -# -# -# - -my $version = "2.3.1"; - -use strict; -use warnings; - -use Getopt::Long; -use File::Spec; -use List::Util qw(min max); - - -use constant { - EXPORT_MACHINE => 0, - EXPORT_RUNNO => 1, - EXPORT_LANE => 2, - EXPORT_TILE => 3, - EXPORT_X => 4, - EXPORT_Y => 5, - EXPORT_INDEX => 6, - EXPORT_READNO => 7, - EXPORT_READ => 8, - EXPORT_QUAL => 9, - EXPORT_CHROM => 10, - EXPORT_CONTIG => 11, - EXPORT_POS => 12, - EXPORT_STRAND => 13, - EXPORT_MD => 14, - EXPORT_SEMAP => 15, - EXPORT_PEMAP => 16, - EXPORT_PASSFILT => 21, - EXPORT_SIZE => 22, -}; - - -use constant { - SAM_QNAME => 0, - SAM_FLAG => 1, - SAM_RNAME => 2, - SAM_POS => 3, - SAM_MAPQ => 4, - SAM_CIGAR => 5, - SAM_MRNM => 6, - SAM_MPOS => 7, - SAM_ISIZE => 8, - SAM_SEQ => 9, - SAM_QUAL => 10, -}; - - -# function prototypes for Richard's code -sub match_desc_to_cigar($); -sub match_desc_frag_length($); -sub reverse_compl_match_descriptor($); -sub write_header($;$;$); - - -&export2sam; -exit; - - - - -sub export2sam { - - my $cmdline = $0 . " " . join(" ",@ARGV); - my $arg_count = scalar @ARGV; - my $progname = "illumina_export2sam.pl"; - - my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values - my $is_nofilter = 0; - my $read1file; - my $read2file; - my $print_version = 0; - my $help = 0; - - my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, - "nofilter" => \$is_nofilter, - "read1=s" => \$read1file, - "read2=s" => \$read2file, - "version" => \$print_version, - "help" => \$help ); - - my $usage = <<END; - -$progname converts GERALD export files to SAM format. - -Usage: $progname --read1=FILENAME [ options ] | --version | --help - - --read1=FILENAME read1 export file or '-' for stdin (mandatory) - (file may be gzipped with ".gz" extension) - --read2=FILENAME read2 export file or '-' for stdin - (file may be gzipped with ".gz" extension) - --nofilter include reads that failed the basecaller - purity filter - --qlogodds assume export file(s) use logodds quality values - as reported by OLB (Pipeline) prior to v1.3 - (default: phred quality values) - -END - - my $version_msg = <<END; - -$progname version: $version - -END - - if((not $result) or $help or ($arg_count==0)) { - die($usage); - } - - if(@ARGV) { - print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n"; - die($usage); - } - - if($print_version) { - die($version_msg); - } - - if(not defined($read1file)) { - print STDERR "\nERROR: read1 export file must be specified\n\n"; - die($usage); - } - - unless((-f $read1file) or ($read1file eq '-')) { - die("\nERROR: Can't find read1 export file: '$read1file'\n\n"); - } - - if (defined $read2file) { - unless((-f $read2file) or ($read2file eq '-')) { - die("\nERROR: Can't find read2 export file: '$read2file'\n\n"); - } - if($read1file eq $read2file) { - die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n"); - } - } - - my ($fh1, $fh2, $is_paired); - - my $read1cmd="$read1file"; - $read1cmd = "gzip -dc $read1file |"; - open($fh1, $read1cmd) - or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n"); - $is_paired = defined $read2file; - if ($is_paired) { - my $read2cmd="$read2file"; - $read2cmd = "gzip -dc $read2file |"; - open($fh2, $read2cmd) - or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n"); - } - # quality value conversion table - my @conv_table; - if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3): - for (-64..64) { - $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499); - } - } else { # convert from phred+64 quality values (pipeline v1.3+): - for (-64..-1) { - $conv_table[$_+64] = undef; - } - for (0..64) { - $conv_table[$_+64] = int(33 + $_); - } - } - # write the header - print write_header( $progname, $version, $cmdline ); - # core loop - my $export_line_count = 0; - while (<$fh1>) { - $export_line_count++; - my (@s1, @s2); - &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); - if ($is_paired) { - my $read2line = <$fh2>; - if(not $read2line){ - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); - } - &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); - - if (@s1 && @s2) { # then set mate coordinate - if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ - die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); - } - - my $isize = 0; - if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize - my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; - my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; - $isize = $x2 - $x1; - } - - foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ - my ($sa,$sb,$is) = @{$_}; - if ($sb->[SAM_RNAME] ne '*') { - $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME]; - $sa->[SAM_MPOS] = $sb->[SAM_POS]; - $sa->[SAM_ISIZE] = $is; - $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); - } else { - $sa->[SAM_FLAG] |= 0x8; - } - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); - } - close($fh1); - if($is_paired) { - while(my $read2line = <$fh2>){ - $export_line_count++; - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); - } - close($fh2); - } -} - -sub export2sam_aux { - my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; - chomp($line); - my @t = split(/\t/, $line, -1); - my $isPassFilt = 1; - # Sorted files do not have passfilter column, so the number of columns can be total-1. - if(scalar(@t) < (EXPORT_SIZE - 1)) { - my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n"; - $msg.="\t...erroneous export record:\n" . $line . "\n\n"; - die($msg); - } - elsif(scalar(@t) == EXPORT_SIZE) { - $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); - return if(not ($isPassFilt or $is_nofilter)); - } - @$s = (); - # read name - my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); - $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), - int($t[EXPORT_X]), int($t[EXPORT_Y])); - # initial flag (will be updated later) - $s->[SAM_FLAG] = 0; - if($is_paired) { - if($t[EXPORT_READNO] != $read_no){ - die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); - } - $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); - } - $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); - - # read & quality - my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); - if ($is_export_rev) { # then reverse the sequence and quality - $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); - $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; - $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); - } else { - $s->[SAM_SEQ] = $t[EXPORT_READ]; - $s->[SAM_QUAL] = $t[EXPORT_QUAL]; - } - my @convqual = (); - foreach (unpack('C*', $s->[SAM_QUAL])){ - my $val=$ct->[$_]; - if(not defined $val){ - my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; - if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } - die($msg . "\n"); - } - push @convqual,$val; - } - - $s->[SAM_QUAL] = pack('C*',@convqual); # change coding - - - # coor - my $has_coor = 0; - $s->[SAM_RNAME] = "*"; - if (($t[EXPORT_CHROM] eq 'NM') or - ($t[EXPORT_CHROM] eq 'QC') or - ($t[EXPORT_CHROM] eq 'RM') or - ($t[EXPORT_CHROM] eq 'CONTROL')) { - $s->[SAM_FLAG] |= 0x4; # unmapped - push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); - } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { - $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") - } elsif ($t[EXPORT_POS] < 1) { - $s->[SAM_FLAG] |= 0x4; # unmapped - } else { - $s->[SAM_RNAME] = $t[EXPORT_CHROM]; - $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); - $has_coor = 1; - } - $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; - -# print STDERR "t[14] = " . $t[14] . "\n"; - my $matchDesc = ''; - $s->[SAM_CIGAR] = "*"; - if($has_coor){ - $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; - - if($matchDesc =~ /\^/){ - # construct CIGAR string using Richard's function - $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing - } else { - $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; - } - } - -# print STDERR "cigar_string = $cigar_string\n"; - - $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); - if($has_coor){ - my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; - my $pemap = 0; - if($is_paired) { - $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; - - # set `proper pair' bit if non-blank, non-zero PE alignment score: - $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); - } - $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); - } else { - $s->[SAM_MAPQ] = 0; - } - # mate coordinate - $s->[SAM_MRNM] = '*'; - $s->[SAM_MPOS] = 0; - $s->[SAM_ISIZE] = 0; - # aux - push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); - if($has_coor){ - # The export match descriptor differs slightly from the samtools match descriptor. - # In order for the converted SAM files to be as compliant as possible, - # we put the export match descriptor in optional field 'XD' rather than 'MD': - push(@$s, "XD:Z:$matchDesc"); - push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); - push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); - } -} - - - -# -# the following code is taken from Richard Shaw's sorted2sam.pl file -# -sub reverse_compl_match_descriptor($) -{ -# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; - my ($match_desc) = @_; - my $rev_compl_match_desc = reverse($match_desc); - $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; - - # Unreverse the digits of numbers. - $rev_compl_match_desc = join('', - map {($_ =~ /\d+/) - ? join('', reverse(split('', $_))) - : $_} split(/(\d+)/, - $rev_compl_match_desc)); - - return $rev_compl_match_desc; -} - - - -sub match_desc_to_cigar($) -{ - my ($match_desc) = @_; - - my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); - my $cigar_str = ''; - my $cigar_del_ch = 'D'; - my $cigar_ins_ch = 'I'; - my $cigar_match_ch = 'M'; - - foreach my $match_desc_part (@match_desc_parts) { - next if (!$match_desc_part); - - if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { - # Deletion - $cigar_str .= (length($1) . $cigar_del_ch); - } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { - # Insertion - $cigar_str .= ($1 . $cigar_ins_ch); - } else { - $cigar_str .= (match_desc_frag_length($match_desc_part) - . $cigar_match_ch); - } - } - - return $cigar_str; -} - - -#------------------------------------------------------------------------------ - -sub match_desc_frag_length($) - { - my ($match_desc_str) = @_; - my $len = 0; - - my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); - - foreach my $match_desc_field (@match_desc_fields) { - next if ($match_desc_field eq ''); - - $len += (($match_desc_field =~ /(\d+)/) - ? $1 : length($match_desc_field)); - } - - return $len; -} - - -# argument holds the command line -sub write_header($;$;$) -{ - my ($progname,$version,$cl) = @_; - my $complete_header = ""; - $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; - - return $complete_header; -}