0
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
1 #!/usr/bin/env perl
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
2 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
3 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
4 # illumina_export2sam.pl converts GERALD export files to SAM format.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
5 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
6 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
7 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
8 ########## License:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
9 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
10 # The MIT License
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
11 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
12 # Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
13 # Modified SAMtools work copyright (c) 2010 Illumina, Inc.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
14 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
15 # Permission is hereby granted, free of charge, to any person obtaining a copy
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
16 # of this software and associated documentation files (the "Software"), to deal
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
17 # in the Software without restriction, including without limitation the rights
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
18 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
19 # copies of the Software, and to permit persons to whom the Software is
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
20 # furnished to do so, subject to the following conditions:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
21 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
22 # The above copyright notice and this permission notice shall be included in
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
23 # all copies or substantial portions of the Software.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
24 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
25 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
26 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
27 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
28 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
29 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
30 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
31 # THE SOFTWARE.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
32 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
33 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
34 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
35 ########## Additional notice for CASAVA installation:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
36 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
37 # This file [illumina_export2sam.pl] in the CASAVA installation has
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
38 # been copied from the file [export2sam.pl] in the SAMtools package
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
39 # and modified by Illumina as permitted under the MIT license that
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
40 # governs SAMtools. Illumina recommends the use of the modified
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
41 # version to convert data from the Illumina export format to the SAM
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
42 # file format. The terms of the MIT license specify your right to
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
43 # further modify and distribute the SAMtools code. For the avoidance
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
44 # of doubt, your rights with respect to copying, modifying, using and
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
45 # distributing CASAVA are more restricted than the rights in the MIT
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
46 # license, and are set forth in the Illumina Genome Analyzer Software
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
47 # License Agreement and the Illumina Source Code License Agreement.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
48 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
49 ########## ChangeLog:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
50 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
51 # Version: 2.3.1 (18MAR2011)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
52 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
53 # - Restore file '-' as stdin input.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
54 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
55 # Version: 2.3.0 (24JAN2011)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
56 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
57 # - Add support for export reserved chromosome name "CONTROL",
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
58 # which is translated to optional field "XC:Z:CONTROL".
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
59 # - Check for ".gz" file extension on export files and open
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
60 # these as gzip pipes when the extension is found.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
61 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
62 # Version: 2.2.0 (16NOV2010)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
63 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
64 # - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
65 # - For export records with reserved chromosome name identifiers
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
66 # "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
67 # to the SAM record, so that these cases can be distinguished
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
68 # from other unmatched reads.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
69 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
70 # Version: 2.1.0 (21SEP2010)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
71 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
72 # - Additional export record error checking.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
73 # - Convert export records with chromosome value of "RM" to unmapped
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
74 # SAM records.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
75 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
76 # Version: 2.0.0 (15FEB2010)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
77 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
78 # Script updated by Illumina in conjunction with CASAVA 1.7.0
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
79 # release.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
80 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
81 # Major changes are as follows:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
82 # - The CIGAR string has been updated to include all gaps from
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
83 # ELANDv2 alignments.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
84 # - The ELAND single read alignment score is always stored in the
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
85 # optional "SM" field and the ELAND paired read alignment score
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
86 # is stored in the optional "AS" field when it exists.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
87 # - The MAPQ value is set to the higher of the two alignment scores,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
88 # but no greater than 254, i.e. min(254,max(SM,AS))
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
89 # - The SAM "proper pair" bit (0x0002) is now set for read pairs
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
90 # meeting ELAND's expected orientation and insert size criteria.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
91 # - The default quality score translation is set for export files
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
92 # which contain Phread+64 quality values. An option,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
93 # "--qlogodds", has been added to translate quality values from
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
94 # the Solexa+64 format used in export files prior to Pipeline
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
95 # 1.3
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
96 # - The export match descriptor is now reverse-complemented when
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
97 # necessary such that it always corresponds to the forward
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
98 # strand of the reference, to be consistent with other
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
99 # information in the SAM record. It is now written to the
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
100 # optional 'XD' field (rather than 'MD') to acknowledge its
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
101 # minor differences from the samtools match descriptor (see
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
102 # additional detail below).
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
103 # - An option, "--nofilter", has been added to include reads which
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
104 # have failed primary analysis quality filtration. Such reads
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
105 # will have the corresponding SAM flag bit (0x0200) set.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
106 # - Labels in the export 'contig' field are preserved by setting
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
107 # RNAME to "$export_chromosome/$export_contig" when the contig
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
108 # label exists.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
109 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
110 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
111 # Contact: lh3
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
112 # Version: 0.1.2 (03JAN2009)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
113 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
114 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
115 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
116 ########## Known Conversion Limitations:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
117 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
118 # - Export records for reads that map to a position < 1 (allowed
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
119 # in export format), are converted to unmapped reads in the SAM
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
120 # record.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
121 # - Export records contain the reserved chromosome names: "NM",
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
122 # "QC","RM" and "CONTROL". "NM" indicates that the aligner could
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
123 # not map the read to the reference sequence set. "QC" means that
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
124 # the aligner did not attempt to map the read due to some
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
125 # technical limitation. "RM" means that the read mapped to a set
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
126 # of 'contaminant' sequences specified in GERALD's RNA-seq
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
127 # workflow. "CONTROL" means that the read is a control. All of
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
128 # these alignment types are collapsed to the single unmapped
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
129 # alignment state in the SAM record, but the optional SAM "XC"
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
130 # field is used to record the original reserved chromosome name of
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
131 # the read for all but the "NM" case.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
132 # - The export match descriptor is slightly different than the
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
133 # samtools match descriptor. For this reason it is stored in the
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
134 # optional SAM field 'XD' (and not 'MD'). Note that the export
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
135 # match descriptor differs from the samtools version in two
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
136 # respects: (1) indels are explicitly closed with the '$'
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
137 # character and (2) insertions must be enumerated in the match
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
138 # descriptor. For example a 35-base read with a two-base insertion
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
139 # is described as: 20^2$14
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
140 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
141 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
142 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
143
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
144 my $version = "2.3.1";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
145
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
146 use strict;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
147 use warnings;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
148
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
149 use Getopt::Long;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
150 use File::Spec;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
151 use List::Util qw(min max);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
152
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
153
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
154 use constant {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
155 EXPORT_MACHINE => 0,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
156 EXPORT_RUNNO => 1,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
157 EXPORT_LANE => 2,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
158 EXPORT_TILE => 3,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
159 EXPORT_X => 4,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
160 EXPORT_Y => 5,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
161 EXPORT_INDEX => 6,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
162 EXPORT_READNO => 7,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
163 EXPORT_READ => 8,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
164 EXPORT_QUAL => 9,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
165 EXPORT_CHROM => 10,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
166 EXPORT_CONTIG => 11,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
167 EXPORT_POS => 12,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
168 EXPORT_STRAND => 13,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
169 EXPORT_MD => 14,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
170 EXPORT_SEMAP => 15,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
171 EXPORT_PEMAP => 16,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
172 EXPORT_PASSFILT => 21,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
173 EXPORT_SIZE => 22,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
174 };
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
175
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
176
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
177 use constant {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
178 SAM_QNAME => 0,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
179 SAM_FLAG => 1,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
180 SAM_RNAME => 2,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
181 SAM_POS => 3,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
182 SAM_MAPQ => 4,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
183 SAM_CIGAR => 5,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
184 SAM_MRNM => 6,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
185 SAM_MPOS => 7,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
186 SAM_ISIZE => 8,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
187 SAM_SEQ => 9,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
188 SAM_QUAL => 10,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
189 };
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
190
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
191
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
192 # function prototypes for Richard's code
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
193 sub match_desc_to_cigar($);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
194 sub match_desc_frag_length($);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
195 sub reverse_compl_match_descriptor($);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
196 sub write_header($;$;$);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
197
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
198
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
199 &export2sam;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
200 exit;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
201
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
202
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
203
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
204
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
205 sub export2sam {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
206
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
207 my $cmdline = $0 . " " . join(" ",@ARGV);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
208 my $arg_count = scalar @ARGV;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
209 my $progname = "illumina_export2sam.pl";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
210
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
211 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
212 my $is_nofilter = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
213 my $read1file;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
214 my $read2file;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
215 my $print_version = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
216 my $help = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
217
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
218 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
219 "nofilter" => \$is_nofilter,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
220 "read1=s" => \$read1file,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
221 "read2=s" => \$read2file,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
222 "version" => \$print_version,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
223 "help" => \$help );
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
224
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
225 my $usage = <<END;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
226
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
227 $progname converts GERALD export files to SAM format.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
228
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
229 Usage: $progname --read1=FILENAME [ options ] | --version | --help
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
230
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
231 --read1=FILENAME read1 export file or '-' for stdin (mandatory)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
232 (file may be gzipped with ".gz" extension)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
233 --read2=FILENAME read2 export file or '-' for stdin
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
234 (file may be gzipped with ".gz" extension)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
235 --nofilter include reads that failed the basecaller
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
236 purity filter
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
237 --qlogodds assume export file(s) use logodds quality values
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
238 as reported by OLB (Pipeline) prior to v1.3
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
239 (default: phred quality values)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
240
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
241 END
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
242
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
243 my $version_msg = <<END;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
244
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
245 $progname version: $version
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
246
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
247 END
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
248
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
249 if((not $result) or $help or ($arg_count==0)) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
250 die($usage);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
251 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
252
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
253 if(@ARGV) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
254 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
255 die($usage);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
256 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
257
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
258 if($print_version) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
259 die($version_msg);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
260 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
261
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
262 if(not defined($read1file)) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
263 print STDERR "\nERROR: read1 export file must be specified\n\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
264 die($usage);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
265 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
266
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
267 unless((-f $read1file) or ($read1file eq '-')) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
268 die("\nERROR: Can't find read1 export file: '$read1file'\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
269 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
270
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
271 if (defined $read2file) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
272 unless((-f $read2file) or ($read2file eq '-')) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
273 die("\nERROR: Can't find read2 export file: '$read2file'\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
274 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
275 if($read1file eq $read2file) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
276 die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
277 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
278 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
279
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
280 my ($fh1, $fh2, $is_paired);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
281
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
282 my $read1cmd="$read1file";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
283 $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
284 open($fh1, $read1cmd)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
285 or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
286 $is_paired = defined $read2file;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
287 if ($is_paired) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
288 my $read2cmd="$read2file";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
289 $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
290 open($fh2, $read2cmd)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
291 or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
292 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
293 # quality value conversion table
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
294 my @conv_table;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
295 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
296 for (-64..64) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
297 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
298 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
299 } else { # convert from phred+64 quality values (pipeline v1.3+):
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
300 for (-64..-1) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
301 $conv_table[$_+64] = undef;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
302 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
303 for (0..64) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
304 $conv_table[$_+64] = int(33 + $_);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
305 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
306 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
307 # write the header
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
308 print write_header( $progname, $version, $cmdline );
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
309 # core loop
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
310 my $export_line_count = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
311 while (<$fh1>) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
312 $export_line_count++;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
313 my (@s1, @s2);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
314 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
315 if ($is_paired) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
316 my $read2line = <$fh2>;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
317 if(not $read2line){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
318 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
319 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
320 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
321
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
322 if (@s1 && @s2) { # then set mate coordinate
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
323 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
324 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
325 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
326
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
327 my $isize = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
328 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
329 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
330 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
331 $isize = $x2 - $x1;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
332 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
333
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
334 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
335 my ($sa,$sb,$is) = @{$_};
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
336 if ($sb->[SAM_RNAME] ne '*') {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
337 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
338 $sa->[SAM_MPOS] = $sb->[SAM_POS];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
339 $sa->[SAM_ISIZE] = $is;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
340 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
341 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
342 $sa->[SAM_FLAG] |= 0x8;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
343 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
344 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
345 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
346 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
347 print join("\t", @s1), "\n" if (@s1);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
348 print join("\t", @s2), "\n" if (@s2 && $is_paired);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
349 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
350 close($fh1);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
351 if($is_paired) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
352 while(my $read2line = <$fh2>){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
353 $export_line_count++;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
354 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
355 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
356 close($fh2);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
357 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
358 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
359
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
360 sub export2sam_aux {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
361 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
362 chomp($line);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
363 my @t = split(/\t/, $line, -1);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
364 my $isPassFilt = 1;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
365 # Sorted files do not have passfilter column, so the number of columns can be total-1.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
366 if(scalar(@t) < (EXPORT_SIZE - 1)) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
367 my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . (EXPORT_SIZE - 1) . ".\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
368 $msg.="\t...erroneous export record:\n" . $line . "\n\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
369 die($msg);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
370 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
371 elsif(scalar(@t) == EXPORT_SIZE) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
372 $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
373 return if(not ($isPassFilt or $is_nofilter));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
374 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
375 @$s = ();
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
376 # read name
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
377 my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
378 $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
379 int($t[EXPORT_X]), int($t[EXPORT_Y]));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
380 # initial flag (will be updated later)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
381 $s->[SAM_FLAG] = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
382 if($is_paired) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
383 if($t[EXPORT_READNO] != $read_no){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
384 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
385 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
386 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
387 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
388 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
389
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
390 # read & quality
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
391 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
392 if ($is_export_rev) { # then reverse the sequence and quality
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
393 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
394 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
395 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
396 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
397 $s->[SAM_SEQ] = $t[EXPORT_READ];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
398 $s->[SAM_QUAL] = $t[EXPORT_QUAL];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
399 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
400 my @convqual = ();
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
401 foreach (unpack('C*', $s->[SAM_QUAL])){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
402 my $val=$ct->[$_];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
403 if(not defined $val){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
404 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
405 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
406 die($msg . "\n");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
407 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
408 push @convqual,$val;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
409 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
410
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
411 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
412
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
413
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
414 # coor
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
415 my $has_coor = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
416 $s->[SAM_RNAME] = "*";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
417 if (($t[EXPORT_CHROM] eq 'NM') or
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
418 ($t[EXPORT_CHROM] eq 'QC') or
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
419 ($t[EXPORT_CHROM] eq 'RM') or
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
420 ($t[EXPORT_CHROM] eq 'CONTROL')) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
421 $s->[SAM_FLAG] |= 0x4; # unmapped
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
422 push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
423 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
424 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
425 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
426 } elsif ($t[EXPORT_POS] < 1) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
427 $s->[SAM_FLAG] |= 0x4; # unmapped
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
428 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
429 $s->[SAM_RNAME] = $t[EXPORT_CHROM];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
430 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
431 $has_coor = 1;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
432 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
433 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
434
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
435 # print STDERR "t[14] = " . $t[14] . "\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
436 my $matchDesc = '';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
437 $s->[SAM_CIGAR] = "*";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
438 if($has_coor){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
439 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
440
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
441 if($matchDesc =~ /\^/){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
442 # construct CIGAR string using Richard's function
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
443 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
444 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
445 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
446 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
447 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
448
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
449 # print STDERR "cigar_string = $cigar_string\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
450
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
451 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
452 if($has_coor){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
453 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
454 my $pemap = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
455 if($is_paired) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
456 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
457
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
458 # set `proper pair' bit if non-blank, non-zero PE alignment score:
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
459 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
460 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
461 $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
462 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
463 $s->[SAM_MAPQ] = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
464 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
465 # mate coordinate
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
466 $s->[SAM_MRNM] = '*';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
467 $s->[SAM_MPOS] = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
468 $s->[SAM_ISIZE] = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
469 # aux
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
470 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
471 if($has_coor){
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
472 # The export match descriptor differs slightly from the samtools match descriptor.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
473 # In order for the converted SAM files to be as compliant as possible,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
474 # we put the export match descriptor in optional field 'XD' rather than 'MD':
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
475 push(@$s, "XD:Z:$matchDesc");
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
476 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
477 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
478 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
479 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
480
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
481
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
482
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
483 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
484 # the following code is taken from Richard Shaw's sorted2sam.pl file
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
485 #
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
486 sub reverse_compl_match_descriptor($)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
487 {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
488 # print "\nREVERSING THE MATCH DESCRIPTOR!\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
489 my ($match_desc) = @_;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
490 my $rev_compl_match_desc = reverse($match_desc);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
491 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
492
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
493 # Unreverse the digits of numbers.
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
494 $rev_compl_match_desc = join('',
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
495 map {($_ =~ /\d+/)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
496 ? join('', reverse(split('', $_)))
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
497 : $_} split(/(\d+)/,
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
498 $rev_compl_match_desc));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
499
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
500 return $rev_compl_match_desc;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
501 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
502
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
503
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
504
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
505 sub match_desc_to_cigar($)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
506 {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
507 my ($match_desc) = @_;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
508
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
509 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
510 my $cigar_str = '';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
511 my $cigar_del_ch = 'D';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
512 my $cigar_ins_ch = 'I';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
513 my $cigar_match_ch = 'M';
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
514
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
515 foreach my $match_desc_part (@match_desc_parts) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
516 next if (!$match_desc_part);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
517
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
518 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
519 # Deletion
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
520 $cigar_str .= (length($1) . $cigar_del_ch);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
521 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
522 # Insertion
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
523 $cigar_str .= ($1 . $cigar_ins_ch);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
524 } else {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
525 $cigar_str .= (match_desc_frag_length($match_desc_part)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
526 . $cigar_match_ch);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
527 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
528 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
529
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
530 return $cigar_str;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
531 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
532
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
533
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
534 #------------------------------------------------------------------------------
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
535
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
536 sub match_desc_frag_length($)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
537 {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
538 my ($match_desc_str) = @_;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
539 my $len = 0;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
540
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
541 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
542
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
543 foreach my $match_desc_field (@match_desc_fields) {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
544 next if ($match_desc_field eq '');
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
545
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
546 $len += (($match_desc_field =~ /(\d+)/)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
547 ? $1 : length($match_desc_field));
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
548 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
549
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
550 return $len;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
551 }
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
552
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
553
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
554 # argument holds the command line
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
555 sub write_header($;$;$)
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
556 {
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
557 my ($progname,$version,$cl) = @_;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
558 my $complete_header = "";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
559 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
560
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
561 return $complete_header;
|
ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
parents:
diff
changeset
|
562 }
|