comparison seg2matrix/CGData/CsegToMatrix.cc @ 31:ab20c0d04f4a

add seg2matrix tool
author jingchunzhu
date Fri, 24 Jul 2015 13:10:11 -0700
parents
children 59dbe857f5d4
comparison
equal deleted inserted replaced
30:7a7a52e9b019 31:ab20c0d04f4a
1
2 #include <map>
3 #include <iostream>
4 #include <list>
5 #include <set>
6 #include <vector>
7 #include <sstream>
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <string.h>
11
12 #ifdef __cplusplus
13 extern "C" {
14 #endif
15
16 using namespace std;
17
18 #define MISSING_VAL -99
19
20 class segment {
21 public:
22 string target;
23 string chrome;
24 int start, end;
25 float value;
26 };
27
28 typedef map<string,list<segment> > chromemap;
29 typedef map< string, chromemap > segmap;
30 typedef map<string,set<int> > breakmap;
31 typedef vector<vector<float> > genemap;
32
33
34 segmap * new_segment() {
35 return new segmap();
36 }
37
38 set<string> * new_target_set() {
39 return new set<string>();
40 }
41
42 void add_segment(segmap * seg, set<string> *targetSet, char *sample, char *chrom, int chrom_start, int chrom_end, float value) {
43 string tmp;
44 segment a;
45
46 a.target = strdup(sample);
47 a.chrome = strdup(chrom);
48 a.start = chrom_start;
49 a.end = chrom_end;
50 a.value = value;
51
52 if ( a.target.size() > 0 ) {
53 if ( a.chrome.compare("X") == 0 )
54 a.chrome = "23";
55 if ( a.chrome.compare("Y") == 0 )
56 a.chrome = "24";
57
58 if ( a.chrome.compare("23") == 0 )
59 a.chrome = "chrX";
60 else if ( a.chrome.compare("24") == 0 )
61 a.chrome = "chrY";
62 else if ( a.chrome.find("chr") != 0 )
63 a.chrome = string("chr") + a.chrome;
64
65 if(seg->find(a.target) == seg->end()) {
66 map<string, list<segment> > l;
67 (*seg)[ a.target ] = l;
68 }
69 if ( (*seg)[ a.target ].find( a.chrome ) == (*seg)[ a.target ].end() ) {
70 list<segment> l;
71 (*seg)[ a.target ][ a.chrome ] = l;
72 }
73 (*seg)[ a.target ][a.chrome].push_back( a );
74 targetSet->insert( a.target );
75 }
76 }
77
78
79
80 void print_matrix(segmap *data, set<string> *targetSet, void (*print)(const char *)) {
81
82 //create a break map
83 breakmap breaks;
84 for( segmap::iterator t = data->begin(); t!=data->end(); ++t) {
85 for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) {
86 for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) {
87 if ( breaks.find( s->chrome ) == breaks.end() ) {
88 set<int> ns;
89 ns.insert( s->start );
90 ns.insert( s->end );
91 breaks[ s->chrome ] = ns;
92 } else {
93 breaks[ s->chrome ].insert( s->start );
94 breaks[ s->chrome ].insert( s->end );
95 }
96 }
97 }
98 }
99
100 //print out the matrix column names
101 (*print)( "probe" );
102 for ( set<string>::iterator ts = targetSet->begin(); ts != targetSet->end(); ++ts ) {
103 (*print)( "\t" );
104 (*print)( (*ts).c_str() );
105 }
106 (*print)( "\n" );
107 int targetCount = targetSet->size();
108 //check breakmap for each chrome
109 for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) {
110 vector<int> starts;
111 vector<int> ends;
112 vector<string> probeName;
113
114 breakmap::iterator cset = breaks.find( b->first );
115 int blockCount = cset->second.size() - 1;
116 starts.resize( blockCount );
117 ends.resize( blockCount );
118 probeName.resize( blockCount );
119 int i = 0;
120 for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) {
121 if ( i > 0 ) {
122 ends[i-1] = (*v);
123 }
124 if ( i < blockCount ) {
125 starts[ i ] = (*v);
126 }
127 i++;
128 }
129
130 //create names for the breakpoints
131 genemap gm;
132 gm.resize( blockCount );
133 for ( int i = 0; i < blockCount; i++ ) {
134 stringstream name;
135 name << b->first;
136 name << "_";
137 name << starts[i];
138 name << "_";
139 name << ends[i];
140 probeName[i] = name.str();
141 gm[ i ].resize( targetCount );
142 }
143 //scan the targets
144 int curTarget = 0;
145 for ( set<string>::iterator t = targetSet->begin(); t != targetSet->end(); ++t ) {
146 //find the target values for the current chrome
147 segmap::iterator s = data->find( (*t) );
148 chromemap::iterator ss = s->second.find( b->first );
149 if ( ss != s->second.end() ) {
150 vector<float> vals;
151 vals.resize( blockCount );
152 for ( int i = 0; i < blockCount; i++ ) {
153 vals[ i ] = MISSING_VAL;
154 }
155 //if the segment overlaps the break, assign the value
156 for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) {
157 for ( int i = 0; i < blockCount; i++ ) {
158 if ( cs->end > starts[i] && cs->start < ends[i] ) {
159 vals[i] = cs->value;
160 }
161 }
162 }
163 //assign the values to the named map
164 for ( int i = 0; i < blockCount; i++ ) {
165 gm[ i ][ curTarget ] = vals[ i ];
166 }
167 //cout << s->first << "\t" << ss->first << "\n";
168 }
169 curTarget++;
170 }
171 //print out this chrome's segments
172 //and the values for each target
173 for ( int i =0; i < blockCount; i++ ) {
174 print(probeName[i].c_str());
175 for ( int j = 0; j < targetCount; j++ ) {
176 float val = gm[i][j];
177 print("\t");
178 if ( val == MISSING_VAL )
179 print("NA");
180 else {
181 char str[20] = "";
182 sprintf(str, "%f", val);
183 print(str);
184 }
185 }
186 print("\n");
187 }
188 }
189
190 }
191
192
193 /*
194 * This program read's the stdin looking for a segment format of
195 * <targetname> <chrome_number> <chrome_start> <chrome_end>
196 * breaks the segments into discreate probes (so segments on different targets
197 * have the same probe name) and creates a matrix of probe values for each target
198 */
199
200
201 int main(int argc, char** argv) {
202
203 segmap data;
204 set<string> targetSet;
205
206 //load in segment map
207 do {
208 string tmp;
209 segment a;
210 cin >> a.target;
211 cin >> a.chrome;
212 cin >> tmp;
213 a.start = atoi( tmp.c_str() );
214 cin >> tmp;
215 a.end = atoi( tmp.c_str() );
216 cin >> tmp;
217 a.value = atof( tmp.c_str() );
218 if ( a.target.size() > 0 ) {
219 if ( a.chrome.compare("X") == 0 )
220 a.chrome = "23";
221 if ( a.chrome.compare("Y") == 0 )
222 a.chrome = "24";
223
224 if ( a.chrome.compare("23") == 0 )
225 a.chrome = "chrX";
226 else if ( a.chrome.compare("24") == 0 )
227 a.chrome = "chrY";
228 else
229 a.chrome = string("chr") + a.chrome;
230
231 if(data.find(a.target) == data.end()) {
232 map<string, list<segment> > l;
233 data[ a.target ] = l;
234 }
235 if ( data[ a.target ].find( a.chrome ) == data[ a.target ].end() ) {
236 list<segment> l;
237 data[ a.target ][ a.chrome ] = l;
238 }
239 data[ a.target ][a.chrome].push_back( a );
240 targetSet.insert( a.target );
241 }
242 } while ( cin );
243
244 //create a break map
245 breakmap breaks;
246 for( segmap::iterator t = data.begin(); t!=data.end(); ++t) {
247 for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) {
248 for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) {
249 if ( breaks.find( s->chrome ) == breaks.end() ) {
250 set<int> ns;
251 ns.insert( s->start );
252 ns.insert( s->end );
253 breaks[ s->chrome ] = ns;
254 } else {
255 breaks[ s->chrome ].insert( s->start );
256 breaks[ s->chrome ].insert( s->end );
257 }
258 }
259 }
260 }
261
262 //print out the matrix column names
263 cout << "probe";
264 for ( set<string>::iterator ts = targetSet.begin(); ts != targetSet.end(); ++ts ) {
265 cout << "\t";
266 cout << (*ts);
267 }
268 cout << "\n";
269 int targetCount = targetSet.size();
270 //check breakmap for each chrome
271 for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) {
272 cerr << b->first << "\n";
273 vector<int> starts;
274 vector<int> ends;
275 vector<string> probeName;
276
277 breakmap::iterator cset = breaks.find( b->first );
278 int blockCount = cset->second.size() - 1;
279 starts.resize( blockCount );
280 ends.resize( blockCount );
281 probeName.resize( blockCount );
282 int i = 0;
283 for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) {
284 if ( i > 0 ) {
285 ends[i-1] = (*v);
286 }
287 if ( i < blockCount ) {
288 starts[ i ] = (*v);
289 }
290 i++;
291 }
292
293 //create names for the breakpoints
294 genemap gm;
295 gm.resize( blockCount );
296 for ( int i = 0; i < blockCount; i++ ) {
297 stringstream name;
298 name << b->first;
299 name << "_";
300 name << starts[i];
301 name << "_";
302 name << ends[i];
303 probeName[i] = name.str();
304 gm[ i ].resize( targetCount );
305 }
306 //scan the targets
307 int curTarget = 0;
308 for ( set<string>::iterator t = targetSet.begin(); t != targetSet.end(); ++t ) {
309 //find the target values for the current chrome
310 segmap::iterator s = data.find( (*t) );
311 chromemap::iterator ss = s->second.find( b->first );
312 if ( ss != s->second.end() ) {
313 vector<float> vals;
314 vals.resize( blockCount );
315 for ( int i = 0; i < blockCount; i++ ) {
316 vals[ i ] = MISSING_VAL;
317 }
318 //if the segment overlaps the break, assign the value
319 for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) {
320 for ( int i = 0; i < blockCount; i++ ) {
321 if ( cs->end > starts[i] && cs->start < ends[i] ) {
322 vals[i] = cs->value;
323 }
324 }
325 }
326 //assign the values to the named map
327 for ( int i = 0; i < blockCount; i++ ) {
328 gm[ i ][ curTarget ] = vals[ i ];
329 }
330 //cout << s->first << "\t" << ss->first << "\n";
331 }
332 curTarget++;
333 }
334 //print out this chrome's segments
335 //and the values for each target
336 for ( int i =0; i < blockCount; i++ ) {
337 cout << probeName[i];
338 for ( int j = 0; j < targetCount; j++ ) {
339 float val = gm[i][j];
340 cout << "\t";
341 if ( val == MISSING_VAL )
342 cout << "null";
343 else
344 cout << val;
345 }
346 cout << "\n";
347 }
348 }
349
350 return 0;
351 }
352
353 #ifdef __cplusplus
354 }
355 #endif