Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison seg2matrix/CGData/CsegToMatrix.cc @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children | 59dbe857f5d4 |
comparison
equal
deleted
inserted
replaced
30:7a7a52e9b019 | 31:ab20c0d04f4a |
---|---|
1 | |
2 #include <map> | |
3 #include <iostream> | |
4 #include <list> | |
5 #include <set> | |
6 #include <vector> | |
7 #include <sstream> | |
8 #include <stdlib.h> | |
9 #include <stdio.h> | |
10 #include <string.h> | |
11 | |
12 #ifdef __cplusplus | |
13 extern "C" { | |
14 #endif | |
15 | |
16 using namespace std; | |
17 | |
18 #define MISSING_VAL -99 | |
19 | |
20 class segment { | |
21 public: | |
22 string target; | |
23 string chrome; | |
24 int start, end; | |
25 float value; | |
26 }; | |
27 | |
28 typedef map<string,list<segment> > chromemap; | |
29 typedef map< string, chromemap > segmap; | |
30 typedef map<string,set<int> > breakmap; | |
31 typedef vector<vector<float> > genemap; | |
32 | |
33 | |
34 segmap * new_segment() { | |
35 return new segmap(); | |
36 } | |
37 | |
38 set<string> * new_target_set() { | |
39 return new set<string>(); | |
40 } | |
41 | |
42 void add_segment(segmap * seg, set<string> *targetSet, char *sample, char *chrom, int chrom_start, int chrom_end, float value) { | |
43 string tmp; | |
44 segment a; | |
45 | |
46 a.target = strdup(sample); | |
47 a.chrome = strdup(chrom); | |
48 a.start = chrom_start; | |
49 a.end = chrom_end; | |
50 a.value = value; | |
51 | |
52 if ( a.target.size() > 0 ) { | |
53 if ( a.chrome.compare("X") == 0 ) | |
54 a.chrome = "23"; | |
55 if ( a.chrome.compare("Y") == 0 ) | |
56 a.chrome = "24"; | |
57 | |
58 if ( a.chrome.compare("23") == 0 ) | |
59 a.chrome = "chrX"; | |
60 else if ( a.chrome.compare("24") == 0 ) | |
61 a.chrome = "chrY"; | |
62 else if ( a.chrome.find("chr") != 0 ) | |
63 a.chrome = string("chr") + a.chrome; | |
64 | |
65 if(seg->find(a.target) == seg->end()) { | |
66 map<string, list<segment> > l; | |
67 (*seg)[ a.target ] = l; | |
68 } | |
69 if ( (*seg)[ a.target ].find( a.chrome ) == (*seg)[ a.target ].end() ) { | |
70 list<segment> l; | |
71 (*seg)[ a.target ][ a.chrome ] = l; | |
72 } | |
73 (*seg)[ a.target ][a.chrome].push_back( a ); | |
74 targetSet->insert( a.target ); | |
75 } | |
76 } | |
77 | |
78 | |
79 | |
80 void print_matrix(segmap *data, set<string> *targetSet, void (*print)(const char *)) { | |
81 | |
82 //create a break map | |
83 breakmap breaks; | |
84 for( segmap::iterator t = data->begin(); t!=data->end(); ++t) { | |
85 for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) { | |
86 for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) { | |
87 if ( breaks.find( s->chrome ) == breaks.end() ) { | |
88 set<int> ns; | |
89 ns.insert( s->start ); | |
90 ns.insert( s->end ); | |
91 breaks[ s->chrome ] = ns; | |
92 } else { | |
93 breaks[ s->chrome ].insert( s->start ); | |
94 breaks[ s->chrome ].insert( s->end ); | |
95 } | |
96 } | |
97 } | |
98 } | |
99 | |
100 //print out the matrix column names | |
101 (*print)( "probe" ); | |
102 for ( set<string>::iterator ts = targetSet->begin(); ts != targetSet->end(); ++ts ) { | |
103 (*print)( "\t" ); | |
104 (*print)( (*ts).c_str() ); | |
105 } | |
106 (*print)( "\n" ); | |
107 int targetCount = targetSet->size(); | |
108 //check breakmap for each chrome | |
109 for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) { | |
110 vector<int> starts; | |
111 vector<int> ends; | |
112 vector<string> probeName; | |
113 | |
114 breakmap::iterator cset = breaks.find( b->first ); | |
115 int blockCount = cset->second.size() - 1; | |
116 starts.resize( blockCount ); | |
117 ends.resize( blockCount ); | |
118 probeName.resize( blockCount ); | |
119 int i = 0; | |
120 for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) { | |
121 if ( i > 0 ) { | |
122 ends[i-1] = (*v); | |
123 } | |
124 if ( i < blockCount ) { | |
125 starts[ i ] = (*v); | |
126 } | |
127 i++; | |
128 } | |
129 | |
130 //create names for the breakpoints | |
131 genemap gm; | |
132 gm.resize( blockCount ); | |
133 for ( int i = 0; i < blockCount; i++ ) { | |
134 stringstream name; | |
135 name << b->first; | |
136 name << "_"; | |
137 name << starts[i]; | |
138 name << "_"; | |
139 name << ends[i]; | |
140 probeName[i] = name.str(); | |
141 gm[ i ].resize( targetCount ); | |
142 } | |
143 //scan the targets | |
144 int curTarget = 0; | |
145 for ( set<string>::iterator t = targetSet->begin(); t != targetSet->end(); ++t ) { | |
146 //find the target values for the current chrome | |
147 segmap::iterator s = data->find( (*t) ); | |
148 chromemap::iterator ss = s->second.find( b->first ); | |
149 if ( ss != s->second.end() ) { | |
150 vector<float> vals; | |
151 vals.resize( blockCount ); | |
152 for ( int i = 0; i < blockCount; i++ ) { | |
153 vals[ i ] = MISSING_VAL; | |
154 } | |
155 //if the segment overlaps the break, assign the value | |
156 for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) { | |
157 for ( int i = 0; i < blockCount; i++ ) { | |
158 if ( cs->end > starts[i] && cs->start < ends[i] ) { | |
159 vals[i] = cs->value; | |
160 } | |
161 } | |
162 } | |
163 //assign the values to the named map | |
164 for ( int i = 0; i < blockCount; i++ ) { | |
165 gm[ i ][ curTarget ] = vals[ i ]; | |
166 } | |
167 //cout << s->first << "\t" << ss->first << "\n"; | |
168 } | |
169 curTarget++; | |
170 } | |
171 //print out this chrome's segments | |
172 //and the values for each target | |
173 for ( int i =0; i < blockCount; i++ ) { | |
174 print(probeName[i].c_str()); | |
175 for ( int j = 0; j < targetCount; j++ ) { | |
176 float val = gm[i][j]; | |
177 print("\t"); | |
178 if ( val == MISSING_VAL ) | |
179 print("NA"); | |
180 else { | |
181 char str[20] = ""; | |
182 sprintf(str, "%f", val); | |
183 print(str); | |
184 } | |
185 } | |
186 print("\n"); | |
187 } | |
188 } | |
189 | |
190 } | |
191 | |
192 | |
193 /* | |
194 * This program read's the stdin looking for a segment format of | |
195 * <targetname> <chrome_number> <chrome_start> <chrome_end> | |
196 * breaks the segments into discreate probes (so segments on different targets | |
197 * have the same probe name) and creates a matrix of probe values for each target | |
198 */ | |
199 | |
200 | |
201 int main(int argc, char** argv) { | |
202 | |
203 segmap data; | |
204 set<string> targetSet; | |
205 | |
206 //load in segment map | |
207 do { | |
208 string tmp; | |
209 segment a; | |
210 cin >> a.target; | |
211 cin >> a.chrome; | |
212 cin >> tmp; | |
213 a.start = atoi( tmp.c_str() ); | |
214 cin >> tmp; | |
215 a.end = atoi( tmp.c_str() ); | |
216 cin >> tmp; | |
217 a.value = atof( tmp.c_str() ); | |
218 if ( a.target.size() > 0 ) { | |
219 if ( a.chrome.compare("X") == 0 ) | |
220 a.chrome = "23"; | |
221 if ( a.chrome.compare("Y") == 0 ) | |
222 a.chrome = "24"; | |
223 | |
224 if ( a.chrome.compare("23") == 0 ) | |
225 a.chrome = "chrX"; | |
226 else if ( a.chrome.compare("24") == 0 ) | |
227 a.chrome = "chrY"; | |
228 else | |
229 a.chrome = string("chr") + a.chrome; | |
230 | |
231 if(data.find(a.target) == data.end()) { | |
232 map<string, list<segment> > l; | |
233 data[ a.target ] = l; | |
234 } | |
235 if ( data[ a.target ].find( a.chrome ) == data[ a.target ].end() ) { | |
236 list<segment> l; | |
237 data[ a.target ][ a.chrome ] = l; | |
238 } | |
239 data[ a.target ][a.chrome].push_back( a ); | |
240 targetSet.insert( a.target ); | |
241 } | |
242 } while ( cin ); | |
243 | |
244 //create a break map | |
245 breakmap breaks; | |
246 for( segmap::iterator t = data.begin(); t!=data.end(); ++t) { | |
247 for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) { | |
248 for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) { | |
249 if ( breaks.find( s->chrome ) == breaks.end() ) { | |
250 set<int> ns; | |
251 ns.insert( s->start ); | |
252 ns.insert( s->end ); | |
253 breaks[ s->chrome ] = ns; | |
254 } else { | |
255 breaks[ s->chrome ].insert( s->start ); | |
256 breaks[ s->chrome ].insert( s->end ); | |
257 } | |
258 } | |
259 } | |
260 } | |
261 | |
262 //print out the matrix column names | |
263 cout << "probe"; | |
264 for ( set<string>::iterator ts = targetSet.begin(); ts != targetSet.end(); ++ts ) { | |
265 cout << "\t"; | |
266 cout << (*ts); | |
267 } | |
268 cout << "\n"; | |
269 int targetCount = targetSet.size(); | |
270 //check breakmap for each chrome | |
271 for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) { | |
272 cerr << b->first << "\n"; | |
273 vector<int> starts; | |
274 vector<int> ends; | |
275 vector<string> probeName; | |
276 | |
277 breakmap::iterator cset = breaks.find( b->first ); | |
278 int blockCount = cset->second.size() - 1; | |
279 starts.resize( blockCount ); | |
280 ends.resize( blockCount ); | |
281 probeName.resize( blockCount ); | |
282 int i = 0; | |
283 for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) { | |
284 if ( i > 0 ) { | |
285 ends[i-1] = (*v); | |
286 } | |
287 if ( i < blockCount ) { | |
288 starts[ i ] = (*v); | |
289 } | |
290 i++; | |
291 } | |
292 | |
293 //create names for the breakpoints | |
294 genemap gm; | |
295 gm.resize( blockCount ); | |
296 for ( int i = 0; i < blockCount; i++ ) { | |
297 stringstream name; | |
298 name << b->first; | |
299 name << "_"; | |
300 name << starts[i]; | |
301 name << "_"; | |
302 name << ends[i]; | |
303 probeName[i] = name.str(); | |
304 gm[ i ].resize( targetCount ); | |
305 } | |
306 //scan the targets | |
307 int curTarget = 0; | |
308 for ( set<string>::iterator t = targetSet.begin(); t != targetSet.end(); ++t ) { | |
309 //find the target values for the current chrome | |
310 segmap::iterator s = data.find( (*t) ); | |
311 chromemap::iterator ss = s->second.find( b->first ); | |
312 if ( ss != s->second.end() ) { | |
313 vector<float> vals; | |
314 vals.resize( blockCount ); | |
315 for ( int i = 0; i < blockCount; i++ ) { | |
316 vals[ i ] = MISSING_VAL; | |
317 } | |
318 //if the segment overlaps the break, assign the value | |
319 for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) { | |
320 for ( int i = 0; i < blockCount; i++ ) { | |
321 if ( cs->end > starts[i] && cs->start < ends[i] ) { | |
322 vals[i] = cs->value; | |
323 } | |
324 } | |
325 } | |
326 //assign the values to the named map | |
327 for ( int i = 0; i < blockCount; i++ ) { | |
328 gm[ i ][ curTarget ] = vals[ i ]; | |
329 } | |
330 //cout << s->first << "\t" << ss->first << "\n"; | |
331 } | |
332 curTarget++; | |
333 } | |
334 //print out this chrome's segments | |
335 //and the values for each target | |
336 for ( int i =0; i < blockCount; i++ ) { | |
337 cout << probeName[i]; | |
338 for ( int j = 0; j < targetCount; j++ ) { | |
339 float val = gm[i][j]; | |
340 cout << "\t"; | |
341 if ( val == MISSING_VAL ) | |
342 cout << "null"; | |
343 else | |
344 cout << val; | |
345 } | |
346 cout << "\n"; | |
347 } | |
348 } | |
349 | |
350 return 0; | |
351 } | |
352 | |
353 #ifdef __cplusplus | |
354 } | |
355 #endif |