Mercurial > repos > melissacline > ucsc_cancer_utilities
view seg2matrix/CGData/CsegToMatrix.cc @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children | 59dbe857f5d4 |
line wrap: on
line source
#include <map> #include <iostream> #include <list> #include <set> #include <vector> #include <sstream> #include <stdlib.h> #include <stdio.h> #include <string.h> #ifdef __cplusplus extern "C" { #endif using namespace std; #define MISSING_VAL -99 class segment { public: string target; string chrome; int start, end; float value; }; typedef map<string,list<segment> > chromemap; typedef map< string, chromemap > segmap; typedef map<string,set<int> > breakmap; typedef vector<vector<float> > genemap; segmap * new_segment() { return new segmap(); } set<string> * new_target_set() { return new set<string>(); } void add_segment(segmap * seg, set<string> *targetSet, char *sample, char *chrom, int chrom_start, int chrom_end, float value) { string tmp; segment a; a.target = strdup(sample); a.chrome = strdup(chrom); a.start = chrom_start; a.end = chrom_end; a.value = value; if ( a.target.size() > 0 ) { if ( a.chrome.compare("X") == 0 ) a.chrome = "23"; if ( a.chrome.compare("Y") == 0 ) a.chrome = "24"; if ( a.chrome.compare("23") == 0 ) a.chrome = "chrX"; else if ( a.chrome.compare("24") == 0 ) a.chrome = "chrY"; else if ( a.chrome.find("chr") != 0 ) a.chrome = string("chr") + a.chrome; if(seg->find(a.target) == seg->end()) { map<string, list<segment> > l; (*seg)[ a.target ] = l; } if ( (*seg)[ a.target ].find( a.chrome ) == (*seg)[ a.target ].end() ) { list<segment> l; (*seg)[ a.target ][ a.chrome ] = l; } (*seg)[ a.target ][a.chrome].push_back( a ); targetSet->insert( a.target ); } } void print_matrix(segmap *data, set<string> *targetSet, void (*print)(const char *)) { //create a break map breakmap breaks; for( segmap::iterator t = data->begin(); t!=data->end(); ++t) { for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) { for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) { if ( breaks.find( s->chrome ) == breaks.end() ) { set<int> ns; ns.insert( s->start ); ns.insert( s->end ); breaks[ s->chrome ] = ns; } else { breaks[ s->chrome ].insert( s->start ); breaks[ s->chrome ].insert( s->end ); } } } } //print out the matrix column names (*print)( "probe" ); for ( set<string>::iterator ts = targetSet->begin(); ts != targetSet->end(); ++ts ) { (*print)( "\t" ); (*print)( (*ts).c_str() ); } (*print)( "\n" ); int targetCount = targetSet->size(); //check breakmap for each chrome for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) { vector<int> starts; vector<int> ends; vector<string> probeName; breakmap::iterator cset = breaks.find( b->first ); int blockCount = cset->second.size() - 1; starts.resize( blockCount ); ends.resize( blockCount ); probeName.resize( blockCount ); int i = 0; for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) { if ( i > 0 ) { ends[i-1] = (*v); } if ( i < blockCount ) { starts[ i ] = (*v); } i++; } //create names for the breakpoints genemap gm; gm.resize( blockCount ); for ( int i = 0; i < blockCount; i++ ) { stringstream name; name << b->first; name << "_"; name << starts[i]; name << "_"; name << ends[i]; probeName[i] = name.str(); gm[ i ].resize( targetCount ); } //scan the targets int curTarget = 0; for ( set<string>::iterator t = targetSet->begin(); t != targetSet->end(); ++t ) { //find the target values for the current chrome segmap::iterator s = data->find( (*t) ); chromemap::iterator ss = s->second.find( b->first ); if ( ss != s->second.end() ) { vector<float> vals; vals.resize( blockCount ); for ( int i = 0; i < blockCount; i++ ) { vals[ i ] = MISSING_VAL; } //if the segment overlaps the break, assign the value for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) { for ( int i = 0; i < blockCount; i++ ) { if ( cs->end > starts[i] && cs->start < ends[i] ) { vals[i] = cs->value; } } } //assign the values to the named map for ( int i = 0; i < blockCount; i++ ) { gm[ i ][ curTarget ] = vals[ i ]; } //cout << s->first << "\t" << ss->first << "\n"; } curTarget++; } //print out this chrome's segments //and the values for each target for ( int i =0; i < blockCount; i++ ) { print(probeName[i].c_str()); for ( int j = 0; j < targetCount; j++ ) { float val = gm[i][j]; print("\t"); if ( val == MISSING_VAL ) print("NA"); else { char str[20] = ""; sprintf(str, "%f", val); print(str); } } print("\n"); } } } /* * This program read's the stdin looking for a segment format of * <targetname> <chrome_number> <chrome_start> <chrome_end> * breaks the segments into discreate probes (so segments on different targets * have the same probe name) and creates a matrix of probe values for each target */ int main(int argc, char** argv) { segmap data; set<string> targetSet; //load in segment map do { string tmp; segment a; cin >> a.target; cin >> a.chrome; cin >> tmp; a.start = atoi( tmp.c_str() ); cin >> tmp; a.end = atoi( tmp.c_str() ); cin >> tmp; a.value = atof( tmp.c_str() ); if ( a.target.size() > 0 ) { if ( a.chrome.compare("X") == 0 ) a.chrome = "23"; if ( a.chrome.compare("Y") == 0 ) a.chrome = "24"; if ( a.chrome.compare("23") == 0 ) a.chrome = "chrX"; else if ( a.chrome.compare("24") == 0 ) a.chrome = "chrY"; else a.chrome = string("chr") + a.chrome; if(data.find(a.target) == data.end()) { map<string, list<segment> > l; data[ a.target ] = l; } if ( data[ a.target ].find( a.chrome ) == data[ a.target ].end() ) { list<segment> l; data[ a.target ][ a.chrome ] = l; } data[ a.target ][a.chrome].push_back( a ); targetSet.insert( a.target ); } } while ( cin ); //create a break map breakmap breaks; for( segmap::iterator t = data.begin(); t!=data.end(); ++t) { for ( chromemap::iterator c = t->second.begin(); c != t->second.end(); ++c) { for ( list<segment>::iterator s = c->second.begin(); s != c->second.end(); s++ ) { if ( breaks.find( s->chrome ) == breaks.end() ) { set<int> ns; ns.insert( s->start ); ns.insert( s->end ); breaks[ s->chrome ] = ns; } else { breaks[ s->chrome ].insert( s->start ); breaks[ s->chrome ].insert( s->end ); } } } } //print out the matrix column names cout << "probe"; for ( set<string>::iterator ts = targetSet.begin(); ts != targetSet.end(); ++ts ) { cout << "\t"; cout << (*ts); } cout << "\n"; int targetCount = targetSet.size(); //check breakmap for each chrome for ( breakmap::iterator b = breaks.begin(); b != breaks.end(); ++b ) { cerr << b->first << "\n"; vector<int> starts; vector<int> ends; vector<string> probeName; breakmap::iterator cset = breaks.find( b->first ); int blockCount = cset->second.size() - 1; starts.resize( blockCount ); ends.resize( blockCount ); probeName.resize( blockCount ); int i = 0; for ( set<int>::iterator v = cset->second.begin(); v != cset->second.end(); ++v ) { if ( i > 0 ) { ends[i-1] = (*v); } if ( i < blockCount ) { starts[ i ] = (*v); } i++; } //create names for the breakpoints genemap gm; gm.resize( blockCount ); for ( int i = 0; i < blockCount; i++ ) { stringstream name; name << b->first; name << "_"; name << starts[i]; name << "_"; name << ends[i]; probeName[i] = name.str(); gm[ i ].resize( targetCount ); } //scan the targets int curTarget = 0; for ( set<string>::iterator t = targetSet.begin(); t != targetSet.end(); ++t ) { //find the target values for the current chrome segmap::iterator s = data.find( (*t) ); chromemap::iterator ss = s->second.find( b->first ); if ( ss != s->second.end() ) { vector<float> vals; vals.resize( blockCount ); for ( int i = 0; i < blockCount; i++ ) { vals[ i ] = MISSING_VAL; } //if the segment overlaps the break, assign the value for ( list<segment>::iterator cs = ss->second.begin(); cs != ss->second.end(); cs++ ) { for ( int i = 0; i < blockCount; i++ ) { if ( cs->end > starts[i] && cs->start < ends[i] ) { vals[i] = cs->value; } } } //assign the values to the named map for ( int i = 0; i < blockCount; i++ ) { gm[ i ][ curTarget ] = vals[ i ]; } //cout << s->first << "\t" << ss->first << "\n"; } curTarget++; } //print out this chrome's segments //and the values for each target for ( int i =0; i < blockCount; i++ ) { cout << probeName[i]; for ( int j = 0; j < targetCount; j++ ) { float val = gm[i][j]; cout << "\t"; if ( val == MISSING_VAL ) cout << "null"; else cout << val; } cout << "\n"; } } return 0; } #ifdef __cplusplus } #endif