Mercurial > repos > devteam > mapping_to_ucsc
diff mapping_to_ucsc.py @ 0:cf8d47e49c21 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 10:59:47 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mapping_to_ucsc.py Mon May 19 10:59:47 2014 -0400 @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +import sys, tempfile, os + +assert sys.version_info[:2] >= (2.4) + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +def main(): + + out_fname = sys.argv[1] + in_fname = sys.argv[2] + chr_col = int(sys.argv[3])-1 + coord_col = int(sys.argv[4])-1 + track_type = sys.argv[5] + if track_type == 'coverage' or track_type == 'both': + coverage_col = int(sys.argv[6])-1 + cname = sys.argv[7] + cdescription = sys.argv[8] + ccolor = sys.argv[9].replace('-',',') + cvisibility = sys.argv[10] + if track_type == 'snp' or track_type == 'both': + if track_type == 'both': + j = 5 + else: + j = 0 + #sname = sys.argv[7+j] + sdescription = sys.argv[6+j] + svisibility = sys.argv[7+j] + #ref_col = int(sys.argv[10+j])-1 + read_col = int(sys.argv[8+j])-1 + + + # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically) + sorted_infile = tempfile.NamedTemporaryFile() + try: + os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname)) + except Exception, exc: + stop_err( 'Initialization error -> %s' %str(exc) ) + + #generate chr list + sorted_infile.seek(0) + chr_vals = [] + for line in file( sorted_infile.name ): + line = line.strip() + if not(line): + continue + try: + fields = line.split('\t') + chr = fields[chr_col] + if chr not in chr_vals: + chr_vals.append(chr) + except: + pass + if not(chr_vals): + stop_err("Skipped all lines as invalid.") + + if track_type == 'coverage' or track_type == 'both': + if track_type == 'coverage': + fout = open( out_fname, "w" ) + else: + fout = tempfile.NamedTemporaryFile() + fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( cname, cdescription, ccolor, cvisibility )) + if track_type == 'snp' or track_type == 'both': + fout_a = tempfile.NamedTemporaryFile() + fout_t = tempfile.NamedTemporaryFile() + fout_g = tempfile.NamedTemporaryFile() + fout_c = tempfile.NamedTemporaryFile() + fout_ref = tempfile.NamedTemporaryFile() + + fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track A", sdescription, '255,0,0', svisibility )) + fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track T", sdescription, '0,255,0', svisibility )) + fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track G", sdescription, '0,0,255', svisibility )) + fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track C", sdescription, '255,0,255', svisibility )) + + + sorted_infile.seek(0) + for line in file( sorted_infile.name ): + line = line.strip() + if not(line): + continue + try: + fields = line.split('\t') + chr = fields[chr_col] + start = int(fields[coord_col]) + assert start > 0 + except: + continue + try: + ind = chr_vals.index(chr) #encountered chr for the 1st time + del chr_vals[ind] + prev_start = '' + header = "variableStep chrom=%s\n" %(chr) + if track_type == 'coverage' or track_type == 'both': + coverage = int(fields[coverage_col]) + line1 = "%s\t%s\n" %(start,coverage) + fout.write("%s%s" %(header,line1)) + if track_type == 'snp' or track_type == 'both': + a = t = g = c = 0 + fout_a.write("%s" %(header)) + fout_t.write("%s" %(header)) + fout_g.write("%s" %(header)) + fout_c.write("%s" %(header)) + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + prev_start = start + except ValueError: + if start != prev_start: + if track_type == 'coverage' or track_type == 'both': + coverage = int(fields[coverage_col]) + fout.write("%s\t%s\n" %(start,coverage)) + if track_type == 'snp' or track_type == 'both': + if a: + fout_a.write("%s\t%s\n" %(prev_start,a)) + if t: + fout_t.write("%s\t%s\n" %(prev_start,t)) + if g: + fout_g.write("%s\t%s\n" %(prev_start,g)) + if c: + fout_c.write("%s\t%s\n" %(prev_start,c)) + a = t = g = c = 0 + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + prev_start = start + else: + if track_type == 'snp' or track_type == 'both': + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + + if track_type == 'snp' or track_type == 'both': + if a: + fout_a.write("%s\t%s\n" %(prev_start,a)) + if t: + fout_t.write("%s\t%s\n" %(prev_start,t)) + if g: + fout_g.write("%s\t%s\n" %(prev_start,g)) + if c: + fout_c.write("%s\t%s\n" %(prev_start,c)) + + fout_a.seek(0) + fout_g.seek(0) + fout_t.seek(0) + fout_c.seek(0) + + if track_type == 'snp': + os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) + elif track_type == 'both': + fout.seek(0) + os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) +if __name__ == "__main__": + main() \ No newline at end of file