0
|
1 #!/usr/bin/python
|
|
2 # python parser module for for readmaps and size distributions, guided by GFF3
|
|
3 # version 0.9.1 (1-6-2014)
|
|
4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
|
|
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
|
|
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
|
|
7
|
|
8 import sys, subprocess, argparse
|
|
9 from smRtools import *
|
|
10 from collections import OrderedDict, defaultdict
|
|
11 import os
|
|
12
|
|
13 def Parser():
|
|
14 the_parser = argparse.ArgumentParser()
|
|
15 the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe")
|
|
16 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
|
|
17 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file")
|
|
18 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
|
|
19 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
|
|
20 the_parser.add_argument('--ext',nargs='+', help="input file type")
|
|
21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
|
|
22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
|
|
23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
|
|
24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
|
|
25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
|
|
26 the_parser.add_argument('--rcode', type=str, help="R script")
|
|
27 args = the_parser.parse_args()
|
|
28 return args
|
|
29
|
|
30 args=Parser()
|
|
31 if args.reference_fasta:
|
|
32 genomeRefFormat = "fastaSource"
|
|
33 genomeRefFile = args.reference_fasta
|
|
34 if args.reference_bowtie_index:
|
|
35 genomeRefFormat = "bowtieIndex"
|
|
36 genomeRefFile = args.reference_bowtie_index
|
|
37 readmap_file=args.output_readmap
|
|
38 size_distribution_file=args.output_size_distribution
|
|
39 minquery=args.minquery
|
|
40 maxquery=args.maxquery
|
|
41 Rcode = args.rcode
|
|
42 filePath=args.input
|
|
43 fileExt=args.ext
|
|
44 fileLabel=args.label
|
|
45 normalization_factor=args.normalization_factor
|
|
46
|
|
47 MasterListOfGenomes = OrderedDict()
|
|
48
|
|
49 def process_samples(filePath):
|
|
50 for i, filePath in enumerate(filePath):
|
|
51 norm=normalization_factor[i]
|
|
52 print fileLabel[i]
|
|
53 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
|
|
54 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
|
|
55 return MasterListOfGenomes
|
|
56
|
|
57 def dataframe_sanityzer (listofdatalines):
|
|
58 Dict = defaultdict(float)
|
|
59 for line in listofdatalines:
|
|
60 fields= line.split("\t")
|
|
61 Dict[fields[0]] += float (fields[2])
|
|
62 filtered_list = []
|
|
63 for line in listofdatalines:
|
|
64 fields= line.split("\t")
|
|
65 if Dict[fields[0]] != 0:
|
|
66 filtered_list.append(line)
|
|
67 return filtered_list
|
|
68
|
|
69 def write_readplot_dataframe(readDict, readmap_file):
|
|
70 listoflines = []
|
|
71 with open(readmap_file, 'w') as readmap:
|
|
72 print >>readmap, "gene\tcoord\tcount\tpolarity\tsample"
|
|
73 for sample in readDict.keys():
|
|
74 if args.gff:
|
|
75 dict=readDict[sample]
|
|
76 else:
|
|
77 dict=readDict[sample].instanceDict
|
|
78 for gene in dict.keys():
|
|
79 plottable = dict[gene].readplot()
|
|
80 for line in plottable:
|
|
81 #print >>readmap, "%s\t%s" % (line, sample)
|
|
82 listoflines.append ("%s\t%s" % (line, sample))
|
|
83 listoflines = dataframe_sanityzer(listoflines)
|
|
84 for line in listoflines:
|
|
85 print >>readmap, line
|
|
86
|
|
87 def write_size_distribution_dataframe(readDict, size_distribution_file):
|
|
88 listoflines = []
|
|
89 with open(size_distribution_file, 'w') as size_distrib:
|
|
90 print >>size_distrib, "gene\tsize\tcount\tpolarity\tsample" # test before was "gene\tpolarity\tsize\tcount\tsample"
|
|
91 for sample in readDict.keys():
|
|
92 if args.gff:
|
|
93 dict=readDict[sample]
|
|
94 else:
|
|
95 dict=readDict[sample].instanceDict
|
|
96 for gene in dict.keys():
|
|
97 histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery)
|
|
98 for polarity in histogram.keys():
|
|
99 if polarity=='both':
|
|
100 continue
|
|
101 #for size in xrange(args.minquery, args.maxquery):
|
|
102 # if not size in histogram[polarity].keys():
|
|
103 # histogram[size]=0
|
|
104 for size, count in histogram[polarity].iteritems():
|
|
105 #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly
|
|
106 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) )
|
|
107 listoflines = dataframe_sanityzer(listoflines)
|
|
108 for line in listoflines:
|
|
109 print >>size_distrib, line
|
|
110
|
|
111 def gff_item_subinstances(readDict, gff3):
|
|
112 GFFinstanceDict=OrderedDict()
|
|
113 for sample in readDict.keys():
|
|
114 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class)
|
|
115 with open(gff3) as gff:
|
|
116 for line in gff:
|
|
117 if line[0] == "#": continue
|
|
118 gff_fields = line[:-1].split("\t")
|
|
119 chrom = gff_fields[0]
|
|
120 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
|
|
121 item_upstream_coordinate = int(gff_fields[3])
|
|
122 item_downstream_coordinate = int(gff_fields[4])
|
|
123 item_polarity = gff_fields[6]
|
|
124 for sample in readDict.keys():
|
|
125 ## this is not required anymore but test
|
|
126 # if not GFFinstanceDict.has_key(sample):
|
|
127 # GFFinstanceDict[sample]={}
|
|
128 ####
|
|
129 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
|
|
130 if item_polarity == '-':
|
|
131 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
|
|
132 subinstance.gene=gff_name
|
|
133 GFFinstanceDict[sample][gff_name]=subinstance
|
|
134 return GFFinstanceDict
|
|
135
|
|
136 MasterListOfGenomes=process_samples(filePath)
|
|
137
|
|
138 if args.gff:
|
|
139 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
|
|
140
|
|
141 write_readplot_dataframe(MasterListOfGenomes, readmap_file)
|
|
142 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file)
|
|
143
|
|
144 R_command="Rscript "+ Rcode
|
|
145 process = subprocess.Popen(R_command.split())
|
|
146 process.wait()
|
|
147
|