Mercurial > repos > drosofff > msp_sr_readmap_and_size_histograms
annotate readmap.py @ 18:893560ece89f draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
| author | mvdbeek |
|---|---|
| date | Wed, 03 Feb 2016 11:34:35 -0500 |
| parents | a118d511a27c |
| children | f75315939afe |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/python |
| 2 # python parser module for for readmaps and size distributions, guided by GFF3 | |
| 3 # version 0.9.1 (1-6-2014) | |
| 4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> | |
| 5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> | |
| 6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> | |
| 7 | |
| 8 import sys, subprocess, argparse | |
| 9 from smRtools import * | |
| 10 from collections import OrderedDict, defaultdict | |
| 11 import os | |
| 12 | |
| 13 def Parser(): | |
| 14 the_parser = argparse.ArgumentParser() | |
| 15 the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe") | |
| 16 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") | |
| 17 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") | |
| 18 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") | |
| 19 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") | |
| 20 the_parser.add_argument('--ext',nargs='+', help="input file type") | |
| 21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | |
| 22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | |
| 23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | |
| 24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | |
| 25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | |
| 26 the_parser.add_argument('--rcode', type=str, help="R script") | |
| 27 args = the_parser.parse_args() | |
| 28 return args | |
| 29 | |
| 30 args=Parser() | |
| 31 if args.reference_fasta: | |
| 32 genomeRefFormat = "fastaSource" | |
| 33 genomeRefFile = args.reference_fasta | |
| 34 if args.reference_bowtie_index: | |
| 35 genomeRefFormat = "bowtieIndex" | |
| 36 genomeRefFile = args.reference_bowtie_index | |
| 37 readmap_file=args.output_readmap | |
| 38 size_distribution_file=args.output_size_distribution | |
| 39 minquery=args.minquery | |
| 40 maxquery=args.maxquery | |
| 41 Rcode = args.rcode | |
| 42 filePath=args.input | |
| 43 fileExt=args.ext | |
| 44 fileLabel=args.label | |
| 45 normalization_factor=args.normalization_factor | |
| 46 | |
| 47 MasterListOfGenomes = OrderedDict() | |
| 48 | |
| 49 def process_samples(filePath): | |
| 50 for i, filePath in enumerate(filePath): | |
| 51 norm=normalization_factor[i] | |
| 52 print fileLabel[i] | |
| 53 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | |
| 54 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | |
| 55 return MasterListOfGenomes | |
| 56 | |
| 57 def dataframe_sanityzer (listofdatalines): | |
| 58 Dict = defaultdict(float) | |
| 59 for line in listofdatalines: | |
| 60 fields= line.split("\t") | |
| 61 Dict[fields[0]] += float (fields[2]) | |
| 62 filtered_list = [] | |
| 63 for line in listofdatalines: | |
| 64 fields= line.split("\t") | |
| 65 if Dict[fields[0]] != 0: | |
| 66 filtered_list.append(line) | |
| 67 return filtered_list | |
| 68 | |
|
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
69 |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
70 def listify_plottable_item(item): |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
71 """ |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
72 plottable is a list of strings: |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
73 'FBti0020401\t78\t-1.0\tR' |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
74 split on tab and return gene, coordinate, count and orientation |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
75 """ |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
76 gene, coordinate, count, orientation = item.split("\t") |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
77 return gene, coordinate, count, orientation |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
78 |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
79 def lookup_gene_length(gene, readDict): |
|
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
80 return readDict[readDict.keys()[0]].instanceDict.values()[0].size |
|
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
81 |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
82 def handle_start_stop_coordinates(plottable, readDict): |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
83 """ |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
84 To ensure that the plot area always includes the correct start and end coordinates, |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
85 we add an entry at start [coordinate 0] and end [last coordinate] of count 0, if these do not exist. |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
86 """ |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
87 first_line = plottable[0] |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
88 last_line = plottable[-1] |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
89 gene, coordinate, count, orientation = listify_plottable_item(first_line) |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
90 if not coordinate == "0": |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
91 new_line = "\t".join([gene, "0", "0", "F"]) |
|
17
a118d511a27c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 032b3f084f3b2a6d42ba476ef55f4de593b58606-dirty
mvdbeek
parents:
16
diff
changeset
|
92 plottable = [new_line] + plottable |
|
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
93 gene_length = str(lookup_gene_length(gene, readDict)) |
|
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
94 gene, coordinate, count, orientation = listify_plottable_item(last_line) |
|
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
95 if not coordinate == gene_length: |
|
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
96 last_line = "\t".join([gene, gene_length, "0", "F"]) |
|
17
a118d511a27c
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 032b3f084f3b2a6d42ba476ef55f4de593b58606-dirty
mvdbeek
parents:
16
diff
changeset
|
97 plottable = plottable + [last_line] |
|
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
98 return plottable |
|
16
70f4385534f9
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit c400846f3ef40a5cbaa373d2ca7c93fb05ecd20c-dirty
mvdbeek
parents:
0
diff
changeset
|
99 |
| 0 | 100 def write_readplot_dataframe(readDict, readmap_file): |
| 101 listoflines = [] | |
| 102 with open(readmap_file, 'w') as readmap: | |
| 103 print >>readmap, "gene\tcoord\tcount\tpolarity\tsample" | |
| 104 for sample in readDict.keys(): | |
| 105 if args.gff: | |
| 106 dict=readDict[sample] | |
| 107 else: | |
| 108 dict=readDict[sample].instanceDict | |
| 109 for gene in dict.keys(): | |
| 110 plottable = dict[gene].readplot() | |
|
18
893560ece89f
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit 9237338d798251fb2667280d597746e852f3ffcc-dirty
mvdbeek
parents:
17
diff
changeset
|
111 plottable = handle_start_stop_coordinates(plottable, readDict) |
| 0 | 112 for line in plottable: |
| 113 #print >>readmap, "%s\t%s" % (line, sample) | |
| 114 listoflines.append ("%s\t%s" % (line, sample)) | |
| 115 listoflines = dataframe_sanityzer(listoflines) | |
| 116 for line in listoflines: | |
| 117 print >>readmap, line | |
| 118 | |
| 119 def write_size_distribution_dataframe(readDict, size_distribution_file): | |
| 120 listoflines = [] | |
| 121 with open(size_distribution_file, 'w') as size_distrib: | |
| 122 print >>size_distrib, "gene\tsize\tcount\tpolarity\tsample" # test before was "gene\tpolarity\tsize\tcount\tsample" | |
| 123 for sample in readDict.keys(): | |
| 124 if args.gff: | |
| 125 dict=readDict[sample] | |
| 126 else: | |
| 127 dict=readDict[sample].instanceDict | |
| 128 for gene in dict.keys(): | |
| 129 histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery) | |
| 130 for polarity in histogram.keys(): | |
| 131 if polarity=='both': | |
| 132 continue | |
| 133 #for size in xrange(args.minquery, args.maxquery): | |
| 134 # if not size in histogram[polarity].keys(): | |
| 135 # histogram[size]=0 | |
| 136 for size, count in histogram[polarity].iteritems(): | |
| 137 #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly | |
| 138 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) | |
| 139 listoflines = dataframe_sanityzer(listoflines) | |
| 140 for line in listoflines: | |
| 141 print >>size_distrib, line | |
| 142 | |
| 143 def gff_item_subinstances(readDict, gff3): | |
| 144 GFFinstanceDict=OrderedDict() | |
| 145 for sample in readDict.keys(): | |
| 146 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) | |
| 147 with open(gff3) as gff: | |
| 148 for line in gff: | |
| 149 if line[0] == "#": continue | |
| 150 gff_fields = line[:-1].split("\t") | |
| 151 chrom = gff_fields[0] | |
| 152 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | |
| 153 item_upstream_coordinate = int(gff_fields[3]) | |
| 154 item_downstream_coordinate = int(gff_fields[4]) | |
| 155 item_polarity = gff_fields[6] | |
| 156 for sample in readDict.keys(): | |
| 157 ## this is not required anymore but test | |
| 158 # if not GFFinstanceDict.has_key(sample): | |
| 159 # GFFinstanceDict[sample]={} | |
| 160 #### | |
| 161 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | |
| 162 if item_polarity == '-': | |
| 163 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | |
| 164 subinstance.gene=gff_name | |
| 165 GFFinstanceDict[sample][gff_name]=subinstance | |
| 166 return GFFinstanceDict | |
| 167 | |
| 168 MasterListOfGenomes=process_samples(filePath) | |
| 169 | |
| 170 if args.gff: | |
| 171 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | |
| 172 | |
| 173 write_readplot_dataframe(MasterListOfGenomes, readmap_file) | |
| 174 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) | |
| 175 | |
| 176 R_command="Rscript "+ Rcode | |
| 177 process = subprocess.Popen(R_command.split()) | |
| 178 process.wait() | |
| 179 |
