Mercurial > repos > drosofff > msp_sr_size_histograms
comparison size_histogram.py @ 0:63ff807752d7 draft
Imported from capsule None
| author | drosofff |
|---|---|
| date | Mon, 03 Nov 2014 10:30:29 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:63ff807752d7 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # python parser module for size distributions, guided by GFF3 | |
| 3 # version 0.9.1 (1-6-2014) | |
| 4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> | |
| 5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> | |
| 6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> | |
| 7 | |
| 8 import sys, subprocess, argparse | |
| 9 from smRtools import * | |
| 10 from collections import OrderedDict, defaultdict | |
| 11 import os | |
| 12 | |
| 13 def Parser(): | |
| 14 the_parser = argparse.ArgumentParser() | |
| 15 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") | |
| 16 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") | |
| 17 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") | |
| 18 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") | |
| 19 the_parser.add_argument('--ext',nargs='+', help="input file type") | |
| 20 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | |
| 21 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | |
| 22 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | |
| 23 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | |
| 24 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | |
| 25 the_parser.add_argument('--rcode', type=str, help="R script") | |
| 26 the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calcilated for the sum of all items") | |
| 27 the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed") | |
| 28 args = the_parser.parse_args() | |
| 29 return args | |
| 30 | |
| 31 args=Parser() | |
| 32 if args.reference_fasta: | |
| 33 genomeRefFormat = "fastaSource" | |
| 34 genomeRefFile = args.reference_fasta | |
| 35 if args.reference_bowtie_index: | |
| 36 genomeRefFormat = "bowtieIndex" | |
| 37 genomeRefFile = args.reference_bowtie_index | |
| 38 size_distribution_file=args.output_size_distribution | |
| 39 minquery=args.minquery | |
| 40 maxquery=args.maxquery | |
| 41 Rcode = args.rcode | |
| 42 filePath=args.input | |
| 43 fileExt=args.ext | |
| 44 fileLabel=args.label | |
| 45 normalization_factor=args.normalization_factor | |
| 46 global_size=args.global_size | |
| 47 collapse=args.collapse | |
| 48 | |
| 49 if collapse: | |
| 50 pol=["both"] | |
| 51 else: | |
| 52 pol=["F", "R"] | |
| 53 | |
| 54 MasterListOfGenomes = OrderedDict() | |
| 55 | |
| 56 def process_samples(filePath): | |
| 57 for i, filePath in enumerate(filePath): | |
| 58 norm=normalization_factor[i] | |
| 59 print fileLabel[i] | |
| 60 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | |
| 61 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | |
| 62 return MasterListOfGenomes | |
| 63 | |
| 64 def write_size_distribution_dataframe(readDict, size_distribution_file, pol=["both"] ): | |
| 65 '''refactored on 7-9-2014''' | |
| 66 with open(size_distribution_file, 'w') as size_distrib: | |
| 67 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" | |
| 68 for sample in readDict.keys(): | |
| 69 if args.gff: | |
| 70 dict=readDict[sample] | |
| 71 else: | |
| 72 dict=readDict[sample].instanceDict | |
| 73 for gene in dict.keys(): | |
| 74 histogram = dict[gene].size_histogram() | |
| 75 for polarity in pol: | |
| 76 for size, count in histogram[polarity].iteritems(): | |
| 77 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample) | |
| 78 | |
| 79 def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]): | |
| 80 with open(size_distribution_file, 'w') as size_distrib: | |
| 81 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample" | |
| 82 for sample in readDict.keys(): | |
| 83 histogram = readDict[sample].size_histogram() | |
| 84 gene="sample" | |
| 85 for polarity in pol: | |
| 86 for size, count in histogram[polarity].iteritems(): | |
| 87 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample) | |
| 88 | |
| 89 def gff_item_subinstances(readDict, gff3): | |
| 90 GFFinstanceDict=OrderedDict() | |
| 91 with open(gff3) as gff: | |
| 92 for line in gff: | |
| 93 if line[0] == "#": continue | |
| 94 gff_fields = line[:-1].split("\t") | |
| 95 chrom = gff_fields[0] | |
| 96 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | |
| 97 item_upstream_coordinate = int(gff_fields[3]) | |
| 98 item_downstream_coordinate = int(gff_fields[4]) | |
| 99 item_polarity = gff_fields[6] | |
| 100 for sample in readDict.keys(): | |
| 101 if not GFFinstanceDict.has_key(sample): | |
| 102 GFFinstanceDict[sample]={} | |
| 103 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | |
| 104 if item_polarity == '-': | |
| 105 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | |
| 106 # subinstance.readDict.setdefault(key, []) | |
| 107 subinstance.gene=gff_name | |
| 108 GFFinstanceDict[sample][gff_name]=subinstance | |
| 109 return GFFinstanceDict | |
| 110 | |
| 111 MasterListOfGenomes=process_samples(filePath) | |
| 112 | |
| 113 if args.gff: | |
| 114 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | |
| 115 | |
| 116 if global_size: | |
| 117 write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol) | |
| 118 else: | |
| 119 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file, pol) | |
| 120 | |
| 121 R_command="Rscript "+ Rcode | |
| 122 process = subprocess.Popen(R_command.split()) | |
| 123 process.wait() | |
| 124 | |
| 125 |
