Mercurial > repos > drosofff > mi_rna_parser
diff MirParser.py @ 1:f6c22925fc3c draft
Uploaded
| author | drosofff | 
|---|---|
| date | Sun, 22 Jun 2014 18:31:14 -0400 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MirParser.py Sun Jun 22 18:31:14 2014 -0400 @@ -0,0 +1,81 @@ +#!/usr/bin/python +# python parser module for pre-mir and mature miRNAs, guided by mirbase.org GFF3 +# version 0.0.9 (1-6-2014) +# Usage MirParser.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> +# <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> +# <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> + +import sys, subprocess +from smRtools import * + +IndexSource = sys.argv[1] +ExtractionDirective = sys.argv[2] +if ExtractionDirective == "--do_not_extract_index": + genomeRefFormat = "fastaSource" +elif ExtractionDirective == "--extract_index": + genomeRefFormat = "bowtieIndex" +OutputPre_mirs = sys.argv[3] +OutputMature_Mirs = sys.argv[4] +GFF3_file = sys.argv[5] +lattice = sys.argv[6] +Rcode = sys.argv[7] +latticePDF = sys.argv[8] +Triplets = [sys.argv[9:][i:i+3] for i in xrange(0, len(sys.argv[9:]), 3)] +MasterListOfGenomes = {} + +for [filePath, FileExt, FileLabel] in Triplets: + print FileLabel + MasterListOfGenomes[FileLabel] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=FileExt, genomeRefFile=IndexSource, genomeRefFormat=genomeRefFormat, biosample=FileLabel) + +header = ["gene"] +for [filePath, FileExt, FileLabel] in Triplets: + header.append(FileLabel) + +hit_table = ["\t".join(header)] # table header: gene, sample1, sample2, sample3, etc. separated by tabulation + +## read GFF3 to subinstantiate +gff3 = open (GFF3_file, "r") +lattice_dataframe = [] +for line in gff3: + if line[0] == "#": continue + gff_fields = line[:-1].split("\t") + chrom = gff_fields[0] + gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name + item_upstream_coordinate = int(gff_fields[3]) + item_downstream_coordinate = int(gff_fields[4]) + if gff_fields[6] == "+": + item_polarity = "forward" + else: + item_polarity = "reverse" + item_line = [gff_name] + for sample in header[1:]: + count = MasterListOfGenomes[sample].instanceDict[chrom].readcount(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, polarity=item_polarity) + item_line.append(str(count)) + ## subtreatement for lattice + if lattice != "dummy_dataframe_path": + if ("5p" not in gff_name) and ("3p" not in gff_name): + lattice_dataframe.append(MasterListOfGenomes[sample].instanceDict[chrom].readcoverage(upstream_coord=item_upstream_coordinate, downstream_coord=item_downstream_coordinate, windowName=gff_name+"_"+sample) ) + ## end of subtreatement for lattice + hit_table.append("\t".join(item_line) ) +gff3.close() + +Fpremirs = open (OutputPre_mirs, "w") +print >> Fpremirs, hit_table[0] +finalPreList = [ i for i in sorted(hit_table[1:]) if ("5p" not in i) and ("3p" not in i)] +print >> Fpremirs, "\n".join(finalPreList ) +Fpremirs.close() + +Fmaturemires = open (OutputMature_Mirs, "w") +print >> Fmaturemires, hit_table[0] +finalMatureList = [ i for i in sorted(hit_table[1:]) if ("5p" in i) or ("3p" in i)] +print >> Fmaturemires, "\n".join(finalMatureList ) +Fmaturemires.close() + +if lattice != "dummy_dataframe_path": + Flattice = open(lattice, "w") + print >> Flattice, "%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("sample", "mir", "offset", "offsetNorm", "counts","countsNorm", "polarity") + print >> Flattice, "\n".join(lattice_dataframe) + Flattice.close() + R_command="Rscript "+ Rcode + process = subprocess.Popen(R_command.split()) + process.wait()
