view mirbase_ultra_v2.py @ 18:836894e48f1d draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 07:49:40 +0000
parents
children
line wrap: on
line source

from mirbase_functions import *
from mirbase_graphs import *
import itertools
import time
import sys
import os
import urllib.request
import gzip
from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
import subprocess
import argparse
from collections import OrderedDict
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
from math import pi
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scipy.stats as stats
from plotnine import *
import math
import re
import matplotlib.ticker as mtick
import copy

subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])

parser = argparse.ArgumentParser()
parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
parser.add_argument("-f", "--flag", help="choose the database", action="store")
parser.add_argument("-umis", "--umi", help="choose the database", action="store")
parser.add_argument("-percentage", "--per", help="choose the database", action="store")
parser.add_argument("-counts", "--count", help="choose the database", action="store")
parser.add_argument("-name1", "--n1", help="choose the database", action="store")
parser.add_argument("-name2", "--n2", help="choose the database", action="store")
args = parser.parse_args()


###############################################################################################################################################################################################

if __name__ == '__main__':

 starttime = time.time()

 q1 = Queue()
 q2 = Queue()
 lock = Lock()
 manager = Manager()

 mature_mirnas=manager.list()
 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
 ps_mature.start()

 args.control[0]=args.control[0][1:]
 args.control[len(args.control)-1][:-1]
 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]

 args.treated[0]=args.treated[0][1:]
 args.treated[len(args.treated)-1][:-1]
 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]


##############  Detection of templated isoforms ################

 radar = manager.list([0,0,0,0])
 samples = manager.list()
 data= manager.list()
 names_con=manager.list()
 samples_mirna_names=manager.list()
 deseq=manager.list()
 unmap_seq=manager.Value('i',0)
 unmap_counts=manager.Value('i',0)
 LH2E_names=manager.list()
 ini_c_samples = manager.list()


 radar1 = manager.list([0,0,0,0])
 samples1 = manager.list()
 data1 = manager.list()
 names_tre = manager.list()
 samples_mirna_names1=manager.list()
 deseq1=manager.list()
 unmap1_seq = manager.Value('i',0)
 unmap1_counts = manager.Value('i',0)
 LH8E_names=manager.list()
 ini_t_samples = manager.list()
 ps_mature.join()


 mature_mirnas=list(mature_mirnas)


 starttime1 = time.time()
 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])

 [p.start() for p in ps_sam]
 [p.join() for p in ps_sam]
 print('SAM took {} seconds'.format(time.time() - starttime1))

 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
 [x.start() for x in ps_hist]

 starttime200=time.time()

 sc = list(samples)
 st = list(samples1)

 names_con=list(names_con)
 names_tre=list(names_tre)
 samples_mirna_names=list(samples_mirna_names)
 samples_mirna_names.sort()
 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))

 samples_mirna_names1=list(samples_mirna_names1)
 samples_mirna_names1.sort()
 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))

 deseq=list(deseq)
 deseq1=list(deseq1)

 new_names_con=manager.list()
 new_names_tre=manager.list()
 new_deseq=manager.list()
 new_deseq1=manager.list()
 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])

 [z.start() for z in ps_deseq]
 [z.join() for z in ps_deseq]
 new_deseq=list(new_deseq)
 new_deseq1=list(new_deseq1)

 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]

 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]

 print('Deseq took {} seconds'.format(time.time() - starttime200))

 merg_nam_LH2E=manager.list()
 merg_nam_LH8E=manager.list()

 LH2E_copy=copy.deepcopy(list(LH2E))
 LH8E_copy=copy.deepcopy(list(LH8E))

 fil_sort_tre=manager.list()
 fil_sort_con=manager.list()
 raw_sort_tre=manager.list()
 raw_sort_con=manager.list()

 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con))
 ps_main.start()

 if args.anal=="2":
    all_iso = manager.list()
    ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
    ps_non_iso.start()

 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
 [x.start() for x in ps_merge]
 [x.join() for x in ps_merge]

 merg_nam_LH2E=list(merg_nam_LH2E)
 merg_nam_LH8E=list(merg_nam_LH8E)

 starttime2 = time.time()
 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
 if args.anal == "1":
    procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])

 [p.start() for p in procs]


 if args.anal=="1":
    [x.join() for x in ps_hist]
    [p.join() for p in procs]
    ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
    ps_pdf.start()

 print('Graphs took {} seconds'.format(time.time() - starttime2))

 ps_main.join()

 fil_sort_con=list(fil_sort_con)
 fil_sort_tre=list(fil_sort_tre)
 if fil_sort_con==[]:
    fil_sort_con=raw_sort_con
    fil_sort_tre=raw_sort_tre

 raw_sort_con=list(raw_sort_con)
 raw_sort_tre=list(raw_sort_tre)
 names_con=list(new_names_con)
 names_tre=list(new_names_tre)

 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1))
 ps_write.start()

 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
 [p.start() for p in ps1_matrix]

 if args.anal=="1":
    ps_pdf.join()
 if args.anal=="2":
    [p.join() for p in procs]
    [x.join() for x in ps_hist]

 ps_write.join()
 [p.join() for p in ps1_matrix]


############################## Detection of Both  #######################################

 starttime10 = time.time()

 if args.anal == "2":

  n_data= manager.list()
  n_names_con=manager.list()
  n_samples_mirna_names=manager.list()
  n_deseq=manager.list()
  n_LH2E_names=manager.list()

  n_data1 = manager.list()
  n_names_tre = manager.list()
  n_samples_mirna_names1=manager.list()
  n_deseq1=manager.list()
  n_LH8E_names=manager.list()


  new_mat_mirnas = list(mature_mirnas)
  ps_non_iso.join()

  all_iso=list(all_iso)
  new_mat_mirnas.extend(all_iso)

  starttime11=time.time()

  ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
  ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])

  [p.start() for p in ps_sam]
  [p.join() for p in ps_sam]

  print('Non-sam took {} seconds'.format(time.time() - starttime11))

  starttime12=time.time()

  n_names_con=list(n_names_con)
  n_names_tre=list(n_names_tre)
  n_samples_mirna_names=list(n_samples_mirna_names)
  n_samples_mirna_names.sort()
  n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))

  n_samples_mirna_names1=list(n_samples_mirna_names1)
  n_samples_mirna_names1.sort()
  n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))

  n_deseq=list(n_deseq)
  n_deseq1=list(n_deseq1)

  new_n_names_con=manager.list()
  new_n_names_tre=manager.list()
  n_new_deseq=manager.list()
  n_new_deseq1=manager.list()
  ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
  ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])

  [x.start() for x in ps_deseq]
  [x.join() for x in ps_deseq]
  n_new_deseq=list(n_new_deseq)
  n_new_deseq1=list(n_new_deseq1)

  n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
  [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]

  n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
  [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]

  print('Non-deseq took {} seconds'.format(time.time() - starttime12))

  merg_nam_n_LH2E=manager.list()
  merg_nam_n_LH8E=manager.list()

  n_LH2E_copy=copy.deepcopy(list(n_LH2E))
  n_LH8E_copy=copy.deepcopy(list(n_LH8E))

  n_sort_tre=manager.list()
  n_sort_con=manager.list()

  n_fil_sort_con=manager.list()
  n_fil_sort_tre=manager.list()
  n_raw_sort_con=manager.list()
  n_raw_sort_tre=manager.list()

  ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con))
  ps_main.start()

  ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
  ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
  [p.start() for p in ps_merge]
  [p.join() for p in ps_merge]

  merg_nam_n_LH2E=list(merg_nam_n_LH2E)
  merg_nam_n_LH8E=list(merg_nam_n_LH8E)

  procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
  procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
  procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
  procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])

  starttime13=time.time()
  [p.start() for p in procs]
  [p.join() for p in procs]

  print('Graphs took {} seconds'.format(time.time() - starttime13))

  procs1 = Process(target=pdf_before_DE,args=(args.anal))
  procs1.start()

  starttime14=time.time()
  ps_main.join()

  n_fil_sort_con=list(n_fil_sort_con)
  n_fil_sort_tre=list(n_fil_sort_tre)
  if n_fil_sort_con==[]:
     n_fil_sort_con=n_raw_sort_con
     n_fil_sort_tre=n_raw_sort_tre

  n_raw_sort_con=list(n_raw_sort_con)
  n_raw_sort_tre=list(n_raw_sort_tre)
  n_names_con=list(new_n_names_con)
  n_names_tre=list(new_n_names_tre)

  ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2))
  ps_write.start()

  ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
  ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
  [p.start() for p in ps1_matrix]

  ps_write.join()
  [p.join() for p in ps1_matrix]
  procs1.join()
 print('That took {} seconds'.format(time.time() - starttime10))
 print('That took {} seconds'.format(time.time() - starttime))