annotate pre_mirbase.py @ 12:2b09cbf138c8 draft

Uploaded
author glogobyte
date Tue, 20 Oct 2020 09:41:39 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
1 from itertools import groupby
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
2 import sys
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
3 import subprocess
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
4 import argparse
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
5 import time
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
6 import urllib.request
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
7 from multiprocessing import Process, Queue
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
8 import itertools
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
9
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
10 subprocess.call(['mkdir', 'out'])
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
11 parser = argparse.ArgumentParser()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
12
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
13 parser.add_argument("-pos", "--positions", help="", action="store")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
14 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
15 parser.add_argument("-gen", "--genome", help="tool directory path", action="store")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
16 parser.add_argument("-gff3", "--gff", help="",action="store")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
17
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
18 args = parser.parse_args()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
19
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
20 #=======================================================================================================================================
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
21
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
22
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
23 #-----------------------Download and read the file hsa.gff3---------------------------------
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
24
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
25 def read_url(q):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
26
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
27
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
28 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/genomes/'+args.gff
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
29 #url = 'ftp://mirbase.org/pub/mirbase/21/genomes/hsa.gff3'
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
30 response = urllib.request.urlopen(url)
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
31 data = response.read()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
32 file_mirna = data.decode('utf-8')
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
33 file_mirna = file_mirna.split("\n")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
34 q.put(file_mirna)
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
35
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
36
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
37 def write_gff(file_mirna):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
38 f = open('original_mirnas.bed', "w")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
39
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
40 for i in range(len(file_mirna)):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
41 f.write(file_mirna[i] + "\n")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
42
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
43
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
44 #------------------------Processed the file with mature mirnas-------------------------------
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
45
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
46
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
47 def new_gff(file_mirna):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
48
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
49 mirna = [] # new list with shifted mirnas
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
50 positions =int(args.positions) # positions shifted
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
51 print(str(positions)+" positions shifted")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
52 names=[]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
53 # Remove lines which conatain the word "primary"
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
54 for i in range(len(file_mirna)):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
55
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
56 if "primary" not in file_mirna[i]:
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
57 mirna.append(file_mirna[i])
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
58
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
59 if "chr" in file_mirna[i]:
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
60 a=file_mirna[i].split("\t")[0]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
61 b=file_mirna[i].split("\t")[6]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
62 c=file_mirna[i].split("=")[3].split(";")[0]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
63 names.append([a,b,c])
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
64
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
65 names.sort()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
66 sublists=[]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
67
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
68 [sublists.append([item] * names.count(item)) for item in names if names.count(item)>=2]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
69 sublists.sort()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
70 sublists=list(sublists for sublists, _ in itertools.groupby(sublists))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
71 unique_names=[[x[0][0],x[0][2]] for x in sublists]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
72
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
73 for x in unique_names:
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
74 flag = 0
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
75 for i in range(len(mirna)):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
76
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
77 if "chr" in mirna[i] and mirna[i].split("=")[3].split(";")[0]==x[1] and x[0]==mirna[i].split("\t")[0]:
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
78 flag+=1
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
79 ktr=mirna[i].split(";")[0]+";"+mirna[i].split(";")[1]+";"+mirna[i].split(";")[2]+"-{"+str(flag)+"}"+";"+mirna[i].split(";")[3]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
80 mirna[i]=ktr
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
81
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
82
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
83 f = open('shifted_mirnas.bed', "w")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
84
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
85 for i in range(len(mirna)):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
86
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
87 if "chr" in mirna[i]:
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
88
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
89 # change the name of current mirna
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
90 mirna_name_1 = mirna[i].split("=")[3]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
91 mirna_name_2 = mirna[i].split("=")[4]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
92 # mirna_name_2 = mirna_name_2.split(";")[0]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
93 mirna_name_1 = mirna_name_1.split(";")[0]+"_"+mirna_name_2+"_"+mirna[i].split("\t")[0]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
94 mirna[i] = mirna[i].replace("miRNA", mirna_name_1)
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
95
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
96 # Shift the position of mirna
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
97 start = mirna[i].split("\t")[3]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
98 end = mirna[i].split("\t")[4]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
99 shift_start = int(start)-positions # shift the interval to the left
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
100 shift_end = int(end)+positions # shift the interval to the right
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
101
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
102 # Replace the previous intervals with the new
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
103 mirna[i] = mirna[i].replace(start, str(shift_start))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
104 mirna[i] = mirna[i].replace(end, str(shift_end))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
105
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
106 f.write(mirna[i] + "\n")
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
107
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
108 f.close()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
109
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
110 #===================================================================================================================================
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
111
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
112 def bedtool(genome):
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
113 subprocess.call(["bedtools", "getfasta", "-fi", "/cvmfs/data.galaxyproject.org/byhand/"+genome+"/sam_index/"+genome+".fa", "-bed", "shifted_mirnas.bed", "-fo", "new_ref.fa", "-name", "-s"])
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
114
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
115 #===================================================================================================================================
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
116
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
117
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
118 if __name__=='__main__':
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
119
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
120 starttime = time.time()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
121 q = Queue()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
122 p1 = Process(target=read_url(q))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
123 p1.start()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
124 p1.join()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
125
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
126 file_mirna=q.get()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
127
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
128 p2 = [Process(target=write_gff(file_mirna))]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
129 p2.extend([Process(target=new_gff(file_mirna))])
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
130 [x.start() for x in p2]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
131 [x.join() for x in p2]
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
132
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
133 p3 = Process(target=bedtool(args.genome))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
134 p3.start()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
135 p3.join()
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
136
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
137 print('That took {} seconds'.format(time.time() - starttime))
2b09cbf138c8 Uploaded
glogobyte
parents:
diff changeset
138