| 31 | 1 import itertools | 
|  | 2 import urllib.request | 
|  | 3 from collections import OrderedDict | 
|  | 4 import copy | 
|  | 5 | 
|  | 6 ######################################################################################################################################################## | 
|  | 7 | 
|  | 8 """ Read a file and return it as a list """ | 
|  | 9 | 
|  | 10 def read(path, flag): | 
|  | 11     if flag == 0: | 
|  | 12         with open(path) as fp: | 
|  | 13             file=fp.readlines() | 
|  | 14         fp.close() | 
|  | 15         return file | 
|  | 16 | 
|  | 17     if flag == 1: | 
|  | 18         with open(path) as fp: | 
|  | 19             file = fp.read().splitlines() | 
|  | 20         fp.close() | 
|  | 21         return file | 
|  | 22 | 
|  | 23 # Write a list to a txt file | 
|  | 24 def write(path, list): | 
|  | 25     with open(path,'w') as fp: | 
|  | 26         for x in list: | 
|  | 27             fp.write(str("\t".join(x[1:-1]))) | 
|  | 28     fp.close() | 
|  | 29 | 
|  | 30 ######################################################################################################################################################## | 
|  | 31 | 
|  | 32 """ Detect the longest common substring sequence between two mirnas """ | 
|  | 33 | 
|  | 34 def longestSubstring(str1, str2): | 
|  | 35 | 
|  | 36     from difflib import SequenceMatcher | 
|  | 37     # initialize SequenceMatcher object with | 
|  | 38     # input string | 
|  | 39     seqMatch = SequenceMatcher(None, str1, str2) | 
|  | 40 | 
|  | 41     # find match of longest sub-string | 
|  | 42     # output will be like Match(a=0, b=0, size=5) | 
|  | 43     match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | 
|  | 44 | 
|  | 45     # print longest substring | 
|  | 46     if (match.size != 0): | 
|  | 47         return str1[match.a: match.a + match.size] | 
|  | 48     else: | 
|  | 49         print('No longest common sub-string found') | 
|  | 50 | 
|  | 51 | 
|  | 52 ################################################################################################################################################################# | 
|  | 53 | 
|  | 54 """ | 
|  | 55 | 
|  | 56 Read the sam files from alignment tool and do the followings: | 
|  | 57 | 
|  | 58 1) Keep mapped reads | 
|  | 59 2) Keep all sequences with length between 18 and 26 nucleotides | 
|  | 60 3) Detects the ref and templated miRNAs | 
|  | 61 4) Gives names to templated miRNAs based on ref miRNAs | 
|  | 62 | 
|  | 63 """ | 
|  | 64 | 
|  | 65 def sam_edit(mature_mirnas,path,file,case,l,samples,data,file_order,unmap_seq,names_n_seqs,deseq,mirna_names,ini_sample,unmap_counts): | 
|  | 66 | 
|  | 67     # read the sam file | 
|  | 68     ini_sam=read(path,0) | 
|  | 69     main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 70     unique_seq = [x for x in main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] | 
|  | 71     filter_sam = [[x[0],x[1],x[2],len(x[9])] for x in main_sam] | 
|  | 72     sorted_uni_arms = [] | 
|  | 73 | 
|  | 74     # Detection of differences between the canonical miRNA and the detected miRNA | 
|  | 75     for i in range(len(mature_mirnas)): | 
|  | 76         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 77         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 78         for j in range(len(unique_seq)): | 
|  | 79 | 
|  | 80          if mature_mirnas[i] == unique_seq[j][2]: | 
|  | 81 | 
|  | 82                 temp_mature = mature_mirnas[i+1] | 
|  | 83                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 84 | 
|  | 85                 mat_diff = temp_mature.split(off_part) | 
|  | 86                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 87 | 
|  | 88                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 89                 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 90 | 
|  | 91                 # Handling of some special mirnas like (hsa-miR-8485) | 
|  | 92                 if mat_diff[1]!=0 and unique_diff[1]!=0: | 
|  | 93                     unique_seq[j]=1 | 
|  | 94                     pre_pos = 0 | 
|  | 95                     post_pos = 0 | 
|  | 96 | 
|  | 97                 elif mat_diff[0]!=0 and unique_diff[0]!=0: | 
|  | 98                     unique_seq[j]=1 | 
|  | 99                     pre_pos = 0 | 
|  | 100                     post_pos = 0 | 
|  | 101 | 
|  | 102                 else: | 
|  | 103                    # Keep the findings | 
|  | 104                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 105                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 106                    tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 107                    tmp_count_seq = tmp_count_seq+1 | 
|  | 108 | 
|  | 109                 # Store the detected miRNAs with new names according to the findings | 
|  | 110                 if pre_pos != 0 or post_pos != 0: | 
|  | 111                     if pre_pos == 0: | 
|  | 112                         unique_seq[j][2] = unique_seq[j][2] + "_t_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | 
|  | 113                     elif post_pos == 0: | 
|  | 114                         unique_seq[j][2] = unique_seq[j][2] + "_t_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | 
|  | 115                     else: | 
|  | 116                         unique_seq[j][2] = unique_seq[j][2]+"_t_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | 
|  | 117 | 
|  | 118         # Remove the values "1" from the handling of special mirnas (hsa-miR-8485) | 
|  | 119         for x in range(unique_seq.count(1)): | 
|  | 120            unique_seq.remove(1) | 
|  | 121 | 
|  | 122         # metrics for the production of database | 
|  | 123         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 124            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) | 
|  | 125 | 
|  | 126     # Sorting of the metrics for database | 
|  | 127     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 128 | 
|  | 129     # Correction of metrics due to the collapsing and removing of duplicates for the production of Database | 
|  | 130     for y in sorted_uni_arms: | 
|  | 131        counts=0 | 
|  | 132        seqs=0 | 
|  | 133        for x in unique_seq: | 
|  | 134            if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: | 
|  | 135               counts+=int(x[0].split("-")[1]) | 
|  | 136               seqs+=1 | 
|  | 137 | 
|  | 138        y[1]=seqs | 
|  | 139        y[2]=counts | 
|  | 140 | 
|  | 141     # Output variables | 
|  | 142     temp_mirna_names=[] | 
|  | 143 | 
|  | 144     l.acquire() | 
|  | 145     if case == "c" or case == "t": | 
|  | 146        temp_mirna_names.extend(z[2] for z in unique_seq) | 
|  | 147        names_n_seqs.extend([[y[2],y[9]] for y in unique_seq]) | 
|  | 148        deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | 
|  | 149        mirna_names.extend(temp_mirna_names) | 
|  | 150        unmap_seq.value += sum([1 for x in main_sam if x[1] == '4']) | 
|  | 151        unmap_counts.value += sum([int(x[0].split("-")[1]) for x in main_sam if x[1] == '4']) | 
|  | 152        file_order.append(file) | 
|  | 153        samples.append(unique_seq) | 
|  | 154        data.append([case,file,unique_seq,sorted_uni_arms]) | 
|  | 155        ini_sample.append(filter_sam) | 
|  | 156     l.release() | 
|  | 157 | 
|  | 158 | 
|  | 159 ###################################################################################################################################### | 
|  | 160 """ | 
|  | 161 | 
|  | 162 Read a sam file from Bowtie and do the followings: | 
|  | 163 | 
|  | 164 1) Keep unmapped reads | 
|  | 165 2) Keep all sequences with length between 18 and 26 nucleotides | 
|  | 166 3) Detects the non-template isomirs | 
|  | 167 4) Gives names to isomir's based on ref miRNAs | 
|  | 168 | 
|  | 169 """ | 
|  | 170 | 
|  | 171 def non_sam_edit(mature_mirnas,path,file,case,l,data,file_order,n_deseq,names_n_seqs): | 
|  | 172 | 
|  | 173     # read the sam file | 
|  | 174     ini_sam=read(path,0) | 
|  | 175     main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | 
|  | 176     unique_seq=[] | 
|  | 177     unique_seq = [x for x in main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | 
|  | 178     uni_seq=[] | 
|  | 179 | 
|  | 180     # Calculate the shifted positions for every isomir and add them to the name of it | 
|  | 181     sorted_uni_arms = [] | 
|  | 182     for i in range(1,len(mature_mirnas),2): | 
|  | 183         tmp_count_reads = 0   # calculate the total number of reads | 
|  | 184         tmp_count_seq = 0     # calculate the total number of sequences | 
|  | 185 | 
|  | 186         for j in range(len(unique_seq)): | 
|  | 187 | 
|  | 188             temp_mature = mature_mirnas[i].strip().replace("U", "T") | 
|  | 189 | 
|  | 190             # Detection of differences between the canonical miRNA and the detected non template miRNA | 
|  | 191             if temp_mature in unique_seq[j][9]: | 
|  | 192 | 
|  | 193                 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | 
|  | 194 | 
|  | 195                 mat_diff = temp_mature.split(off_part) | 
|  | 196                 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | 
|  | 197 | 
|  | 198                 unique_diff = unique_seq[j][9].split(off_part) | 
|  | 199                 if len(unique_diff)<=2: | 
|  | 200                    unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | 
|  | 201 | 
|  | 202                    pre_pos = mat_diff[0]-unique_diff[0] | 
|  | 203                    post_pos = unique_diff[1]-mat_diff[1] | 
|  | 204 | 
|  | 205                    lengthofmir = len(off_part) + post_pos | 
|  | 206                    if pre_pos == 0 and post_pos<4: | 
|  | 207                       tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | 
|  | 208                       tmp_count_seq = tmp_count_seq + 1 | 
|  | 209 | 
|  | 210                       t_name=copy.deepcopy(unique_seq[j]) | 
|  | 211                       t_name[2]=mature_mirnas[i - 1] + "_nont_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | 
|  | 212                       uni_seq.append(t_name) | 
|  | 213 | 
|  | 214         # metrics for the production of database | 
|  | 215         if tmp_count_reads != 0 and tmp_count_seq != 0: | 
|  | 216             sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) | 
|  | 217 | 
|  | 218 | 
|  | 219     sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | 
|  | 220     unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | 
|  | 221 | 
|  | 222     # Output variables | 
|  | 223 | 
|  | 224     l.acquire() | 
|  | 225     if case == "c" or case == "t": | 
|  | 226        names_n_seqs.extend([[y[2],y[9]] for y in unique_seq if y[2]!="*"]) | 
|  | 227        n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | 
|  | 228        file_order.append(file) | 
|  | 229        data.append([case,file,unique_seq,sorted_uni_arms]) | 
|  | 230     l.release() | 
|  | 231 | 
|  | 232 ################################################################################################################################################################################################################# | 
|  | 233 | 
|  | 234 """ | 
|  | 235 | 
|  | 236 This function detects the differences between the two groups (control, treated). | 
|  | 237 Then copy the undetected miRNAs from one group to other and add zeros as counts. | 
|  | 238 With this way the two groups will have the same number of miRNAs. | 
|  | 239 | 
|  | 240 """ | 
|  | 241 | 
|  | 242 def black_white(mirna_names_1,mirna_names_2,group,manager): | 
|  | 243 | 
|  | 244     add_names = [x for x in mirna_names_1 if x not in mirna_names_2] | 
|  | 245     add_names.sort() | 
|  | 246     add_names = list(add_names for add_names,_ in itertools.groupby(add_names)) | 
|  | 247 | 
|  | 248     group.sort() | 
|  | 249     group = list(group for group,_ in itertools.groupby(group)) | 
|  | 250 | 
|  | 251     zeros=["0"]*(len(group[0])-2) | 
|  | 252     [add_names[i].extend(zeros) for i,_ in enumerate(add_names)] | 
|  | 253     group=group+add_names | 
|  | 254 | 
|  | 255     manager.extend(group) | 
|  | 256 | 
|  | 257 ########################################################################################################> | 
|  | 258 | 
|  | 259 """ | 
|  | 260 | 
|  | 261 This function collapses the miRNAs with same sequences and different names into one entry | 
|  | 262 by merging all the different names into one and are separated with the character "/" | 
|  | 263 | 
|  | 264 """ | 
|  | 265 | 
|  | 266 def merging_dupes(group,f_dupes): | 
|  | 267 | 
|  | 268     dupes=[] | 
|  | 269     temp_mat =[] | 
|  | 270 | 
|  | 271     for num,_ in enumerate(group): | 
|  | 272 | 
|  | 273         if group[num][1] not in temp_mat and group[num][0] not in temp_mat: | 
|  | 274            temp_mat.append(group[num][1]) | 
|  | 275            temp_mat.append(group[num][0]) | 
|  | 276         else: | 
|  | 277            dupes.append(group[num][1]) | 
|  | 278 | 
|  | 279 | 
|  | 280     dupes=list(set(dupes)) | 
|  | 281 | 
|  | 282     dupes=[[x] for x in dupes] | 
|  | 283 | 
|  | 284     for x in group: | 
|  | 285         for y in dupes: | 
|  | 286             if x[1]==y[0]: | 
|  | 287                fl=0 | 
|  | 288                if len(y)==1: | 
|  | 289                   y.append(x[0]) | 
|  | 290                else: | 
|  | 291                   for i in range(1,len(y)): | 
|  | 292                       if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | 
|  | 293                          fl=1 | 
|  | 294                          if len(x[0])<len(y[i]): | 
|  | 295                             del y[i] | 
|  | 296                             y.append(x[0]) | 
|  | 297                             break | 
|  | 298 | 
|  | 299                   if fl==0: | 
|  | 300                      y.append((x[0])) | 
|  | 301 | 
|  | 302     for y in dupes: | 
|  | 303         if len(y)>2: | 
|  | 304            for i in range(len(y)-1,1,-1): | 
|  | 305                y[1]=y[1]+"/"+y[i] | 
|  | 306                del y[i] | 
|  | 307 | 
|  | 308     f_dupes.extend(dupes) | 
|  | 309 | 
|  | 310 | 
|  | 311 ########################################################################################################> | 
|  | 312 | 
|  | 313 """ | 
|  | 314 | 
|  | 315 This function removes the duplications of sequences based on output from the fuction merging_dupes | 
|  | 316 | 
|  | 317 """ | 
|  | 318 | 
|  | 319 def apply_merging_dupes(group,dupes,managger): | 
|  | 320 | 
|  | 321     for x in group: | 
|  | 322      for y in dupes: | 
|  | 323          if x[1]==y[0]: | 
|  | 324             x[0]=y[1] | 
|  | 325 | 
|  | 326     group.sort() | 
|  | 327     group=list(group for group,_ in itertools.groupby(group)) | 
|  | 328     managger.extend(group) | 
|  | 329 | 
|  | 330 ########################################################################################################> | 
|  | 331 | 
|  | 332 """ | 
|  | 333 | 
|  | 334 This function is optional and performs a filter for low counts miRNAs based on | 
|  | 335 number of counts and the percentage of the samples, according to user preferences | 
|  | 336 | 
|  | 337 """ | 
|  | 338 | 
|  | 339 def filter_low_counts(c_group,t_group,fil_c_group,fil_t_group,per,counts): | 
|  | 340 | 
|  | 341     t_group_new=[] | 
|  | 342     c_group_new=[] | 
|  | 343 | 
|  | 344     percent=int(per)/100 | 
|  | 345     c_col_filter=round(percent*(len(c_group[1])-2)) | 
|  | 346     t_col_filter=round(percent*(len(t_group[1])-2)) | 
|  | 347 | 
|  | 348     for i, _ in enumerate(c_group): | 
|  | 349         c_cols=0 | 
|  | 350         t_cols=0 | 
|  | 351 | 
|  | 352         c_cols=sum([1 for j in range(len(c_group[i])-2) if int(c_group[i][j+2])>=int(counts)]) | 
|  | 353         t_cols=sum([1 for j in range(len(t_group[i])-2) if int(t_group[i][j+2])>=int(counts)]) | 
|  | 354 | 
|  | 355         if c_cols>=c_col_filter or t_cols>=t_col_filter: | 
|  | 356            t_group_new.append(t_group[i]) | 
|  | 357            c_group_new.append(c_group[i]) | 
|  | 358 | 
|  | 359     fil_c_group.extend(c_group_new) | 
|  | 360     fil_t_group.extend(t_group_new) | 
|  | 361 | 
|  | 362 ################################################################################################################################################################################################################## | 
|  | 363 | 
|  | 364 """ | 
|  | 365 | 
|  | 366 This function exports the count matrices for every group (controls, treated) | 
|  | 367 and condition (ref and templated miRNAs, non-templated miRNAs) | 
|  | 368 | 
|  | 369 """ | 
|  | 370 | 
|  | 371 def write_main(raw_con, raw_tre, fil_con, fil_tre, con_file_order, tre_file_order, flag, n1, n2, per): | 
|  | 372 | 
|  | 373  if flag == 1 and int(per)!=-1: | 
|  | 374     fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') | 
|  | 375     fp.write("Name\t") | 
|  | 376     fp.write("Sequence") | 
|  | 377     for y in tre_file_order: | 
|  | 378        fp.write("\t"+y) | 
|  | 379 | 
|  | 380     for x in fil_tre: | 
|  | 381         fp.write("\n%s" % "\t".join(x)) | 
|  | 382     fp.close() | 
|  | 383 | 
|  | 384     fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') | 
|  | 385     fp.write("Name\t") | 
|  | 386     fp.write("Sequence") | 
|  | 387     for y in con_file_order: | 
|  | 388        fp.write("\t"+y) | 
|  | 389 | 
|  | 390     for x in fil_con: | 
|  | 391         fp.write("\n%s" % "\t".join(x)) | 
|  | 392     fp.close() | 
|  | 393 | 
|  | 394 | 
|  | 395  if flag == 2 and int(per)!=-1: | 
|  | 396     fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') | 
|  | 397     fp.write("Name\t") | 
|  | 398     fp.write("Sequence") | 
|  | 399     for y in tre_file_order: | 
|  | 400        fp.write("\t"+y) | 
|  | 401 | 
|  | 402 | 
|  | 403     for x in fil_tre: | 
|  | 404         fp.write("\n%s" % "\t".join(x)) | 
|  | 405     fp.close() | 
|  | 406 | 
|  | 407     fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') | 
|  | 408     fp.write("Name\t") | 
|  | 409     fp.write("Sequence") | 
|  | 410     for y in con_file_order: | 
|  | 411        fp.write("\t"+y) | 
|  | 412 | 
|  | 413     for x in fil_con: | 
|  | 414         fp.write("\n%s" % "\t".join(x)) | 
|  | 415     fp.close() | 
|  | 416 | 
|  | 417 | 
|  | 418  if flag == 1: | 
|  | 419     fp = open('Counts/Raw '+n2+' Templated Counts', 'w') | 
|  | 420     fp.write("Name\t") | 
|  | 421     fp.write("Sequence") | 
|  | 422     for y in tre_file_order: | 
|  | 423        fp.write("\t"+y) | 
|  | 424 | 
|  | 425     for x in raw_tre: | 
|  | 426         fp.write("\n%s" % "\t".join(x)) | 
|  | 427     fp.close() | 
|  | 428 | 
|  | 429     fp = open('Counts/Raw '+n1+' Templated Counts', 'w') | 
|  | 430     fp.write("Name\t") | 
|  | 431     fp.write("Sequence") | 
|  | 432     for y in con_file_order: | 
|  | 433        fp.write("\t"+y) | 
|  | 434 | 
|  | 435     for x in raw_con: | 
|  | 436         fp.write("\n%s" % "\t".join(x)) | 
|  | 437     fp.close() | 
|  | 438 | 
|  | 439  if flag == 2: | 
|  | 440     fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') | 
|  | 441     fp.write("Name\t") | 
|  | 442     fp.write("Sequence") | 
|  | 443     for y in tre_file_order: | 
|  | 444        fp.write("\t"+y) | 
|  | 445 | 
|  | 446 | 
|  | 447     for x in raw_tre: | 
|  | 448         fp.write("\n%s" % "\t".join(x)) | 
|  | 449     fp.close() | 
|  | 450 | 
|  | 451     fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') | 
|  | 452     fp.write("Name\t") | 
|  | 453     fp.write("Sequence") | 
|  | 454     for y in con_file_order: | 
|  | 455        fp.write("\t"+y) | 
|  | 456 | 
|  | 457     for x in raw_con: | 
|  | 458         fp.write("\n%s" % "\t".join(x)) | 
|  | 459     fp.close() | 
|  | 460 | 
|  | 461 #################################################################################################################################################################################################################### | 
|  | 462 | 
|  | 463 """ | 
|  | 464 | 
|  | 465 This function exports the files of the database with all the info | 
|  | 466 about every type of the detected miRNAs for every sample | 
|  | 467 | 
|  | 468 """ | 
|  | 469 | 
|  | 470 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | 
|  | 471 | 
|  | 472  if f==1: | 
|  | 473     if con=="c": | 
|  | 474        fp = open('split1/'+name, 'w') | 
|  | 475 | 
|  | 476        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 477     if con=="t": | 
|  | 478        fp = open('split2/'+name, 'w') | 
|  | 479        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 480 | 
|  | 481     for i in range(len(sorted_uni_arms)): | 
|  | 482         temp = [] | 
|  | 483         for j in range(len(unique_seq)): | 
|  | 484 | 
|  | 485             if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): | 
|  | 486 | 
|  | 487                 temp.append(unique_seq[j]) | 
|  | 488 | 
|  | 489         temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 490         fp.write("*********************************************************************************************************\n") | 
|  | 491         fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 492         fp.write("*********************************************************************************************************\n\n") | 
|  | 493         [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 494         fp.write("\n" + "\n") | 
|  | 495     fp.close() | 
|  | 496 | 
|  | 497  if f==2: | 
|  | 498 | 
|  | 499     if con=="c": | 
|  | 500        fp = open('split3/'+name, 'w') | 
|  | 501        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 502     if con=="t": | 
|  | 503        fp = open('split4/'+name, 'w') | 
|  | 504        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | 
|  | 505 | 
|  | 506     for i in range(len(sorted_uni_arms)): | 
|  | 507         temp = [] | 
|  | 508         for j in range(len(unique_seq)): | 
|  | 509                if sorted_uni_arms[i][0]==unique_seq[j][2].split("_nont_")[0]: | 
|  | 510                   temp.append(unique_seq[j]) | 
|  | 511         if temp!=[]: | 
|  | 512            temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | 
|  | 513            fp.write("*********************************************************************************************************\n") | 
|  | 514            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | 
|  | 515            fp.write("*********************************************************************************************************\n\n") | 
|  | 516            [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | 
|  | 517            fp.write("\n" + "\n") | 
|  | 518     fp.close() | 
|  | 519 | 
|  | 520 | 
|  | 521 ######################################################################################################################### | 
|  | 522 | 
|  | 523 """ | 
|  | 524 | 
|  | 525 This function merges the different names for the same mirna sequence per group (controls, treated) to avoid duplicates | 
|  | 526 | 
|  | 527 """ | 
|  | 528 | 
|  | 529 def merging_names(ini_mat,new): | 
|  | 530 | 
|  | 531     dupes=[] | 
|  | 532     temp_mat =[] | 
|  | 533 | 
|  | 534     for num in range(len(ini_mat)): | 
|  | 535 | 
|  | 536         if ini_mat[num][1] not in temp_mat and ini_mat[num][0] not in temp_mat: | 
|  | 537            temp_mat.append(ini_mat[num][1]) | 
|  | 538            temp_mat.append(ini_mat[num][0]) | 
|  | 539         else: | 
|  | 540            dupes.append(ini_mat[num][1]) | 
|  | 541 | 
|  | 542     dupes=list(set(dupes)) | 
|  | 543 | 
|  | 544     for i in range(len(dupes)): | 
|  | 545         dupes[i]=[dupes[i]] | 
|  | 546 | 
|  | 547     for x in ini_mat: | 
|  | 548         for y in dupes: | 
|  | 549             if x[1]==y[0]: | 
|  | 550                fl=0 | 
|  | 551                if len(y)==1: | 
|  | 552                   y.append(x[0]) | 
|  | 553                else: | 
|  | 554                   for i in range(1,len(y)): | 
|  | 555                       if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | 
|  | 556                          fl=1 | 
|  | 557                          if len(x[0])<len(y[i]): | 
|  | 558                             del y[i] | 
|  | 559                             y.append(x[0]) | 
|  | 560                             break | 
|  | 561 | 
|  | 562                   if fl==0: | 
|  | 563                      y.append((x[0])) | 
|  | 564 | 
|  | 565     for y in dupes: | 
|  | 566         if len(y)>2: | 
|  | 567            for i in range(len(y)-1,1,-1): | 
|  | 568                y[1]=y[1]+"/"+y[i] | 
|  | 569                del y[i] | 
|  | 570 | 
|  | 571 | 
|  | 572     for x in ini_mat: | 
|  | 573         for y in dupes: | 
|  | 574             if x[1]==y[0]: | 
|  | 575                x[0]=y[1] | 
|  | 576 | 
|  | 577     ini_mat.sort() | 
|  | 578     ini_mat=list(ini_mat for ini_mat,_ in itertools.groupby(ini_mat)) | 
|  | 579     new.extend(ini_mat) | 
|  | 580 | 
|  | 581 #################################################################################################################################################################################################################### | 
|  | 582 | 
|  | 583 """ | 
|  | 584 | 
|  | 585 This function exports the count matrices for differential expresion | 
|  | 586 if user chose analysis with non-templated miRNAs detection | 
|  | 587 | 
|  | 588 """ | 
|  | 589 | 
|  | 590 def nontemp_counts_to_diff(tem_names,tem_samp,non_names,non_samp,folder,pro): | 
|  | 591 | 
|  | 592     for i in range(2,len(tem_samp[0])): | 
|  | 593 | 
|  | 594        fp = open(folder+tem_names[i-2]+'.txt','w') | 
|  | 595        fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | 
|  | 596 | 
|  | 597        for x in tem_samp: | 
|  | 598            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 599 | 
|  | 600        for j in range(len(non_names)): | 
|  | 601            if non_names[j]==tem_names[i-2]: | 
|  | 602               for x in non_samp: | 
|  | 603                   fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | 
|  | 604        fp.close() | 
|  | 605 | 
|  | 606 ################################################################################################################################################################################################################# | 
|  | 607 | 
|  | 608 """ | 
|  | 609 | 
|  | 610 This function exports the count matrices for differential expresion | 
|  | 611 if user chose analysis only with templated miRNAs detection | 
|  | 612 | 
|  | 613 """ | 
|  | 614 | 
|  | 615 def temp_counts_to_diff(names,samp,folder,pro): | 
|  | 616 | 
|  | 617     for i in range(2,len(samp[0])): | 
|  | 618 | 
|  | 619        fp = open(folder+names[i-2]+'.txt','w') | 
|  | 620        fp.write("miRNA id"+"\t"+names[i-2]+"\n") | 
|  | 621 | 
|  | 622        for x in samp: | 
|  | 623            fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | 
|  | 624        fp.close() | 
|  | 625 | 
|  | 626 ################################################################################################################################################################################################################# | 
|  | 627 | 
|  | 628 """ | 
|  | 629 | 
|  | 630 This function downloads the fasta files from MirGene site | 
|  | 631 with ref miRNAs and star miRNAs sequences and merges them | 
|  | 632 into one list | 
|  | 633 | 
|  | 634 """ | 
|  | 635 | 
|  | 636 def download_matures(matures,org_name): | 
|  | 637 | 
|  | 638     mature_mir=[] | 
|  | 639 | 
|  | 640     mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' | 
|  | 641     star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' | 
|  | 642 | 
|  | 643     data = urllib.request.urlopen(mat_url).read() | 
|  | 644     file_mirna = data.decode('utf-8') | 
|  | 645     mature_mir = file_mirna.split("\n") | 
|  | 646     mature_mir = [x.replace(">","") for x in mature_mir] | 
|  | 647     del mature_mir[-1] | 
|  | 648 | 
|  | 649     data = urllib.request.urlopen(star_url).read() | 
|  | 650     file_mirna = data.decode('utf-8') | 
|  | 651     star_mir = file_mirna.split("\n") | 
|  | 652     star_mir = [x.replace(">","") for x in star_mir] | 
|  | 653     del star_mir[-1] | 
|  | 654 | 
|  | 655     mature_mir.extend(star_mir) | 
|  | 656 | 
|  | 657     for i in range(1,len(mature_mir),2): | 
|  | 658         mature_mir[i]=mature_mir[i].replace("U","T") | 
|  | 659 | 
|  | 660     matures.extend(mature_mir) | 
|  | 661 | 
|  | 662 ################################################################################################################### | 
|  | 663 | 
|  | 664 """ | 
|  | 665 | 
|  | 666 This function detects the templated isoforms from the 1st part of analysis | 
|  | 667 These isoforms and ref miRNAs will be used for the detection of non-templated miRNAs | 
|  | 668 | 
|  | 669 """ | 
|  | 670 | 
|  | 671 def non_template_ref(sc,st,all_isoforms): | 
|  | 672 | 
|  | 673   pre_uni_seq_con = list(sc) | 
|  | 674   pre_uni_seq_tre = list(st) | 
|  | 675 | 
|  | 676   for x in pre_uni_seq_con: | 
|  | 677       for y in x: | 
|  | 678           if y[2] not in all_isoforms and "_t_" in y[2]: | 
|  | 679              all_isoforms.append(y[2]) | 
|  | 680              all_isoforms.append(y[9]) | 
|  | 681 | 
|  | 682   for x in pre_uni_seq_tre: | 
|  | 683       for y in x: | 
|  | 684           if y[2] not in all_isoforms and "_t_" in y[2]: | 
|  | 685              all_isoforms.append(y[2]) | 
|  | 686              all_isoforms.append(y[9]) | 
|  | 687 | 
|  | 688 ################################################################################################################################################################################################ | 
|  | 689 | 
|  | 690 """ | 
|  | 691 | 
|  | 692 This function adds uncommon detected mirnas among the samples with zeros as counts | 
|  | 693 | 
|  | 694 """ | 
|  | 695 | 
|  | 696 def uncommon_mirnas(sample,mir_names,l,new_d,sample_name,sample_order): | 
|  | 697 | 
|  | 698     for y in mir_names: | 
|  | 699         flag=0 | 
|  | 700         for x in sample: | 
|  | 701             if y[0]==x[0]: # check if miRNA exists in the sample | 
|  | 702                flag=1 | 
|  | 703                break | 
|  | 704         if flag==0: | 
|  | 705            sample.append([y[0],"0",y[1]]) # add the name of mirna to the sample with zero counts and its sequence | 
|  | 706 | 
|  | 707     # sorting and remove duplicates | 
|  | 708     sample.sort(key=lambda x: x[0]) | 
|  | 709     sample=list(sample for sample,_ in itertools.groupby(sample)) | 
|  | 710 | 
|  | 711     # Return the updated sample | 
|  | 712     l.acquire() | 
|  | 713     new_d.append(sample) | 
|  | 714     sample_order.append(sample_name) | 
|  | 715     l.release() | 
|  | 716 | 
|  | 717 ############################################################################################################################################################################################### |