6
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os
|
|
4 import tempfile
|
|
5
|
|
6 # Internal dependencies
|
|
7 from Datatype import Datatype
|
|
8 from util import subtools
|
|
9
|
|
class InfoModifiedGtf():
    """Holds the outcome of a GTF integrity check.

    is_modified:          True if at least one line was removed/changed.
    array_modified_lines: 1-based line numbers of the removed lines.
    """

    def __init__(self, is_modified=False, array_modified_lines=None):
        # BUGFIX: the previous default `array_modified_lines=[]` was a
        # mutable default argument, shared by every instance created
        # without an explicit list. Use a None sentinel instead.
        self.is_modified = is_modified
        if array_modified_lines is None:
            array_modified_lines = []
        self.array_modified_lines = array_modified_lines

    def get_str_modified_lines(self):
        """Return the removed line numbers as a comma-separated string."""
        return ','.join(map(str, self.array_modified_lines))
|
|
17
|
|
class Gtf( Datatype ):
    """TrackHub datatype for a GTF annotation file.

    Validates the GTF coordinates against the assembly's chrom.sizes,
    converts it through genePred/bigGenePred to a bigBed file, and
    registers the result as a track.
    """

    def __init__( self, input_gtf_false_path, data_gtf):
        """
        :param input_gtf_false_path: path to the uploaded GTF file on disk
        :param data_gtf: dict with keys "name", "order_index",
                         "track_color" and "group_name"
        """
        super(Gtf, self).__init__()

        self.track = None

        self.input_gtf_false_path = input_gtf_false_path
        self.name_gtf = data_gtf["name"]
        self.priority = data_gtf["order_index"]
        self.track_color = data_gtf["track_color"]
        # TODO: Think about how to avoid repetition of the group_name everywhere
        self.group_name = data_gtf["group_name"]

        # TODO: See if we need these temporary files as part of the generated files
        genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred")
        unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred")
        sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred")

        # Check the integrity of the input; on problems this rewrites
        # self.input_gtf_false_path to point at a fixed temporary copy.
        modified_gtf = self._checkAndFixGtf()

        # Conversion pipeline: GTF -> genePred -> bigGenePred -> sorted -> bigBed
        subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name)

        # TODO: From there, refactor because common use with Gff3.py
        subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name)

        subtools.sort(unsorted_bigGenePred_file.name, sorted_bigGenePred_file.name)

        # bedToBigBed processing
        trackName = "".join( ( self.name_gtf, ".bb") )

        auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as')

        myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName)

        with open(myBigBedFilePath, 'w') as bigBedFile:
            subtools.bedToBigBed(sorted_bigGenePred_file.name,
                                 self.chromSizesFile.name,
                                 bigBedFile.name,
                                 autoSql=auto_sql_option,
                                 typeOption='bed12+8',
                                 tab=True)

        # Create the Track Object
        self.createTrack(file_path=trackName,
                         track_name=trackName,
                         long_label=self.name_gtf, track_type='bigGenePred',
                         visibility='dense', priority=self.priority,
                         track_file=myBigBedFilePath,
                         track_color=self.track_color,
                         group_name=self.group_name)

        # TODO: Use Logging instead of print
        if modified_gtf.is_modified:
            print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues."
                  % self.name_gtf)
            print("Here are the lines removed: " + modified_gtf.get_str_modified_lines())
        else:
            print("- Gtf %s created" % self.name_gtf)

    def _checkAndFixGtf(self):
        """
        Check the integrity of the gtf file: if coordinates exceed the
        chromosome size, either remove the whole line(s) or truncate to the
        end of the scaffold depending on the user choice.
        default: remove the whole line(s)

        :return: an InfoModifiedGtf describing what (if anything) was removed
        """
        # Records whether we had to modify the file and which lines were dropped
        modified_gtf = InfoModifiedGtf()

        # Create a temp gtf just in case we have issues
        temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False)

        # TODO: Get the user choice and use it
        # Get the chrom.sizes into a dictionary to have a faster access
        # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
        dict_chrom_sizes = {}
        with open(self.chromSizesFile.name, 'r') as chromSizes:
            for line in chromSizes:
                fields = line.split()
                # fields[0] is the name of the scaffold,
                # fields[1] is the size of the scaffold
                # TODO: Ensure this is true for all lines
                # BUGFIX: store the size as int so the coordinate checks below
                # are numeric; previously strings were compared
                # lexicographically (e.g. "9" > "100").
                dict_chrom_sizes[fields[0]] = int(fields[1])

        # Parse the GTF and check each line using the chrom sizes dictionary
        with open(temp_gtf.name, 'a+') as tmp:
            with open(self.input_gtf_false_path, 'r') as gtf:
                for index, line in enumerate(gtf):
                    # Comment lines are copied through untouched
                    if line.startswith('#'):
                        tmp.write(line)
                        continue

                    fields = line.split()
                    # fields[0] => Seqname (scaffold)
                    # fields[3] => Start coordinate (1-based)
                    # fields[4] => End coordinate
                    scaffold_size = dict_chrom_sizes[fields[0]]
                    # BUGFIX: compare coordinates as integers, not strings
                    start_position = int(fields[3])
                    end_position = int(fields[4])

                    if start_position > 0 and end_position <= scaffold_size:
                        # The coordinates are valid: copy the line as-is.
                        # BUGFIX: no extra os.linesep — `line` already ends
                        # with its newline; the old double write injected
                        # blank lines into the rewritten GTF.
                        tmp.write(line)
                    else:
                        # The coordinates are not good; process regarding the
                        # user choice.
                        # TODO: Process the user choice
                        # Default choice: remove the line (don't copy it) and
                        # save its 1-based number for the feedback to the user.
                        modified_gtf.array_modified_lines.append(index + 1)
                        modified_gtf.is_modified = True

        # Once the process is completed, we just replace the path of the gtf
        self.input_gtf_false_path = temp_gtf.name

        # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False

        return modified_gtf
|