comparison gene_family_scaffold_loader.py @ 2:751b36922d59 draft

Uploaded
author greg
date Fri, 25 May 2018 10:18:13 -0400
parents 177b62db4320
children 48c13482e6c9
comparison
equal deleted inserted replaced
1:488bf95641d2 2:751b36922d59
118 clustering_method = items[0] 118 clustering_method = items[0]
119 # Save all clustering methods for later processing. 119 # Save all clustering methods for later processing.
120 if clustering_method not in self.clustering_methods: 120 if clustering_method not in self.clustering_methods:
121 self.clustering_methods.append(clustering_method) 121 self.clustering_methods.append(clustering_method)
122 # Insert a row in to the plant_tribes_scaffold table. 122 # Insert a row in to the plant_tribes_scaffold table.
123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) 123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
124 args = [scaffold_id, clustering_method] 124 args = [scaffold_id, clustering_method]
125 sql = """ 125 sql = """
126 INSERT INTO plant_tribes_scaffold 126 INSERT INTO plant_tribes_scaffold
127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) 127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
128 RETURNING id; 128 RETURNING id;
130 cur = self.update(sql, tuple(args)) 130 cur = self.update(sql, tuple(args))
131 self.flush() 131 self.flush()
132 scaffold_id_db = cur.fetchone()[0] 132 scaffold_id_db = cur.fetchone()[0]
133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) 133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
134 with open(file_name, "r") as fh: 134 with open(file_name, "r") as fh:
135 for i, line in enumerate(fh): 135 i = 0
136 if i == 0: 136 for i2, line in enumerate(fh):
137 if i2 == 0:
137 # Skip first line. 138 # Skip first line.
138 continue 139 continue
139 num_genes = 0 140 num_genes = 0
140 num_species = 0 141 num_species = 0
141 items = line.split("\t") 142 items = line.split("\t")
148 if j_int > 0: 149 if j_int > 0:
149 # The species has at least 1 gene 150 # The species has at least 1 gene
150 num_species += 1 151 num_species += 1
151 num_genes += j_int 152 num_genes += j_int
152 # Insert a row into the plant_tribes_orthogroup table. 153 # Insert a row into the plant_tribes_orthogroup table.
153 self.log("Inserting a row into the plant_tribes_orthogroup table...")
154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] 154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
155 for k in range(super_ortho_start_index, len(items)): 155 for k in range(super_ortho_start_index, len(items)):
156 args.append('%s' % str(items[k])) 156 args.append('%s' % str(items[k]))
157 sql = """ 157 sql = """
158 INSERT INTO plant_tribes_orthogroup 158 INSERT INTO plant_tribes_orthogroup
159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); 159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
160 """ 160 """
161 cur = self.update(sql, tuple(args)) 161 cur = self.update(sql, tuple(args))
162 self.flush() 162 self.flush()
163 i += 1
164 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
163 for file_name in glob.glob(os.path.join(file_dir, "*list")): 165 for file_name in glob.glob(os.path.join(file_dir, "*list")):
164 items = os.path.basename(file_name).split(".") 166 items = os.path.basename(file_name).split(".")
165 clustering_method = items[0] 167 clustering_method = items[0]
166 with open(file_name, "r") as fh: 168 with open(file_name, "r") as fh:
167 for i, line in enumerate(fh): 169 for i, line in enumerate(fh):
228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] 230 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
229 # Populate the plant_tribes_taxon table. 231 # Populate the plant_tribes_taxon table.
230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) 232 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
231 self.log("Processing taxa lineage config: %s" % str(file_name)) 233 self.log("Processing taxa lineage config: %s" % str(file_name))
232 with open(file_name, "r") as fh: 234 with open(file_name, "r") as fh:
233 for i, line in enumerate(fh): 235 for line in fh:
234 line = line.strip() 236 line = line.strip()
235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): 237 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
236 # Skip blank lines, comments and section headers. 238 # Skip blank lines, comments and section headers.
237 continue 239 continue
238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots 240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
239 items = line.split("\t") 241 items = line.split("\t")
240 species_name = items[0] 242 species_name = items[0]
241 self.log("Calculating the number of genes for species_name: %s" % str(species_name)) 243 i = 0
242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): 244 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. 245 # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
244 species_genes_dict_key_items = species_genes_dict_key.split("^^") 246 species_genes_dict_key_items = species_genes_dict_key.split("^^")
245 clustering_method = species_genes_dict_key_items[0] 247 clustering_method = species_genes_dict_key_items[0]
246 species_code = species_genes_dict_key_items[1] 248 species_code = species_genes_dict_key_items[1]
261 INSERT INTO plant_tribes_taxon 263 INSERT INTO plant_tribes_taxon
262 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); 264 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
263 """ 265 """
264 self.update(sql, tuple(args)) 266 self.update(sql, tuple(args))
265 self.flush() 267 self.flush()
268 i += 1
269 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % str(species_name))
266 270
267 def process_orthogroup_fasta_files(self): 271 def process_orthogroup_fasta_files(self):
268 """ 272 """
269 1. Analyze all of the scaffold .fna and .faa files for each clustering 273 1. Analyze all of the scaffold .fna and .faa files for each clustering
270 method to populate the aa_dict and dna_dict sequence dictionaries. 274 method to populate the aa_dict and dna_dict sequence dictionaries.
311 # above will be the sequence associated with that gene until 315 # above will be the sequence associated with that gene until
312 # the next gene id line is encountered. 316 # the next gene id line is encountered.
313 sequence = adict[combined_id] 317 sequence = adict[combined_id]
314 sequence = "%s%s" % (sequence, line) 318 sequence = "%s%s" % (sequence, line)
315 adict[combined_id] = sequence 319 adict[combined_id] = sequence
316 # Populate the plant_tribes_gene and gen_scaffold_association tables 320 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables
317 # from the contents of aa_dict and dna_dict. 321 # from the contents of aa_dict and dna_dict.
318 for combined_id in sorted(dna_dict.keys()): 322 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.")
323 gi = 0
324 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
319 # The dictionary keys combine the orthogroup_id, clustering method and 325 # The dictionary keys combine the orthogroup_id, clustering method and
320 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. 326 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
321 items = combined_id.split("^^") 327 items = combined_id.split("^^")
322 orthogroup_id = items[0] 328 orthogroup_id = items[0]
323 clustering_method = items[1] 329 clustering_method = items[1]
324 gene_id = items[2] 330 gene_id = items[2]
325 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
326 # The value will be a list containing both 331 # The value will be a list containing both
327 # clustering_method and the dna string. 332 # clustering_method and the dna string.
328 dna_sequence = dna_dict[combined_id] 333 dna_sequence = dna_dict[combined_id]
329 aa_sequence = aa_dict[combined_id] 334 aa_sequence = aa_dict[combined_id]
330 # Get the species_code from the gene_id. 335 # Get the species_code from the gene_id.
362 RETURNING id; 367 RETURNING id;
363 """ 368 """
364 cur = self.update(sql, tuple(args)) 369 cur = self.update(sql, tuple(args))
365 self.flush() 370 self.flush()
366 gene_id_db = cur.fetchone()[0] 371 gene_id_db = cur.fetchone()[0]
372 gi += 1
367 # Insert a row into the gene_scaffold_orthogroup_association table. 373 # Insert a row into the gene_scaffold_orthogroup_association table.
368 # Get the scaffold_rec for the current scaffold_id and clustering_method. 374 # Get the scaffold_rec for the current scaffold_id and clustering_method.
369 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] 375 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
370 for scaffold_rec in self.scaffold_recs: 376 for scaffold_rec in self.scaffold_recs:
371 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: 377 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
375 INSERT INTO gene_scaffold_orthogroup_association 381 INSERT INTO gene_scaffold_orthogroup_association
376 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); 382 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s);
377 """ 383 """
378 cur = self.update(sql, tuple(args)) 384 cur = self.update(sql, tuple(args))
379 self.flush() 385 self.flush()
380 386 if gsoai % 1000 == 0:
387 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.")
388 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
389 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai)
381 390
382 if __name__ == '__main__': 391 if __name__ == '__main__':
383 scaffold_loader = ScaffoldLoader() 392 scaffold_loader = ScaffoldLoader()
384 scaffold_loader.run() 393 scaffold_loader.run()
385 scaffold_loader.shutdown() 394 scaffold_loader.shutdown()