Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
comparison gene_family_scaffold_loader.py @ 2:751b36922d59 draft
Uploaded
author | greg |
---|---|
date | Fri, 25 May 2018 10:18:13 -0400 |
parents | 177b62db4320 |
children | 48c13482e6c9 |
comparison
equal
deleted
inserted
replaced
1:488bf95641d2 | 2:751b36922d59 |
---|---|
118 clustering_method = items[0] | 118 clustering_method = items[0] |
119 # Save all clustering methods for later processing. | 119 # Save all clustering methods for later processing. |
120 if clustering_method not in self.clustering_methods: | 120 if clustering_method not in self.clustering_methods: |
121 self.clustering_methods.append(clustering_method) | 121 self.clustering_methods.append(clustering_method) |
122 # Insert a row in to the plant_tribes_scaffold table. | 122 # Insert a row in to the plant_tribes_scaffold table. |
123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) | 123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method)) |
124 args = [scaffold_id, clustering_method] | 124 args = [scaffold_id, clustering_method] |
125 sql = """ | 125 sql = """ |
126 INSERT INTO plant_tribes_scaffold | 126 INSERT INTO plant_tribes_scaffold |
127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | 127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) |
128 RETURNING id; | 128 RETURNING id; |
130 cur = self.update(sql, tuple(args)) | 130 cur = self.update(sql, tuple(args)) |
131 self.flush() | 131 self.flush() |
132 scaffold_id_db = cur.fetchone()[0] | 132 scaffold_id_db = cur.fetchone()[0] |
133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) | 133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) |
134 with open(file_name, "r") as fh: | 134 with open(file_name, "r") as fh: |
135 for i, line in enumerate(fh): | 135 i = 0 |
136 if i == 0: | 136 for i2, line in enumerate(fh): |
| 137 if i2 == 0: |
137 # Skip first line. | 138 # Skip first line. |
138 continue | 139 continue |
139 num_genes = 0 | 140 num_genes = 0 |
140 num_species = 0 | 141 num_species = 0 |
141 items = line.split("\t") | 142 items = line.split("\t") |
148 if j_int > 0: | 149 if j_int > 0: |
149 # The species has at least 1 gene | 150 # The species has at least 1 gene |
150 num_species += 1 | 151 num_species += 1 |
151 num_genes += j_int | 152 num_genes += j_int |
152 # Insert a row into the plant_tribes_orthogroup table. | 153 # Insert a row into the plant_tribes_orthogroup table. |
153 self.log("Inserting a row into the plant_tribes_orthogroup table...") | |
154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | 154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] |
155 for k in range(super_ortho_start_index, len(items)): | 155 for k in range(super_ortho_start_index, len(items)): |
156 args.append('%s' % str(items[k])) | 156 args.append('%s' % str(items[k])) |
157 sql = """ | 157 sql = """ |
158 INSERT INTO plant_tribes_orthogroup | 158 INSERT INTO plant_tribes_orthogroup |
159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | 159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); |
160 """ | 160 """ |
161 cur = self.update(sql, tuple(args)) | 161 cur = self.update(sql, tuple(args)) |
162 self.flush() | 162 self.flush() |
| 163 i += 1 |
| 164 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) |
163 for file_name in glob.glob(os.path.join(file_dir, "*list")): | 165 for file_name in glob.glob(os.path.join(file_dir, "*list")): |
164 items = os.path.basename(file_name).split(".") | 166 items = os.path.basename(file_name).split(".") |
165 clustering_method = items[0] | 167 clustering_method = items[0] |
166 with open(file_name, "r") as fh: | 168 with open(file_name, "r") as fh: |
167 for i, line in enumerate(fh): | 169 for i, line in enumerate(fh): |
228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] | 230 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] |
229 # Populate the plant_tribes_taxon table. | 231 # Populate the plant_tribes_taxon table. |
230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) | 232 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) |
231 self.log("Processing taxa lineage config: %s" % str(file_name)) | 233 self.log("Processing taxa lineage config: %s" % str(file_name)) |
232 with open(file_name, "r") as fh: | 234 with open(file_name, "r") as fh: |
233 for i, line in enumerate(fh): | 235 for line in fh: |
234 line = line.strip() | 236 line = line.strip() |
235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): | 237 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): |
236 # Skip blank lines, comments and section headers. | 238 # Skip blank lines, comments and section headers. |
237 continue | 239 continue |
238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | 240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots |
239 items = line.split("\t") | 241 items = line.split("\t") |
240 species_name = items[0] | 242 species_name = items[0] |
241 self.log("Calculating the number of genes for species_name: %s" % str(species_name)) | 243 i = 0 |
242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): | 244 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): |
243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. | 245 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. |
244 species_genes_dict_key_items = species_genes_dict_key.split("^^") | 246 species_genes_dict_key_items = species_genes_dict_key.split("^^") |
245 clustering_method = species_genes_dict_key_items[0] | 247 clustering_method = species_genes_dict_key_items[0] |
246 species_code = species_genes_dict_key_items[1] | 248 species_code = species_genes_dict_key_items[1] |
261 INSERT INTO plant_tribes_taxon | 263 INSERT INTO plant_tribes_taxon |
262 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); | 264 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); |
263 """ | 265 """ |
264 self.update(sql, tuple(args)) | 266 self.update(sql, tuple(args)) |
265 self.flush() | 267 self.flush() |
| 268 i += 1 |
| 269 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % str(species_name)) |
266 | 270 |
267 def process_orthogroup_fasta_files(self): | 271 def process_orthogroup_fasta_files(self): |
268 """ | 272 """ |
269 1. Analyze all of the scaffold .fna and .faa files for each clustering | 273 1. Analyze all of the scaffold .fna and .faa files for each clustering |
270 method to populate the aa_dict and dna_dict sequence dictionaries. | 274 method to populate the aa_dict and dna_dict sequence dictionaries. |
311 # above will be the sequence associated with that gene until | 315 # above will be the sequence associated with that gene until |
312 # the next gene id line is encountered. | 316 # the next gene id line is encountered. |
313 sequence = adict[combined_id] | 317 sequence = adict[combined_id] |
314 sequence = "%s%s" % (sequence, line) | 318 sequence = "%s%s" % (sequence, line) |
315 adict[combined_id] = sequence | 319 adict[combined_id] = sequence |
316 # Populate the plant_tribes_gene and gen_scaffold_association tables | 320 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables |
317 # from the contents of aa_dict and dna_dict. | 321 # from the contents of aa_dict and dna_dict. |
318 for combined_id in sorted(dna_dict.keys()): | 322 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.") |
| 323 gi = 0 |
| 324 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): |
319 # The dictionary keys combine the orthogroup_id, clustering method and | 325 # The dictionary keys combine the orthogroup_id, clustering method and |
320 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | 326 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. |
321 items = combined_id.split("^^") | 327 items = combined_id.split("^^") |
322 orthogroup_id = items[0] | 328 orthogroup_id = items[0] |
323 clustering_method = items[1] | 329 clustering_method = items[1] |
324 gene_id = items[2] | 330 gene_id = items[2] |
325 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) | |
326 # The value will be a list containing both | 331 # The value will be a list containing both |
327 # clustering_method and the dna string. | 332 # clustering_method and the dna string. |
328 dna_sequence = dna_dict[combined_id] | 333 dna_sequence = dna_dict[combined_id] |
329 aa_sequence = aa_dict[combined_id] | 334 aa_sequence = aa_dict[combined_id] |
330 # Get the species_code from the gene_id. | 335 # Get the species_code from the gene_id. |
362 RETURNING id; | 367 RETURNING id; |
363 """ | 368 """ |
364 cur = self.update(sql, tuple(args)) | 369 cur = self.update(sql, tuple(args)) |
365 self.flush() | 370 self.flush() |
366 gene_id_db = cur.fetchone()[0] | 371 gene_id_db = cur.fetchone()[0] |
| 372 gi += 1 |
367 # Insert a row into the gene_scaffold_orthogroup_association table. | 373 # Insert a row into the gene_scaffold_orthogroup_association table. |
368 # Get the scaffold_rec for the current scaffold_id and clustering_method. | 374 # Get the scaffold_rec for the current scaffold_id and clustering_method. |
369 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | 375 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] |
370 for scaffold_rec in self.scaffold_recs: | 376 for scaffold_rec in self.scaffold_recs: |
371 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | 377 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: |
375 INSERT INTO gene_scaffold_orthogroup_association | 381 INSERT INTO gene_scaffold_orthogroup_association |
376 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); | 382 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); |
377 """ | 383 """ |
378 cur = self.update(sql, tuple(args)) | 384 cur = self.update(sql, tuple(args)) |
379 self.flush() | 385 self.flush() |
380 | 386 if gsoai % 1000 == 0: |
| 387 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.") |
| 388 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) |
| 389 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai) |
381 | 390 |
382 if __name__ == '__main__': | 391 if __name__ == '__main__': |
383 scaffold_loader = ScaffoldLoader() | 392 scaffold_loader = ScaffoldLoader() |
384 scaffold_loader.run() | 393 scaffold_loader.run() |
385 scaffold_loader.shutdown() | 394 scaffold_loader.shutdown() |