plant_tribes_gene_family_scaffold_loader: gene_family_scaffold

comparison gene_family_scaffold_loader.py @ 2:751b36922d59 draft

Uploaded

author	greg
date	Fri, 25 May 2018 10:18:13 -0400
parents	177b62db4320
children	48c13482e6c9

comparison

equal deleted inserted replaced

-:488bf95641d2
+:751b36922d59
 clustering_method = items[0]
 # Save all clustering methods for later processing.
 if clustering_method not in self.clustering_methods:
 self.clustering_methods.append(clustering_method)
 # Insert a row in to the plant_tribes_scaffold table.
-self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method))
+self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
 args = [scaffold_id, clustering_method]
 sql = """
 INSERT INTO plant_tribes_scaffold
 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
 RETURNING id;
 cur = self.update(sql, tuple(args))
 self.flush()
 scaffold_id_db = cur.fetchone()[0]
 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
 with open(file_name, "r") as fh:
-for i, line in enumerate(fh):
+i = 0
-if i == 0:
+for i2, line in enumerate(fh):
+if i2 == 0:
 # Skip first line.
 continue
 num_genes = 0
 num_species = 0
 items = line.split("\t")
 if j_int > 0:
 # The  species has at least 1 gene
 num_species += 1
 num_genes += j_int
 # Insert a row into the plant_tribes_orthogroup table.
-self.log("Inserting a row into the plant_tribes_orthogroup table...")
 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
 for k in range(super_ortho_start_index, len(items)):
 args.append('%s' % str(items[k]))
 sql = """
 INSERT INTO plant_tribes_orthogroup
 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
 """
 cur = self.update(sql, tuple(args))
 self.flush()
+i += 1
+self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
 for file_name in glob.glob(os.path.join(file_dir, "*list")):
 items = os.path.basename(file_name).split(".")
 clustering_method = items[0]
 with open(file_name, "r") as fh:
 for i, line in enumerate(fh):
 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
 # Populate the plant_tribes_taxon table.
 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
 self.log("Processing taxa lineage config: %s" % str(file_name))
 with open(file_name, "r") as fh:
-for i, line in enumerate(fh):
+for line in fh:
 line = line.strip()
 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
 # Skip blank lines, comments and section headers.
 continue
 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
 items = line.split("\t")
 species_name = items[0]
-self.log("Calculating the number of genes for species_name: %s" % str(species_name))
+i = 0
 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
 # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
 species_genes_dict_key_items = species_genes_dict_key.split("^^")
 clustering_method = species_genes_dict_key_items[0]
 species_code = species_genes_dict_key_items[1]
 INSERT INTO plant_tribes_taxon
 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
 """
 self.update(sql, tuple(args))
 self.flush()
+i += 1
+self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))  # Two placeholders need two values: the row counter i and the species name.
 def process_orthogroup_fasta_files(self):
 """
 1. Analyze all of the scaffold .fna and .faa files for each clustering
 method to populate the aa_dict and dna_dict sequence dictionaries.
 # above will be the sequence associated with that gene until
 # the next gene id line is encountered.
 sequence = adict[combined_id]
 sequence = "%s%s" % (sequence, line)
 adict[combined_id] = sequence
-# Populate the plant_tribes_gene and gen_scaffold_association tables
+# Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables
 # from the contents of aa_dict and dna_dict.
-for combined_id in sorted(dna_dict.keys()):
+self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.")
+gi = 0
+for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
 # The dictionary keys combine the orthogroup_id, clustering method and
 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
 items = combined_id.split("^^")
 orthogroup_id = items[0]
 clustering_method = items[1]
 gene_id = items[2]
-self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
 # The value will be a list containing both
 # clustering_method and the dna string.
 dna_sequence = dna_dict[combined_id]
 aa_sequence = aa_dict[combined_id]
 # Get the species_code from the gene_id.
 RETURNING id;
 """
 cur = self.update(sql, tuple(args))
 self.flush()
 gene_id_db = cur.fetchone()[0]
+gi += 1
 # Insert a row into the gene_scaffold_orthogroup_association table.
 # Get the scaffold_rec for the current scaffold_id and clustering_method.
 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
 for scaffold_rec in self.scaffold_recs:
 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
 INSERT INTO gene_scaffold_orthogroup_association
 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s);
 """
 cur = self.update(sql, tuple(args))
 self.flush()
+if gsoai % 1000 == 0:
+self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.")
+self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
+self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai)
 # Script entry point: build a ScaffoldLoader, run it, then call shutdown().
 if __name__ == '__main__':
 scaffold_loader = ScaffoldLoader()
 scaffold_loader.run()
 scaffold_loader.shutdown()

Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader

comparison gene_family_scaffold_loader.py @ 2:751b36922d59 draft