comparison gene_family_scaffold_loader.py @ 11:2fac73ec6ee8 draft

Uploaded
author greg
date Tue, 16 Oct 2018 10:57:23 -0400
parents 11a36e425c94
children ef07c8756360
comparison
equal deleted inserted replaced
10:11a36e425c94 11:2fac73ec6ee8
8 import glob 8 import glob
9 import os 9 import os
10 import sys 10 import sys
11 11
12 import psycopg2 12 import psycopg2
13 from sqlalchemy import create_engine, MetaData, Table
13 from sqlalchemy.engine.url import make_url 14 from sqlalchemy.engine.url import make_url
14 15
15 16
16 class ScaffoldLoader(object): 17 class ScaffoldLoader(object):
17 def __init__(self): 18 def __init__(self):
25 self.species_ids_dict = {} 26 self.species_ids_dict = {}
26 self.taxa_lineage_config = None 27 self.taxa_lineage_config = None
27 self.parse_args() 28 self.parse_args()
28 self.fh = open(self.args.output, "w") 29 self.fh = open(self.args.output, "w")
29 self.connect_db() 30 self.connect_db()
31 self.engine = create_engine(self.args.database_connection_string)
32 self.metadata = MetaData(self.engine)
30 33
31 def parse_args(self): 34 def parse_args(self):
32 parser = argparse.ArgumentParser() 35 parser = argparse.ArgumentParser()
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), 36 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'), 37 parser.add_argument('--output', dest='output', help='Output dataset'),
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate 104 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. 105 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
103 1. Parse all of the *.list files in the same directory to populate 106 1. Parse all of the *.list files in the same directory to populate
104 self.scaffold_genes_dict. 107 self.scaffold_genes_dict.
105 """ 108 """
109 self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True)
106 scaffold_id = os.path.basename(self.args.scaffold_path) 110 scaffold_id = os.path.basename(self.args.scaffold_path)
107 file_dir = os.path.join(self.args.scaffold_path, 'annot') 111 file_dir = os.path.join(self.args.scaffold_path, 'annot')
108 # The scaffold naming convention must follow this pattern: 112 # The scaffold naming convention must follow this pattern:
109 # <integer1>Gv<integer2>.<integer3> 113 # <integer1>Gv<integer2>.<integer3>
110 # where integer 1 is the number of genomes in the scaffold_id. For example: 114 # where integer 1 is the number of genomes in the scaffold_id. For example:
149 j_int = int(items[j]) 153 j_int = int(items[j])
150 if j_int > 0: 154 if j_int > 0:
151 # The species has at least 1 gene 155 # The species has at least 1 gene
152 num_species += 1 156 num_species += 1
153 num_genes += j_int 157 num_genes += j_int
154 # Insert a row into the plant_tribes_orthogroup table. 158 # Get the auto-incremented row id to insert a row into
155 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] 159 # the plant_tribes_orthogroup table.
156 for k in range(super_ortho_start_index, len(items)): 160 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
157 args.append('%s' % str(items[k])) 161 cur = self.conn.cursor()
158 sql = """ 162 cur.execute(sql)
159 INSERT INTO plant_tribes_orthogroup 163 plant_tribes_orthogroup_id = cur.fetchone()[0]
160 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); 164
161 """ 165 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
162 cur = self.update(sql, tuple(args)) 166 last_item = len(items)
163 self.flush() 167 for k in range(super_ortho_start_index, last_item):
168 # The last 7 items in this range are as follows.
169 # items[last_item-7]: AHRD Descriptions
170 # items[last_item-6]: TAIR Gene(s) Descriptions
171 # items[last_item-5]: Pfam Domains
172 # items[last_item-4]: InterProScan Descriptions
173 # items[last_item-3]: GO Molecular Functions
174 # items[last_item-2]: GO Biological Processes
175 # items[last_item-1]: GO Cellular Components
176 # We'll translate each of these items into a JSON
177 # dictionary for inserting into the table.
178 if k >= (last_item-7) and k <= last_item:
179 # Here is an example string:
180 # Phosphate transporter PHO1 [0.327] | Phosphate
181 # We'll split the string on " | " to create each value.
182 # The keys will be zero-padded integers to enable sorting.
183 json_dict = dict()
184 json_str = str(items[k])
185 json_vals = json_str.split(' | ')
186 for key_index, json_val in enumerate(json_vals):
187 # The zero-padded key is 0 based (enumerate starts at 0).
188 json_key = '%04d' % key_index
189 json_dict[json_key] = json_val
190 args.append(json_dict)
191 else:
192 args.append('%s' % str(items[k]))
193 sql = self.pto_table.insert().values(args)
194 try:
195 self.engine.execute(sql)
196 except Exception as e:
197 msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e)
198 self.stop_err(msg)
164 i += 1 199 i += 1
165 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) 200 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
166 for file_name in glob.glob(os.path.join(file_dir, "*list")): 201 for file_name in glob.glob(os.path.join(file_dir, "*list")):
167 items = os.path.basename(file_name).split(".") 202 items = os.path.basename(file_name).split(".")
168 clustering_method = items[0] 203 clustering_method = items[0]