Mercurial > repos > dave > genebed_maf_to_fasta
comparison interval_maf_to_merged_fasta.py @ 0:b5c3cb24e9de draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genebed_maf_to_fasta/ commit 8d55cabcec17915d959f672ecacfa851df1f4ca4-dirty"
| author | dave |
|---|---|
| date | Mon, 27 Jul 2020 17:54:18 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b5c3cb24e9de |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Reads an interval or gene BED and a MAF Source. | |
| 4 Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates | |
| 5 | |
| 6 Alignment blocks are layered ontop of each other based upon score. | |
| 7 | |
| 8 usage: %prog maf_file [options] | |
| 9 -d, --dbkey=d: Database key, ie hg17 | |
| 10 -c, --chromCol=c: Column of Chr | |
| 11 -s, --startCol=s: Column of Start | |
| 12 -e, --endCol=e: Column of End | |
| 13 -S, --strandCol=S: Column of Strand | |
| 14 -G, --geneBED: Input is a Gene BED file, process and join exons as one region | |
| 15 -t, --mafSourceType=t: Type of MAF source to use | |
| 16 -m, --mafSource=m: Path of source MAF file, if not using cached version | |
| 17 -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version | |
| 18 -i, --interval_file=i: Input interval file | |
| 19 -o, --output_file=o: Output MAF file | |
| 20 -p, --species=p: Species to include in output | |
| 21 -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. | |
| 22 -z, --mafIndexFileDir=z: Directory of local maf_index.loc file | |
| 23 | |
| 24 usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR | |
| 25 """ | |
| 26 # Dan Blankenberg | |
| 27 from __future__ import print_function | |
| 28 | |
| 29 import sys | |
| 30 | |
| 31 import bx.intervals.io | |
| 32 from bx.cookbook import doc_optparse | |
| 33 | |
| 34 from galaxy.tools.util import maf_utilities | |
| 35 | |
| 36 | |
| 37 def stop_err(msg): | |
| 38 sys.exit(msg) | |
| 39 | |
| 40 | |
| 41 def __main__(): | |
| 42 # Parse Command Line | |
| 43 options, args = doc_optparse.parse(__doc__) | |
| 44 mincols = 0 | |
| 45 strand_col = -1 | |
| 46 | |
| 47 if options.dbkey: | |
| 48 primary_species = options.dbkey | |
| 49 else: | |
| 50 primary_species = None | |
| 51 if primary_species in [None, "?", "None"]: | |
| 52 stop_err("You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file.") | |
| 53 | |
| 54 include_primary = True | |
| 55 secondary_species = maf_utilities.parse_species_option(options.species) | |
| 56 if secondary_species: | |
| 57 species = list(secondary_species) # make copy of species list | |
| 58 if primary_species in secondary_species: | |
| 59 secondary_species.remove(primary_species) | |
| 60 else: | |
| 61 include_primary = False | |
| 62 else: | |
| 63 species = None | |
| 64 | |
| 65 if options.interval_file: | |
| 66 interval_file = options.interval_file | |
| 67 else: | |
| 68 stop_err("Input interval file has not been specified.") | |
| 69 | |
| 70 if options.output_file: | |
| 71 output_file = options.output_file | |
| 72 else: | |
| 73 stop_err("Output file has not been specified.") | |
| 74 | |
| 75 if not options.geneBED: | |
| 76 if options.chromCol: | |
| 77 chr_col = int(options.chromCol) - 1 | |
| 78 else: | |
| 79 stop_err("Chromosome column not set, click the pencil icon in the history item to set the metadata attributes.") | |
| 80 | |
| 81 if options.startCol: | |
| 82 start_col = int(options.startCol) - 1 | |
| 83 else: | |
| 84 stop_err("Start column not set, click the pencil icon in the history item to set the metadata attributes.") | |
| 85 | |
| 86 if options.endCol: | |
| 87 end_col = int(options.endCol) - 1 | |
| 88 else: | |
| 89 stop_err("End column not set, click the pencil icon in the history item to set the metadata attributes.") | |
| 90 | |
| 91 if options.strandCol: | |
| 92 strand_col = int(options.strandCol) - 1 | |
| 93 | |
| 94 mafIndexFile = "%s/maf_indexes.loc" % options.mafIndexFileDir | |
| 95 | |
| 96 overwrite_with_gaps = True | |
| 97 if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': | |
| 98 overwrite_with_gaps = False | |
| 99 | |
| 100 # Finish parsing command line | |
| 101 | |
| 102 # get index for mafs based on type | |
| 103 index = index_filename = None | |
| 104 # using specified uid for locally cached | |
| 105 if options.mafSourceType.lower() in ["cached"]: | |
| 106 index = maf_utilities.maf_index_by_uid(options.mafSource, mafIndexFile) | |
| 107 if index is None: | |
| 108 stop_err("The MAF source specified (%s) appears to be invalid." % (options.mafSource)) | |
| 109 elif options.mafSourceType.lower() in ["user"]: | |
| 110 # index maf for use here, need to remove index_file when finished | |
| 111 index, index_filename = maf_utilities.open_or_build_maf_index(options.mafSource, options.mafIndex, species=[primary_species]) | |
| 112 if index is None: | |
| 113 stop_err("Your MAF file appears to be malformed.") | |
| 114 else: | |
| 115 stop_err("Invalid MAF source type specified.") | |
| 116 | |
| 117 # open output file | |
| 118 output = open(output_file, "w") | |
| 119 | |
| 120 if options.geneBED: | |
| 121 region_enumerator = maf_utilities.line_enumerator(open(interval_file, "r").readlines()) | |
| 122 else: | |
| 123 region_enumerator = enumerate(bx.intervals.io.NiceReaderWrapper( | |
| 124 open(interval_file, 'r'), chrom_col=chr_col, start_col=start_col, | |
| 125 end_col=end_col, strand_col=strand_col, fix_strand=True, | |
| 126 return_header=False, return_comments=False)) | |
| 127 | |
| 128 # Step through intervals | |
| 129 regions_extracted = 0 | |
| 130 line_count = 0 | |
| 131 for line_count, line in region_enumerator: | |
| 132 try: | |
| 133 if options.geneBED: # Process as Gene BED | |
| 134 try: | |
| 135 starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed(line) | |
| 136 # create spliced alignment object | |
| 137 alignment = maf_utilities.get_spliced_region_alignment( | |
| 138 index, primary_species, fields[0], starts, ends, | |
| 139 strand='+', species=species, mincols=mincols, | |
| 140 overwrite_with_gaps=overwrite_with_gaps) | |
| 141 primary_name = secondary_name = fields[3] | |
| 142 alignment_strand = fields[5] | |
| 143 except Exception as e: | |
| 144 print("Error loading exon positions from input line %i: %s" % (line_count, e)) | |
| 145 continue | |
| 146 else: # Process as standard intervals | |
| 147 try: | |
| 148 # create spliced alignment object | |
| 149 alignment = maf_utilities.get_region_alignment( | |
| 150 index, primary_species, line.chrom, line.start, | |
| 151 line.end, strand='+', species=species, mincols=mincols, | |
| 152 overwrite_with_gaps=overwrite_with_gaps) | |
| 153 primary_name = "%s(%s):%s-%s" % (line.chrom, line.strand, line.start, line.end) | |
| 154 secondary_name = "" | |
| 155 alignment_strand = line.strand | |
| 156 except Exception as e: | |
| 157 print("Error loading region positions from input line %i: %s" % (line_count, e)) | |
| 158 continue | |
| 159 | |
| 160 # Write alignment to output file | |
| 161 # Output primary species first, if requested | |
| 162 if include_primary: | |
| 163 output.write(">%s.%s\n" % (primary_species, primary_name)) | |
| 164 if alignment_strand == "-": | |
| 165 output.write(alignment.get_sequence_reverse_complement(primary_species)) | |
| 166 else: | |
| 167 output.write(alignment.get_sequence(primary_species)) | |
| 168 output.write("\n") | |
| 169 # Output all remainging species | |
| 170 for spec in secondary_species or alignment.get_species_names(skip=primary_species): | |
| 171 if secondary_name: | |
| 172 output.write(">%s.%s\n" % (spec, secondary_name)) | |
| 173 else: | |
| 174 output.write(">%s\n" % (spec)) | |
| 175 if alignment_strand == "-": | |
| 176 output.write(alignment.get_sequence_reverse_complement(spec)) | |
| 177 else: | |
| 178 output.write(alignment.get_sequence(spec)) | |
| 179 output.write("\n") | |
| 180 | |
| 181 output.write("\n") | |
| 182 regions_extracted += 1 | |
| 183 except Exception as e: | |
| 184 print("Unexpected error from input line %i: %s" % (line_count, e)) | |
| 185 raise | |
| 186 | |
| 187 # close output file | |
| 188 output.close() | |
| 189 | |
| 190 # remove index file if created during run | |
| 191 maf_utilities.remove_temp_index_file(index_filename) | |
| 192 | |
| 193 # Print message about success for user | |
| 194 if regions_extracted > 0: | |
| 195 print("%i regions were processed successfully." % (regions_extracted)) | |
| 196 else: | |
| 197 print("No regions were processed successfully.") | |
| 198 if line_count > 0 and options.geneBED: | |
| 199 print("This tool requires your input file to conform to the 12 column BED standard.") | |
| 200 | |
| 201 | |
| 202 if __name__ == "__main__": | |
| 203 __main__() |
