Mercurial > repos > greg > fasta_extract
comparison fasta_extract.py @ 0:bc3f2a5c7b53 draft
Uploaded
| author | greg |
|---|---|
| date | Sun, 10 Jan 2016 13:03:12 -0500 |
| parents | |
| children | 3fb7f36c2c8a |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bc3f2a5c7b53 |
|---|---|
| 1 import argparse | |
| 2 import csv | |
| 3 import os | |
| 4 import sys | |
| 5 | |
| 6 from fasta_extract_utils import Fasta | |
| 7 | |
| 8 | |
| 9 def reverse_complement(bases): | |
| 10 complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} | |
| 11 return ''.join(complements[b.upper()] for b in reversed(bases)) | |
| 12 | |
| 13 | |
| 14 def get_output_path(hid, subtract_from_start, add_to_end, extend_existing, consider_strand, orphan=False): | |
| 15 attrs = 'u%dd%d' % (subtract_from_start, add_to_end) | |
| 16 if extend_existing: | |
| 17 attrs += 'x' | |
| 18 if consider_strand: | |
| 19 attrs += '_s' | |
| 20 if orphan: | |
| 21 attrs += '_orphan' | |
| 22 format = 'gff' | |
| 23 else: | |
| 24 format = 'fasta' | |
| 25 return os.path.join('output_dir', 'fasta_extract-%s_on_data_%d.%s' % (attrs, hid, format)) | |
| 26 | |
| 27 | |
| 28 def stop_err(msg): | |
| 29 sys.stderr.write(msg) | |
| 30 sys.exit(1) | |
| 31 | |
| 32 | |
| 33 parser = argparse.ArgumentParser() | |
| 34 parser.add_argument('--genome_file', dest='genome_file', help='Reference genome fasta index file.') | |
| 35 parser.add_argument('--subtract_from_start', dest='subtract_from_start', type=int, help='Distance to subtract from start.') | |
| 36 parser.add_argument('--add_to_end', dest='add_to_end', type=int, help='Distance to add to end.') | |
| 37 parser.add_argument('--extend_existing', dest='extend_existing', help='Extend existing start/end rather or from computed midpoint.') | |
| 38 parser.add_argument('--strand', dest='strand', help='Consider strandedness: reverse complement extracted sequence on reverse strand.') | |
| 39 args = parser.parse_args() | |
| 40 | |
| 41 fasta = Fasta(args.genome_file) | |
| 42 | |
| 43 for (input_filename, hid) in args.inputs: | |
| 44 extend_existing = args.extend_existing == 'existing' | |
| 45 consider_strand = args.strand == 'yes' | |
| 46 reader = csv.reader(open(input_filename, 'rU'), delimiter='\t') | |
| 47 fasta_output_path = get_output_path(args.hid, | |
| 48 args.subtract_from_start, | |
| 49 args.add_to_end, | |
| 50 extend_existing, | |
| 51 consider_strand) | |
| 52 output = open(fasta_output_path, 'wb') | |
| 53 gff_output_path = get_output_path(args.hid, | |
| 54 args.subtract_from_start, | |
| 55 args.add_to_end, | |
| 56 extend_existing, | |
| 57 consider_strand, | |
| 58 orphan=True) | |
| 59 orphan_writer = csv.writer(open(gff_output_path, 'wb'), delimiter='\t') | |
| 60 for row in reader: | |
| 61 if len(row) != 9 or row[0].startswith('#'): | |
| 62 continue | |
| 63 try: | |
| 64 cname = row[0] | |
| 65 start = int(row[3]) | |
| 66 end = int(row[4]) | |
| 67 strand = row[6] | |
| 68 if extend_existing: | |
| 69 start -= args.subtract_from_start | |
| 70 end += args.add_to_end | |
| 71 else: | |
| 72 midpoint = (start + end) // 2 | |
| 73 start = midpoint - args.subtract_from_start | |
| 74 end = midpoint + args.add_to_end | |
| 75 if 1 <= start and end <= len(fasta[cname]): | |
| 76 output.write('>%s:%s-%s_%s\n' % (cname, start, end, strand)) | |
| 77 bases = fasta[cname][start-1:end] | |
| 78 if consider_strand and strand == '-': | |
| 79 bases = reverse_complement(bases) | |
| 80 output.write('%s\n' % bases) | |
| 81 else: | |
| 82 orphan_writer.writerow(row) | |
| 83 except Exception, e: | |
| 84 stop_err(str(e)) |
