Mercurial > repos > sanbi-uwc > vcf_to_alignment
comparison vcf_to_msa.py @ 3:62fbd3f96b30 draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
| author | sanbi-uwc |
|---|---|
| date | Fri, 03 Feb 2017 05:56:11 -0500 |
| parents | a0c85f2d74a5 |
| children | f58178c0f00d |
comparison
equal
deleted
inserted
replaced
| 2:a0c85f2d74a5 | 3:62fbd3f96b30 |
|---|---|
| 30 | 30 |
| 31 parser = argparse.ArgumentParser() | 31 parser = argparse.ArgumentParser() |
| 32 parser.add_argument('--vcf_files', nargs="+") | 32 parser.add_argument('--vcf_files', nargs="+") |
| 33 parser.add_argument('--reference_file', type=argparse.FileType()) | 33 parser.add_argument('--reference_file', type=argparse.FileType()) |
| 34 parser.add_argument('--output_file', type=argparse.FileType('w')) | 34 parser.add_argument('--output_file', type=argparse.FileType('w')) |
| 35 parser.add_argument('--remove_invariant', action='store_true', default=False) | |
| 35 args = parser.parse_args() | 36 args = parser.parse_args() |
| 36 | 37 |
| 37 do_inserts = False | 38 do_inserts = False |
| 38 do_deletes = False | 39 do_deletes = False |
| 39 do_snps = True | 40 do_snps = True |
| 41 if (do_inserts or do_deletes) and args.remove_invariant: | |
| 42 exit("Cannot combine indel processing with 'remove_invariant' argument") | |
| 40 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) | 43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) |
| 41 # print(reference, file=open('/tmp/reference.txt', 'w')) | 44 # print(reference, file=open('/tmp/reference.txt', 'w')) |
| 42 # vcf_files_dir = os.path.expanduser("~/Data/vcf") | 45 # vcf_files_dir = os.path.expanduser("~/Data/vcf") |
| 43 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] | 46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] |
| 44 # print(vcf_files) | 47 # print(vcf_files) |
| 48 insertions = {} | 51 insertions = {} |
| 49 insertion_sites = [] | 52 insertion_sites = [] |
| 50 tree = intervaltree.IntervalTree() | 53 tree = intervaltree.IntervalTree() |
| 51 sequence_names = [] | 54 sequence_names = [] |
| 52 sequences = {} | 55 sequences = {} |
| 56 if args.remove_invariant: | |
| 57 variant_sites = set() | |
| 53 for i, vcf_descriptor in enumerate(args.vcf_files): | 58 for i, vcf_descriptor in enumerate(args.vcf_files): |
| 54 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] | 59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] |
| 55 (seqname,vcf_filename) = vcf_descriptor.split('^^^') | 60 (seqname,vcf_filename) = vcf_descriptor.split('^^^') |
| 56 sequence_names.append(seqname) | 61 sequence_names.append(seqname) |
| 57 sequence = list(reference) | 62 sequence = list(reference) |
| 67 type="snp" | 72 type="snp" |
| 68 try: | 73 try: |
| 69 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele | 74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele |
| 70 except IndexError as e: | 75 except IndexError as e: |
| 71 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | 76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) |
| 77 if args.remove_invariant: | |
| 78 variant_sites.add(record.affected_start) | |
| 72 count += 1 | 79 count += 1 |
| 73 elif record.is_indel: | 80 elif record.is_indel: |
| 74 length = record.affected_end - record.affected_start | 81 length = record.affected_end - record.affected_start |
| 75 if record.is_deletion and do_deletes: | 82 if record.is_deletion and do_deletes: |
| 76 type="del" | 83 type="del" |
| 96 # end = max([existing_interval.end, interval.end]) | 103 # end = max([existing_interval.end, interval.end]) |
| 97 # tree.remove(existing_interval) | 104 # tree.remove(existing_interval) |
| 98 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) | 105 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) |
| 99 # tree.add(new_interval) | 106 # tree.add(new_interval) |
| 100 | 107 |
| 101 SeqIO.write(reference_seq, args.output_file, "fasta") | 108 if args.remove_invariant: |
| 109 reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites]) | |
| 110 reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description) | |
| 111 SeqIO.write(reference_seq_variant, args.output_file, "fasta") | |
| 112 else: | |
| 113 SeqIO.write(reference_seq, args.output_file, "fasta") | |
| 114 | |
| 102 offset = 0 | 115 offset = 0 |
| 116 sequence_length = 0 | |
| 103 for name in sequence_names: | 117 for name in sequence_names: |
| 104 sequence = sequences[name] | 118 sequence = sequences[name] |
| 119 sequence_length = len(sequence) | |
| 105 for site in sorted(insertion_sites, key=itemgetter(0)): | 120 for site in sorted(insertion_sites, key=itemgetter(0)): |
| 106 (start, end, allele, seqname) = site | 121 (start, end, allele, seqname) = site |
| 107 # print(start, allele, seqname) | 122 # print(start, allele, seqname) |
| 108 length = len(allele) | 123 length = len(allele) |
| 109 # start += offset | 124 # start += offset |
| 114 sequence[start:end] = list(str(allele)) | 129 sequence[start:end] = list(str(allele)) |
| 115 else: | 130 else: |
| 116 sequence[start:end] = ['-'] * length | 131 sequence[start:end] = ['-'] * length |
| 117 except IndexError as e: | 132 except IndexError as e: |
| 118 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) | 133 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) |
| 119 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | 134 |
| 135 if args.remove_invariant: | |
| 136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) | |
| 137 else: | |
| 138 seq_str = ''.join(sequence) | |
| 139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | |
| 120 | 140 |
| 121 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) | 141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) |
| 122 # output.close() | 142 # output.close() |
| 123 | 143 |
| 124 args.output_file.close() | 144 args.output_file.close() |
