Mercurial > repos > sanbi-uwc > vcf_to_alignment
diff vcf_to_msa.py @ 3:62fbd3f96b30 draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
author | sanbi-uwc |
---|---|
date | Fri, 03 Feb 2017 05:56:11 -0500 |
parents | a0c85f2d74a5 |
children | f58178c0f00d |
line wrap: on
line diff
--- a/vcf_to_msa.py Wed Feb 01 08:45:12 2017 -0500 +++ b/vcf_to_msa.py Fri Feb 03 05:56:11 2017 -0500 @@ -32,11 +32,14 @@ parser.add_argument('--vcf_files', nargs="+") parser.add_argument('--reference_file', type=argparse.FileType()) parser.add_argument('--output_file', type=argparse.FileType('w')) +parser.add_argument('--remove_invariant', action='store_true', default=False) args = parser.parse_args() do_inserts = False do_deletes = False do_snps = True +if (do_inserts or do_deletes) and args.remove_invariant: + exit("Cannot combine indel processing with 'remove_invariant' argument") # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) # print(reference, file=open('/tmp/reference.txt', 'w')) # vcf_files_dir = os.path.expanduser("~/Data/vcf") @@ -50,6 +53,8 @@ tree = intervaltree.IntervalTree() sequence_names = [] sequences = {} +if args.remove_invariant: + variant_sites = set() for i, vcf_descriptor in enumerate(args.vcf_files): # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] (seqname,vcf_filename) = vcf_descriptor.split('^^^') @@ -69,6 +74,8 @@ sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele except IndexError as e: print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) + if args.remove_invariant: + variant_sites.add(record.affected_start) count += 1 elif record.is_indel: length = record.affected_end - record.affected_start @@ -98,10 +105,18 @@ # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) # tree.add(new_interval) -SeqIO.write(reference_seq, args.output_file, "fasta") +if args.remove_invariant: + reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites]) + reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description) + SeqIO.write(reference_seq_variant, args.output_file, "fasta") +else: + SeqIO.write(reference_seq, args.output_file, "fasta") + offset = 0 +sequence_length = 0 for name in sequence_names: sequence = sequences[name] + sequence_length = len(sequence) for site in sorted(insertion_sites, key=itemgetter(0)): (start, end, allele, seqname) = site # print(start, allele, seqname) @@ -116,7 +131,12 @@ sequence[start:end] = ['-'] * length except IndexError as e: print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) - SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") + + if args.remove_invariant: + seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) + else: + seq_str = ''.join(sequence) + SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) # output.close()