Mercurial > repos > artbio > small_rna_map
view small_rna_map.py @ 3:2e0dc6032a98 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
author | artbio |
---|---|
date | Tue, 18 Jul 2017 13:34:36 -0400 |
parents | 1ad5d040f85f |
children | d65045e976e6 |
line wrap: on
line source
import argparse from collections import defaultdict import numpy import pysam def Parser(): the_parser = argparse.ArgumentParser() the_parser.add_argument('--input', dest='input', required=True, nargs='+', help='input BAM files') the_parser.add_argument('--sample_name', dest='sample_name', required=True, nargs='+', help='sample name') the_parser.add_argument('--output', action="store", type=str, help='output tabular file') args = the_parser.parse_args() return args def compute_map_dictionary(pysam_object): ''' returns - map_dictionary {(read_chromosome,read_position,polarity): [list_of_read_length]} - max_dictionary {chromosome_name: max_of_number_of_read_at_any_position} ''' unmatched_chromosomes = list(pysam_object.references) map_dictionary = defaultdict(list) max_dictionary = dict.fromkeys(pysam_object.references, 0) for read in pysam_object: if not read.is_unmapped: read_chromosome = read.reference_name # to shift from 0-based to 1-based coordinate read_position = read.pos + 1 polarity = "F" if (not read.is_reverse) else "R" map_dictionary[(read_chromosome, read_position, polarity)].append(read.qlen) if read_chromosome in unmatched_chromosomes: unmatched_chromosomes.remove(read_chromosome) for name in unmatched_chromosomes: # to put at least a null value by chromosome map_dictionary[(name, 1, 'F')] = [0] # make a dictionary with max value of reads by chromosome max_dictionary = defaultdict(int) for name in pysam_object.references: for key in map_dictionary.keys(): if name in key and len(map_dictionary[key]) > max_dictionary[name]: max_dictionary[name] = len(map_dictionary[key]) return map_dictionary, max_dictionary def main(inputs, sample_names, output): with open(output, 'w') as outfile: outfile.write('\t'.join(['Dataset', 'Chromosome', 'Chrom_length', 'Coordinate', 'Nbr_reads', 'Polarity', 'Max', 'Mean', 'Median\n'])) chr_lengths = dict(zip(pysam.AlignmentFile(inputs[0], 'rb').references, pysam.AlignmentFile(inputs[0], 'rb').lengths)) for file, sample in zip(inputs, sample_names): with pysam.AlignmentFile(file, 'rb') as bamfile: map, maximum = compute_map_dictionary(bamfile) for record in sorted(map): line = [sample, record[0], str(chr_lengths[record[0]]), str(record[1]), str(len(map[record])), record[2], str(maximum[record[0]]), str(round(numpy.mean(map[record]), 1)), str(numpy.median(map[record]))] outfile.write("\t".join(line) + "\n") if __name__ == "__main__": args = Parser() # if identical sample names # to be tested if len(set(args.sample_name)) != len(args.sample_name): args.sample_name = [name + '_' + str(i) for i, name in enumerate(args.sample_name)] main(args.input, args.sample_name, args.output)