Mercurial > repos > sanbi-uwc > confil
changeset 6:2b90d0574ea5 draft
planemo upload for repository https://github.com/COMBAT-TB/confil commit c84738cfc4876c591d7108229038a4001f836afb
author | sanbi-uwc |
---|---|
date | Tue, 05 Mar 2019 03:49:38 -0500 |
parents | a7b05dd0087d |
children | 96c8c5cada47 |
files | conda/meta.yaml confil.xml confil/confil.py confil/kraken.py confil/report.py setup.py test/test_report.py test/test_runner.py |
diffstat | 8 files changed, 122 insertions(+), 135 deletions(-) [+] |
line wrap: on
line diff
--- a/conda/meta.yaml Mon Mar 04 07:42:05 2019 -0500 +++ b/conda/meta.yaml Tue Mar 05 03:49:38 2019 -0500 @@ -3,8 +3,8 @@ version: {{ environ['VERSION'] }} source: - fn: confil-0.0.1.tar.gz - url: https://github.com/COMBAT-TB/confil/archive/0.0.1.tar.gz + fn: confil-0.1.3.tar.gz + url: https://github.com/COMBAT-TB/confil/archive/0.1.3.tar.gz build: script_env:
--- a/confil.xml Mon Mar 04 07:42:05 2019 -0500 +++ b/confil.xml Tue Mar 05 03:49:38 2019 -0500 @@ -1,6 +1,8 @@ -<tool id="confil" name="Contamination Filter (confil)" version="0.1.2"> +<tool id="confil" name="Contamination Filter (confil)" version="0.1.3"> <requirements> <requirement type="package" version="7.0">click</requirement> + <requirement type="package" version="2.0.7_beta">kraken2</requirement> + <requirement type="package" version="dev20190305">confil</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ #set $input_type = $input_type_conditional.input_type @@ -12,7 +14,7 @@ #set report_name = os.path.splitext(os.path.basename($input_type_conditional.single_input.element_identifier))[0] #set report_name = re.sub('_[0-9]+$', '', str(report_name)) + '.tab' - python $__tool_directory__/confil/confil.py --threads $threads --cutoff $cutoff + confil --threads $threads --cutoff $cutoff $input_type_conditional.single_input.element_identifier && mv $report_name '$output_report' && ln -sf "${input_type_conditional.single_input}" '$single_output_file' @@ -24,7 +26,7 @@ #set report_name = os.path.splitext(os.path.basename($input_type_conditional.collection_input.forward.element_identifier))[0] #set report_name = re.sub('_[0-9]+$', '', str(report_name)) + '.tab' - python $__tool_directory__/confil/confil.py --threads $threads --cutoff $cutoff --paired + confil --threads $threads --cutoff $cutoff --paired $input_type_conditional.collection_input.forward.element_identifier $input_type_conditional.collection_input.reverse.element_identifier && mv $report_name '$output_report' && ln -sf "${input_type_conditional.collection_input.forward}" '$list_output.forward' @@ -76,9 +78,9 @@ <param name="input_type" value="paired_collection" /> <output name="output_report" ftype='tabular' file="seq.tab" /> <output_collection name="list_output" type="paired"> - <element name="forward"> + <element name="forward" value="seq_1.fastq"> </element> - <element name="reverse"> + <element name="reverse" value="seq_2.fastq"> </element> </output_collection> </test>
--- a/confil/confil.py Mon Mar 04 07:42:05 2019 -0500 +++ b/confil/confil.py Tue Mar 05 03:49:38 2019 -0500 @@ -1,17 +1,108 @@ +import distutils.spawn import os +import re +from shlex import split +from subprocess import PIPE, Popen import click -from kraken import kraken_installed, run_kraken - # TODO: Remove KRAKEN2_DEFAULT_DB = "/tools/databases/kraken2/04092018/standard/" OUT_DIR = os.path.abspath(os.curdir) -fastq_file_extensions = ['.fq', '.fastq'] + + +def db_path(): + # Checking DB path + if os.path.exists(KRAKEN2_DEFAULT_DB): + return KRAKEN2_DEFAULT_DB + else: + return OUT_DIR + + +def run_kraken(db, threads, cutoff, paired, seqfiles): + # Using the sample name to track report + seq_name = [os.path.splitext(os.path.basename(seq))[0] + for seq in seqfiles][0] + # remove _ and numbers + seq_name = re.sub('_[0-9]+$', '', seq_name) + # building cmd + cmd = "kraken2 --threads {threads} --db {db} --output {seq_name}.out --report {seq_name}.tab ".format( + threads=threads, db=db, seq_name=seq_name) + if paired: + cmd += "--paired --classified-out {}_cseqs#.fq ".format(seq_name) + cmd += "{seqfiles}".format(seqfiles=' '.join(seqfiles)) + click.secho("Executing kraken2: \n{}\n".format( + split(cmd)), fg='bright_yellow') + + # TODO: remove + # test_file = "https://raw.githubusercontent.com/COMBAT-TB/confil/master/test/test_data/test_file.tab" + # out_file = os.path.join(OUT_DIR, "{}.tab".format(seq_name)) + # mock_cmd = 'wget {} -O {}'.format(test_file, out_file) + # cmd = mock_cmd + # click.secho("Executing mock_cmd: \n{}\n".format(split(cmd)), fg='red') + + p = Popen(split(cmd), stdout=PIPE, stderr=PIPE, close_fds=True) + while True: + output = p.stdout.readline() + if output == '' and p.poll() is not None: + break + if output: + click.echo(output) + returncode = p.poll() + if returncode != 0: + error = p.stderr.readline() + raise OSError("Kraken2 launch error:\n{}\n".format(error)) + # parse kraken report + report_file = os.path.join(OUT_DIR, "{}.tab".format(seq_name)) + parse_report(report_file=report_file, cutoff=cutoff) + return returncode + + +def parse_report(report_file, cutoff): + file_name = os.path.splitext(os.path.basename(report_file))[0] + hit = None + if os.stat(report_file).st_size > 0 and report_file.endswith(".tab"): + click.secho("Processing {} with cutoff of {}...\n".format( + report_file, cutoff), fg='green') + with open(report_file, 'r') as report: + for line in report: + line = [str(e).strip() for e in line.split('\t')] + if len(line) > 1: + click.secho('{}'.format(line), fg='green') + # Percentage of fragments covered by the clade rooted at this taxon + percentage = int(float(line[0])) + # Number of fragments covered by the clade rooted at this taxon + # num_covered = int(float(line[1])) + # Number of fragments assigned directly to this taxon + # num_assigned = int(float(line[2])) + # NCBI taxonomic ID number + # ncbi_tax = int(float(line[3])) + # Indented scientific name (Mycobacterium\n) + name = str(line[5]).strip() + if percentage < cutoff and 'Mycobacterium' in name: + click.secho('\n{}%: {} is contaminated!\n'.format( + percentage, file_name), fg='red') + raise SystemExit('{}%: {} is contaminated!\n'.format( + percentage, file_name)) + if percentage >= cutoff and 'Mycobacterium' in name: + click.secho('\n{}%: {} is not contaminated!\n'.format( + percentage, file_name), fg='green') + hit = line + break + click.secho('Hit: {}'.format(hit), fg='green') + return hit + + +def kraken_installed(): + # check if `kraken2` is in path + installed = distutils.spawn.find_executable("kraken2") + if not installed: + raise OSError("kraken2 is not installed.") + return installed @click.command() -@click.option('--db', default=OUT_DIR, required=True, +@click.option('--db', default=db_path(), required=True, help='Name for Kraken 2 DB', type=click.Path(exists=True), show_default=True) @click.option('--threads', default=1, help='Number of threads',
--- a/confil/kraken.py Mon Mar 04 07:42:05 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -import distutils.spawn -import os -import re -from shlex import split -from subprocess import PIPE, Popen - -import click - -from report import parse_report - -OUT_DIR = os.path.abspath(os.curdir) - - -def kraken_installed(): - # check if `kraken2` is in path - # TODO remove python - installed = distutils.spawn.find_executable("python") - if not installed: - raise OSError("kraken2 is not installed.") - return installed - - -def run_kraken(db, threads, cutoff, paired, seqfiles): - # Using the sample name to track report - seq_name = [os.path.splitext(os.path.basename(seq))[0] - for seq in seqfiles][0] - # remove _ and numbers - seq_name = re.sub('_[0-9]+$', '', seq_name) - # building cmd - cmd = "kraken2 --threads {threads} --db {db} --output {seq_name}.out --report {seq_name}.tab ".format( - threads=threads, db=db, seq_name=seq_name) - if paired: - cmd += "--paired --classified-out {}_cseqs#.fq ".format(seq_name) - cmd += "{seqfiles}".format(seqfiles=' '.join(seqfiles)) - click.secho("Executing kraken2: \n{}\n".format( - split(cmd)), fg='bright_yellow') - - # TODO: remove - test_file = "https://raw.githubusercontent.com/COMBAT-TB/confil/master/test/test_data/test_file.tab" - out_file = os.path.join(OUT_DIR, "{}.tab".format(seq_name)) - mock_cmd = 'wget {} -O {}'.format(test_file, out_file) - cmd = mock_cmd - click.secho("Executing mock_cmd: \n{}\n".format(split(cmd)), fg='red') - - p = Popen(split(cmd), stdout=PIPE, stderr=PIPE, close_fds=True) - while True: - output = p.stdout.readline() - if output == '' and p.poll() is not None: - break - if output: - click.echo(output) - returncode = p.poll() - if returncode != 0: - error = p.stderr.readline() - raise OSError("Kraken2 launch error:\n{}\n".format(error)) - # parse kraken report - report_file = os.path.join(OUT_DIR, "{}.tab".format(seq_name)) - parse_report(report_file=report_file, cutoff=cutoff) - return returncode
--- a/confil/report.py Mon Mar 04 07:42:05 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -import os - -import click - - -def parse_report(report_file, cutoff): - file_name = os.path.splitext(os.path.basename(report_file))[0] - hit = None - if os.stat(report_file).st_size > 0 and report_file.endswith(".tab"): - click.secho("Processing {} with cutoff of {}...\n".format( - report_file, cutoff), fg='green') - with open(report_file, 'r') as report: - for line in report: - line = [str(e).strip() for e in line.split('\t')] - if len(line) > 1: - click.secho('{}'.format(line), fg='green') - # Percentage of fragments covered by the clade rooted at this taxon - percentage = int(float(line[0])) - # Number of fragments covered by the clade rooted at this taxon - # num_covered = int(float(line[1])) - # Number of fragments assigned directly to this taxon - # num_assigned = int(float(line[2])) - # NCBI taxonomic ID number - # ncbi_tax = int(float(line[3])) - # Indented scientific name (Mycobacterium\n) - name = str(line[5]).strip() - if percentage < cutoff and 'Mycobacterium' in name: - click.secho('\n{}%: {} is contaminated!\n'.format( - percentage, file_name), fg='red') - raise SystemExit('{}%: {} is contaminated!\n'.format( - percentage, file_name)) - if percentage >= cutoff and 'Mycobacterium' in name: - click.secho('\n{}%: {} is not contaminated!\n'.format( - percentage, file_name), fg='green') - hit = line - break - click.secho('Hit: {}'.format(hit), fg='green') - return hit
--- a/setup.py Mon Mar 04 07:42:05 2019 -0500 +++ b/setup.py Tue Mar 05 03:49:38 2019 -0500 @@ -5,7 +5,7 @@ setup( name='confil', - version='0.1.2', + version='0.1.3', url='https://github.com/COMBAT-TB/confil', description='Contamination filter', long_description=long_description,
--- a/test/test_report.py Mon Mar 04 07:42:05 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -import os - -import pytest - -from confil.report import parse_report -from test_runner import TEST_DATA_DIR - -TEST_REPORT = os.path.join(TEST_DATA_DIR, "test_file.tab") - -# test using a cutoff of 50% - - -@pytest.mark.parametrize("test_input, expected", [ - (type(parse_report(TEST_REPORT, 50)), list), - (parse_report(TEST_REPORT, 50)[5], 'Mycobacterium'), - (parse_report(TEST_REPORT, 50)[0], '55.84') -]) -def test_parse_report(test_input, expected): - assert test_input == expected - - -def test_parse_report_exception(): - with pytest.raises(SystemExit): - parse_report(TEST_REPORT, 90)
--- a/test/test_runner.py Mon Mar 04 07:42:05 2019 -0500 +++ b/test/test_runner.py Tue Mar 05 03:49:38 2019 -0500 @@ -3,11 +3,11 @@ import pytest from click.testing import CliRunner -from confil.confil import confil +from confil.confil import confil, parse_report CURR_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DATA_DIR = os.path.join(CURR_DIR, "test_data/") - +TEST_REPORT = os.path.join(TEST_DATA_DIR, "test_file.tab") FILE_1 = os.path.join(TEST_DATA_DIR, "test_file_1.fastq") FILE_2 = os.path.join(TEST_DATA_DIR, "test_file_2.fastq") @@ -18,6 +18,21 @@ return runner +@pytest.mark.skip(reason="No way of currently testing this. It's KRAKEN!") def test_runner(cli_runner): result = cli_runner.invoke(confil, ["--paired", FILE_1, FILE_2]) assert result.exit_code == 0 + + +@pytest.mark.parametrize("test_input, expected", [ + (type(parse_report(TEST_REPORT, 50)), list), + (parse_report(TEST_REPORT, 50)[5], 'Mycobacterium'), + (parse_report(TEST_REPORT, 50)[0], '55.84') +]) +def test_parse_report(test_input, expected): + assert test_input == expected + + +def test_parse_report_exception(): + with pytest.raises(SystemExit): + parse_report(TEST_REPORT, 90)