# HG changeset patch
# User ric
# Date 1475057010 14400
# Node ID ba6cf6ede02726592b01df60847043873e9d7d70
Uploaded
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/.gitignore Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,3 @@
+.idea
+*.pyc
+*~
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/exporter/export_titer_plates.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/exporter/export_titer_plates.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,67 @@
+import logging, csv, argparse, sys, os
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.kb.drivers.omero.utils as vlu
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+CSV_FIELDS = ['label', 'barcode', 'rows', 'columns', 'plate_status']
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='dump all TiterPlate objects to a TSV file')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--ofile', type=str, help='output file',
+ required=True)
+ return parser
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format' : LOG_FORMAT,
+ 'datefmt' : LOG_DATEFMT,
+ 'level' : log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+ logger = logging.getLogger()
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ logger.info('Loading TiterPlate objects')
+ plates = kb.get_objects(kb.TiterPlate)
+ logger.info('Loaded %d objects' % len(plates))
+
+ with open(args.ofile, 'w') as ofile:
+ writer = csv.DictWriter(ofile, CSV_FIELDS, delimiter='\t')
+ writer.writeheader()
+ for i, pl in enumerate(plates, 1):
+ logger.debug('Dumping plate %d/%d' % (i, len(plates)))
+ writer.writerow({'label' : pl.label,
+ 'barcode' : pl.barcode,
+ 'rows' : pl.rows,
+ 'columns' : pl.columns,
+ 'plate_status' : pl.status.enum_label()})
+ logger.info('Job done')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/biosample.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/biosample.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,195 @@
+
+ import BioSample definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $move_to_common_space
+ --move-to-common-space
+ #end if
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ biosample
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($source_type) != 'use_provided'
+ --source-type ${source_type}
+ #end if
+ #if str($vessel_type_selector.vessel_type) != 'use_provided'
+ --vessel-type ${vessel_type_selector.vessel_type}
+ #end if
+ #if str($vessel_content) != 'use_provided'
+ --vessel-content=${vessel_content}
+ #end if
+ #if str($vessel_status) != 'use_provided'
+ --vessel-status=${vessel_status}
+ #end if
+ #if str($vessel_type_selector.vessel_type) == 'IlluminaBeadChipArray'
+ #if str($vessel_type_selector.assay_type) != 'use_provided'
+ --bead-chip-assay-type=${vessel_type_selector.assay_type}
+ #end if
+ #end if
+
+A biosample record will have, at least, the following fields::
+
+ label source
+ I001-bs-2 V932814892
+ I002-bs-2 V932814892
+ I003-bs-2 None
+
+where label is the label of the biosample container. If a 'None' value
+is given in the source column, the biosample will be imported as a
+new, unlinked object into the biobank. Another example, this time
+involving DNA samples::
+
+ label source used_volume current_volume activation_date
+ I001-dna V932814899 0.3 0.2 17/03/2007
+ I002-dna V932814900 0.22 0.2 21/01/2004
+
+A special case is when records refer to biosamples contained in plate
+wells. In this case, an additional column must be present with the VID
+of the corresponding TiterPlate object. For instance::
+
+ plate label source
+ V39030 A01 V932814892
+ V39031 A02 V932814893
+ V39032 A03 V932814894
+
+where the label column is now the label of the well position.
+
+If row and column (optional) are provided, the program will use them;
+if they are not provided, it will infer them from label (e.g., J01 ->
+row=10, column=1). Missing labels will be generated as::
+
+ '%s%03d' % (chr(row+ord('A')-1), column)
+
+A badly formed label will result in the rejection of the record; the
+same will happen if label, row and column are inconsistent. The well
+will be filled with current_volume material, produced by removing
+used_volume material from the biological material contained in the
+vessel identified by source. row and column are 1-based, as shown in
+the sketch below.
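+
+As a clarifying sketch (not the importer's actual code), the mapping
+between well labels and 1-based coordinates described above can be
+written as::
+
+ def label_to_row_column(label):
+     # e.g. 'J01' -> (10, 1)
+     return ord(label[0]) - ord('A') + 1, int(label[1:])
+
+ def row_column_to_label(row, column):
+     # inverse mapping, used to generate missing labels
+     return '%s%03d' % (chr(row + ord('A') - 1), column)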
+
+If the sample is an IlluminaBeadChipArray, the plate column used in the
+PlateWell case becomes an illumina_array column and a new column, named
+bead_chip_assay_type, is required::
+
+ illumina_array label source bead_chip_assay_type
+ V1351235 R01C01 V412441 HUMANEXOME_12V1_B
+ V1351235 R01C02 V351151 HUMANEXOME_12V1_B
+ V1351235 R02C01 V345115 HUMANEXOME_12V1_B
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/birth_data.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/birth_data.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,57 @@
+
+ import birth data within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ birth_data
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study individual timestamp birth_date birth_place
+ ASTUDY V1234 1310057541608 12/03/1978 006171
+ ASTUDY V14112 1310057541608 25/04/1983 006149
+ ASTUDY V1241 1310057541608 12/03/2001 006172
+ .....
+
+where birth_place is a valid ISTAT code for an Italian city or a
+foreign country, and birth_date must be in dd/mm/YYYY format.
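+
+As a minimal sketch (not the importer's actual validation code), a
+birth_date value can be checked with the standard library::
+
+ import datetime
+
+ def parse_birth_date(s):
+     # raises ValueError unless s is in dd/mm/YYYY format
+     return datetime.datetime.strptime(s, '%d/%m/%Y').date()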
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/data_collection.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/data_collection.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,120 @@
+
+ import DataCollection definitions within
+ OMERO.biobank
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ #if $omero_configuration.level == 'advanced'
+ --host=$omero_configuration.vl_host
+ --user=$omero_configuration.vl_user
+ --passwd=$omero_configuration.vl_passwd
+ #else
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ #end if
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ data_collection
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($data_sample_type) != 'use_provided'
+ --data-sample-type=${data_sample_type}
+ #end if
+ #if str($label)
+ --label=${label}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study label data_sample
+ BSTUDY dc-01 V0390290
+ BSTUDY dc-01 V0390291
+ BSTUDY dc-02 V0390292
+ BSTUDY dc-02 V390293
+ ...
+
+This will create new DataCollection(s), whose labels are defined by the
+label column, and link to each of them, via DataCollectionItem objects,
+the DataSample object(s) identified by data_sample (a VID).
+
+Records that point to an unknown DataSample will abort the data
+collection loading. Previously seen collections will be noisily
+ignored. It is not legal to use the importer to add items to a
+previously known collection.
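+
+As an illustrative sketch (a plain pass over the input file, not the
+importer's internal code), the grouping of data_sample VIDs by
+collection label amounts to::
+
+ import csv
+
+ def group_by_label(ifile):
+     groups = {}
+     with open(ifile) as f:
+         for row in csv.DictReader(f, delimiter='\t'):
+             groups.setdefault(row['label'], []).append(row['data_sample'])
+     return groups  # e.g. {'dc-01': ['V0390290', 'V0390291'], ...}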
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/data_object.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/data_object.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,82 @@
+
+ import DataObject definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ data_object
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($mimetype) != 'use_provided'
+ --mimetype=${mimetype}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study path data_sample mimetype size sha1
+
+ TEST01 file:/share/fs/v039303.cel V2902 x-vl/affymetrix-cel 39090 E909090
+ ....
+
+Records that point to an unknown data sample will be noisily
+ignored. The same will happen to records that have the same path as a
+previously seen data_object.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/data_sample.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/data_sample.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,165 @@
+
+ import DataSample definitions within OMERO.biobank
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ #if $omero_configuration.level == 'advanced'
+ --host=$omero_configuration.vl_host
+ --user=$omero_configuration.vl_user
+ --passwd=$omero_configuration.vl_passwd
+ #else
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ #end if
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ data_sample
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($source_type) != 'use_provided'
+ --source-type=${source_type}
+ #end if
+ #if str($device_type) != 'use_provided'
+ --device-type=${device_type}
+ #end if
+ #if str($scanner) != 'use_provided'
+ --scanner=${scanner}
+ #end if
+ #if str($data_sample_type) != 'use_provided'
+ --data-sample-type=${data_sample_type}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study label source device device_type scanner options
+ ASTUDY foo01 v03909 v9309 Chip v99020 celID=0009099090
+ ASTUDY foo02 v03909 v99022 Scanner v99022 conf1=...,conf2=...
+ ....
+
+In this example, the first line corresponds to a dataset obtained by
+using chip v9309 on scanner v99020, while the second data sample has
+been obtained with a technology that uses a scanner directly, e.g., an
+Illumina HiSeq 2000. The '''scanner''' column is there as a
+convenience to support a more detailed description of a chip-based
+acquisition.
+
+The general strategy is to decide what data objects should be
+instantiated by looking at the chip column and at its corresponding
+maker, model and release.
+
+The optional column '''scanner''', the vid of the scanner device, is
+used in cases, such as Affymetrix genotyping, where it is relevant.
+
+It is also possible to import DataSample(s) that are the results of
+processing other DataSample(s). Here is an example::
+
+ study label source device device_type options
+ ASTUDY foo01 v03909 v99021 SoftwareProgram conf1=...,conf2=...
+ ASTUDY foo02 v03909 v99021 SoftwareProgram conf1=...,conf2=...
+ ....
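+
+The options column holds a comma-separated list of key=value pairs; a
+minimal parsing sketch (the importer's own parser may differ) is::
+
+ def parse_options(s):
+     # 'celID=0009099090' -> {'celID': '0009099090'}
+     return dict(kv.split('=', 1) for kv in s.split(',')) if s else {}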
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/device.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/device.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,99 @@
+
+ import Device definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ device
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($device_type) != 'use_provided'
+ --device-type=${device_type}
+ #end if
+ #if str($maker)
+ --maker=${maker}
+ #end if
+ #if str($model)
+ --model=${model}
+ #end if
+ #if str($release)
+ --release=${release}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study device_type label barcode maker model release location
+ BSTUDY Scanner pula01 8989898 Affymetrix GeneChip Scanner 3000 7G Pula bld. 5
+ BSTUDY Chip chip001 8329482 Affymetrix Genome-Wide Human SNP Array 6.0 None
+
+All devices have a type, a label, an optional barcode, a maker, a
+model, a release and an optional physical location. In the example
+above, the first line defines a scanner physically located in the
+building 5 lab in Pula; the second line defines a chip.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/diagnosis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/diagnosis.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,53 @@
+
+ import diagnosis data within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ diagnosis
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ study individual timestamp diagnosis
+ ASTUDY V899 1310057541608 icd10-cm:E10
+ ASTUDY V899 1310057541608 icd10-cm:G35
+ ASTUDY V1806 1310057541608 exclusion-problem_diagnosis
+ ...
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/enrollment.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/enrollment.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,78 @@
+
+ Create new enrollments for existing individuals within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ enrollment
+ #if str($study_label) != 'use_provided'
+ --study=$study_label
+ #end if
+
+Import of new enrollments related to existing individuals.
+An enrollment is characterized by the following fields::
+
+ source study label
+ V044DE795E7F9F42FEB9855288CF577A77 xxx id1
+ V06C59B915C0FD47DABE6AE02C731780AF xxx id2
+ V01654DCFC5BB640C0BB7EE088194E629D xxx id3
+
+where source must be the VID of an existing Individual object, study
+the label of an existing Study object, and label the enrollment code
+for the patient in the study.
+
+The enrollment sub-operation will retrieve the source individual from
+the DB, create a new enrollment related to it and output the VIDs of
+newly created enrollments. It is not possible to create two
+enrollments with the same code related to the same study, nor is it
+possible to enroll a patient twice in the same study, even with
+different codes.
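+
+A client-side sanity check for these two constraints, sketched over the
+input file (assuming the tab-separated columns shown above), could be::
+
+ import csv
+
+ def find_conflicts(ifile):
+     seen_codes, seen_inds, conflicts = set(), set(), []
+     with open(ifile) as f:
+         for row in csv.DictReader(f, delimiter='\t'):
+             code = (row['study'], row['label'])
+             individual = (row['study'], row['source'])
+             if code in seen_codes or individual in seen_inds:
+                 conflicts.append(row)
+             seen_codes.add(code)
+             seen_inds.add(individual)
+     return conflicts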
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/group.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/group.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,53 @@
+
+ Create a new group within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ group
+ #if str($group_label) != ''
+ --group=$group_label
+ #end if
+
+Will create a new group of individuals from a file with the following columns::
+
+ study label individual
+ foo I0000 V06C59B915C0FD47DABE6AE02C731780AF
+ foo I0001 V0B718B77691B145BFA8901FCCF6B37998
+ ...
+
+where the column study is optional (it can be provided via the
+group_label param). Labels should be unique within the file and the
+individual field should contain VIDs of existing (within omero/vl)
+Individual objects.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/illumina_bead_chip_measures.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/illumina_bead_chip_measures.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,90 @@
+
+ import IlluminaBeadChipMeasures definitions within OMERO
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ illumina_bead_chip_measures
+ #if str($study) != 'use_provided'
+ --study=${study}
+ #end if
+ #if str($source_type) != 'use_provided'
+ --source_type=${source_type}
+ #end if
+ #if str($action_category) != 'use_provided'
+ --action_category=${action_category}
+ #end if
+
+Will read a tsv file with the following columns::
+
+ study label red_channel green_channel source source_type
+ ASTUDY CHIP_01_R01C01 V1415151235513 V135135661356161 V351351351551 IlluminaBeadChipArray
+ ASTUDY CHIP_01_R01C02 V2346262462462 V112395151351623 V135113513223 IlluminaBeadChipArray
+ ASTUDY CHIP_01_R02C01 V1351362899135 V913977551235981 V100941215192 IlluminaBeadChipArray
+
+This will create new IlluminaBeadChipMeasures whose labels are defined in the
+label column.
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/importer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/importer.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.importer.main import main
+
+main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/individual.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/individual.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,71 @@
+
+ import individual definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=$input
+ --ofile=$output
+ --report_file=$report
+ --logfile=$logfile
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ individual
+ #if str($study) != 'use_provided'
+ --study $study
+ #end if
+
+Will import a stream of new individual definitions defined by the
+following columns::
+
+ label gender father mother
+ id2 male None None
+ id3 female None None
+ ....
+
+It is not possible to import the same individual twice: the related
+file rows will be noisily ignored.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/laneslot.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/laneslot.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,101 @@
+
+ import LaneSlot definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ laneslot
+ #if str($study) != 'use_provided'
+ --study=${study}
+ #end if
+ #if str($source_type) != 'use_provided'
+ --source_type=${source_type}
+ #end if
+ #if str($content) != 'use_provided'
+ --content=${content}
+ #end if
+
+A lane slot record will have the following fields::
+
+ lane tag content source
+ V123411 ATCACG DNA V4512415
+ V123411 CGATGT DNA V1415512
+ V412511 DNA V1909012
+ V661251 TGACCA DNA V1123111
+ V661251 CTTGTA DNA V1211141
+ ....
+
+The content column is optional if its value is passed as the script's
+input parameter; the tag column is optional too.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/launcher.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/launcher.sh Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,56 @@
+#!/bin/bash
+
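+# Wrapper for the biobank tools: it extracts --interpreter and --runner from
+# the argument list, validates the OMERO connection parameters, builds a
+# per-host PYTHONPATH, sources the matching biobank profile and finally runs
+# the requested script with the remaining arguments.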
+CMD=""
+PYTH_PATH="PYTHONPATH=/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/p\
+ython2.7/site-packages/:/SHARE/USERFS/els7/users/biobank/lib/"
+runner="$(dirname ${BASH_SOURCE[0]})/"
+until [ -z "$1" ]
+ do
+
+ opt_host='--host='
+ opt_user='--user='
+ opt_passwd='--passwd='
+ opt_interpreter='--interpreter='
+ opt_runner='--runner='
+ if [[ $1 == $opt_host* ]]; then
+ host=`echo $1 | cut -d '=' -f2 | cut -d '.' -f1`
+ if [ -z "$host" ] || [ "$host" = 'None' ]; then
+ echo 'ERROR. Missing omero host parameter. Please, set Omero Host in your user preferences' >&2
+ exit 1
+ fi
+ PYTH_PATH+=$host
+ HOST=`echo $1 | cut -d '=' -f2`
+ CMD+=' '$1
+ elif [[ $1 == $opt_user* ]]; then
+ user=`echo $1 | cut -d '=' -f2`
+ if [ -z "$user" ] || [ "$user" = 'None' ]; then
+ echo 'ERROR. Missing omero user parameter. Please, set Omero User in your user preferences' >&2
+ exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_passwd* ]]; then
+ passwd=`echo $1 | cut -d '=' -f2`
+ if [ -z "$passwd" ] || [ "$passwd" = 'None' ]; then
+ echo 'ERROR. Missing omero password parameter. Please, set Omero Password in your user preferences' >&2
+ exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_runner* ]]; then
+ runner+=`echo $1 | cut -d '=' -f2`
+ elif [[ $1 == $opt_interpreter* ]]; then
+ interpreter=`echo $1 | cut -d '=' -f2`
+ else
+ CMD+=' '$1
+ fi
+ shift
+done
+export "${PYTH_PATH}/:${PYTHONPATH}"
+profile="/SHARE/USERFS/els7/users/biobank/lib/${HOST}.biobank.profile"
+if [ -f "$profile" ]; then
+ source $profile
+ CMD=$interpreter' '$runner$CMD
+ $CMD
+else
+ echo "ERROR. Biobank profile file doesn't exist. Please, check Omero Host in your user preferences" > /dev/null >&2
+ exit -1
+fi
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/marker_alignment.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/marker_alignment.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,113 @@
+
+ import marker alignments within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ marker_alignment
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($ref_genome)
+ --ref-genome ${ref_genome}
+ #end if
+ #if str($message)
+ --message ${message}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ marker_vid ref_genome chromosome pos strand allele copies
+ V0909090 hg18 10 82938938 True A 1
+ V0909091 hg18 1 82938999 True A 2
+ V0909092 hg18 1 82938938 True B 2
+ ...
+
+Since pos is relative to 5', if the marker has been aligned on the
+other strand, it is the responsibility of the aligner app to report
+the actual distance from 5', while, at the same time, registering that
+the SNP has actually been aligned on the other strand.
+
+The chromosome field is an integer field with values in the [1, 26]
+range, with 23-26 representing, respectively, the X chromosome, the Y
+chromosome, the pseudoautosomal regions (XY) and the mitochondrial DNA
+(MT).
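+
+A decoding sketch for the chromosome field::
+
+ def decode_chromosome(n):
+     # 1-22 are the autosomes; 23-26 encode X, Y, XY and MT
+     special = {23: 'X', 24: 'Y', 25: 'XY', 26: 'MT'}
+     return special.get(n, str(n))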
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/marker_definition.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/marker_definition.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,92 @@
+
+ import Marker definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ marker_definition
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ --source ${source}
+ --context ${context}
+ --release ${release}
+ --ref-genome ${ref_genome}
+ --dbsnp-build ${dbsnp_build}
+
+Will read in a tsv file with the following columns::
+
+ label rs_label mask strand allele_a allele_b
+ SNP_A-1780419 rs6576700 [A/G] TOP A G
+ ...
+
+where label is the unique label for this marker in the (source,
+context, release) context, and rs_label is the dbSNP label for this
+SNP (it can be the string ``None`` if not defined or not known). The
+mask column contains the SNP definition. The strand column can either
+be the actual 'Illumina style' strand used to define the alleles in
+the allele columns, or the string 'None', which means that the alleles
+in the allele columns are defined with respect to the mask in the mask
+column.
+
+It will, for each row, convert the mask to the TOP strand following
+Illumina conventions and then save a record for it in VL. The saved
+tuple is (source, context, release, label, rs_label, TOP_mask). There
+are no collision controls.
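+
+As a sketch (a hypothetical helper, not part of the importer), a mask
+in the usual flank[allele_a/allele_b]flank notation can be split as::
+
+ import re
+
+ def parse_mask(mask):
+     # '...[A/G]...' -> (left flank, 'A', 'G', right flank)
+     m = re.match(r'^(.*)\[(.+)/(.+)\](.*)$', mask)
+     if m is None:
+         raise ValueError('badly formed mask: %r' % mask)
+     return m.groups()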
+
+It will output a tsv file with the following columns::
+
+ study label type vid
+ ASTUDY SNP_A-xxx Marker V000002222
+ ...
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/markers_set.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/markers_set.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,74 @@
+
+ import markers set definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ markers_set
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($label)
+ --label ${label}
+ #end if
+ #if str($maker) != 'use_provided'
+ --maker ${maker}
+ #end if
+ #if str($model) != 'use_provided'
+ --model ${model}
+ #end if
+ #if str($release)
+ --release ${release}
+ #end if
+
+Will read in a tsv file with the following columns::
+
+ marker_vid marker_indx allele_flip
+ V902909090 0 False
+ V902909091 1 False
+ V902909092 2 True
+ ...
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/samples_container.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/samples_container.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,221 @@
+
+ import samples container definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ samples_container
+ #if str($study) != 'use_provided'
+ --study=${study}
+ #end if
+ #if str($container_type_selector.container_type) != 'use_provided'
+ --container-type=${container_type_selector.container_type}
+ #if str($container_type_selector.container_type) == 'TiterPlate'
+ #if str($container_type_selector.plate_shape) != 'use_provided'
+ --plate-shape=${container_type_selector.plate_shape}
+ #end if
+ #elif str($container_type_selector.container_type) == 'FlowCell'
+ #if str($container_type_selector.flow_cell_slots) != 'use_provided'
+ --number-of-slots=${container_type_selector.flow_cell_slots}
+ #end if
+ #elif str($container_type_selector.container_type) == 'IlluminaArrayOfArrays'
+ #if str($container_type_selector.ill_shape) != 'use_provided'
+ --plate-shape=${container_type_selector.ill_shape}
+ #end if
+ #if str($container_type_selector.ill_slots) != 'use_provided'
+ --number-of-slots=${container_type_selector.ill_slots}
+ #end if
+ #if str($container_type_selector.array_type) != 'use_provided'
+ --illumina-array-type=${container_type_selector.array_type}
+ #end if
+ #if str($container_type_selector.array_class) != 'use_provided'
+ --illumina-array-class=${container_type_selector.array_class}
+ #end if
+ #if str($container_type_selector.assay_type) != 'use_provided'
+ --illumina-assay-type=${container_type_selector.assay_type}
+ #end if
+ #end if
+ #end if
+ #if str($container_status) != 'use_provided'
+ --container-status=${container_status}
+ #end if
+
+A container record will have the following fields::
+
+ label container_status creation_date
+ A_CONTAINER USABLE 13/02/2012
+ B_CONTAINER INSTOCK 12/01/2001
+ C_CONTAINER USABLE 25/04/2012
+ ....
+
+The creation_date column is optional; if it is not specified, the
+current date will be used as the object's creation date. The
+container_status column is also optional if its value is passed as an
+input parameter.
+
+
+When importing new containers, special fields can be included in the
+CSV file depending on the type of the objects that you want to
+import.
+
+For TITER PLATE objects the syntax can be the following::
+
+ label barcode container_status rows columns
+ A_TITERPLATE XXYYZZ111 INSTOCK 8 12
+ B_TITERPLATE XXYYZZ112 INSTOCK 8 12
+ C_TITERPLATE XXYYZZ113 READY 8 12
+ ....
+
+The rows and columns values are optional if they are passed as input
+parameters; the barcode column is optional.
+
+For ILLUMINA ARRAY OF ARRAYS objects the syntax can be the following::
+
+ label barcode container_status rows columns illumina_array_type illumina_array_class illumina_assay_type
+ A_ILLARRAY XXYYZZ111 INSTOCK 4 2 BeadChip_12x1Q Slide Infinium_HD
+ B_ILLARRAY XXYYZZ112 INSTOCK 4 2 BeadChip_12x1Q Slide Infinium_HD
+ C_ILLARRAY XXYYZZ113 INSTOCK 4 2 BeadChip_12x1Q Slide Infinium_HD
+
+rows, columns, illumina_array_type, illumina_array_class and
+illumina_assay_type are optional if these values are passed as input
+parameters; the barcode column is optional.
+
+For FLOW CELL objects the syntax can be the following::
+
+ label barcode container_status number_of_slots
+ A_FLOWCELL XXYYZZ221 INSTOCK 8
+ B_FLOWCELL XXYYZZ222 INSTOCK 8
+ C_FLOWCELL XXYYZZ223 INSTOCK 8
+ ....
+
+The number_of_slots column is optional if its value is passed as an
+input parameter; the barcode column is optional.
+
+For LANE objects the syntax can be the following::
+
+ flow_cell slot container_status
+ V112441441 1 INSTOCK
+ V112441441 2 INSTOCK
+ V112441441 3 INSTOCK
+ V351145519 1 INSTOCK
+ V351145519 2 INSTOCK
+ ....
+
+For Lane objects, no label column has to be provided; the importer
+will automatically compute the label for each imported object.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/sequencing_data_sample.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/sequencing_data_sample.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,184 @@
+
+
+ Import sequencing-related DataSample definitions within OMERO.biobank
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ #if $omero_configuration.level == 'advanced'
+ --host=$omero_configuration.vl_host
+ --user=$omero_configuration.vl_user
+ --passwd=$omero_configuration.vl_passwd
+ #else
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ #end if
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ seq_data_sample
+ #if str($study) != 'use_provided'
+ --study=${study}
+ #end if
+ #if str($source_type) != 'use_provided'
+ --source-type=${source_type}
+ #end if
+ #if str($seq_dsample_type) != 'use_provided'
+ --seq-dsample-type=${seq_dsample_type}
+ #end if
+ #if str($dsample_status) != 'use_provided'
+ --status=${dsample_status}
+ #end if
+ #if str($device) != 'use_provided'
+ --device=${device}
+ #end if
+ #if str($history) != 'None'
+ --history=${history}
+ #end if
+
+Will read a tsv file with the following columns::
+
+ study label source source_type seq_dsample_type status device
+ FOOBAR seq_out_1 V012141 FlowCell SequencerOutput USABLE V123141
+ FOOBAR seq_out_2 V012141 FlowCell SequencerOutput USABLE V123141
+ FOOBAR seq_out_3 V1AD124 FlowCell SequencerOutput USABLE V123141
+ ...
+
+where:
+
+ * seq_dsample_type can assume one of the following values: SequencerOutput, RawSeqDataSample, SeqDataSample
+ * source_type can assume one of the following values: FlowCell, SequencerOutput, RawSeqDataSample
+
+study, source_type, seq_dsample_type, status and device columns can be
+overwritten by using command line options.
+
+A special case of the previous file is when seq_dsample_type is
+SeqDataSample; in this case, a mandatory sample column is required,
+which has to contain the IDs of Tube objects. The file will look like
+this::
+
+ study label source source_type seq_dsample_type status device sample
+ FOOBAR seq_dsample_1 V041241 SequencerOutput SeqDataSample USABLE VBB2351 V124AA41
+ FOOBAR seq_dsample_2 V051561 SequencerOutput SeqDataSample USABLE VBB2351 V4151AAE
+ FOOBAR seq_dsample_3 V151561 SequencerOutput SeqDataSample USABLE VBB2351 V15199CD
+ ...
+
+A file containing an export of the Galaxy history that produced the
+data being imported can be passed as an input parameter; the history
+details must be represented as a string serialized in JSON format.
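+
+For example (field names here are purely illustrative, the actual keys
+depend on the Galaxy history export)::
+
+ import json
+
+ history = {'name': 'seq run 42', 'tools': ['bwa', 'samtools']}
+ serialized = json.dumps(history)  # the string passed via --history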
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/study.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/study.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,59 @@
+
+ import study definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ study
+
+Will import a stream of new study definitions defined by the following
+tab-separated columns. A typical file will look like the following::
+
+ label description
+ BSTUDY A basically empty description of BSTUDY
+ CSTUDY A basically empty description of CSTUDY
+ ....
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/unauthorized_access.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/unauthorized_access.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,6 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+
+sys.exit("You are not authorized to use this tool")
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/importer/vessels_collection.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/importer/vessels_collection.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,94 @@
+
+ import VesselsCollection definitions within omero/vl
+
+ launcher.sh
+ --interpreter=python
+ --runner=importer.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ifile=${input}
+ --ofile=${output}
+ --report_file=${report}
+ --logfile=${logfile}
+ #if $blocking_validation
+ --blocking-validator
+ #end if
+ vessels_collection
+ #if str($study) != 'use_provided'
+ --study ${study}
+ #end if
+ #if str($vessel_type) != 'use_provided'
+ --vessel_type=${vessel_type}
+ #end if
+ #if str($label)
+ --label=${label}
+ #end if
+
+To correctly import collections of vessels, the input file must have
+the following format::
+
+ label vessel vessel_type
+ COLLECTION-A V1234545 Tube
+ COLLECTION-A V1212434 Tube
+ COLLECTION-A V3434176 Tube
+ COLLECTION-B V2321001 Tube
+ COLLECTION-B V1210402 Tube
+ ....
+
+Column 'label' contains the names of the collections to be imported,
+while 'vessel' contains the VIDs of the tubes or plates that are part
+of the collections.
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/library/import_to_library.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/library/import_to_library.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
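+# Copy the requested OMERO.biobank data objects into the Galaxy user import
+# directory and link them into a Galaxy data library through the bioblend
+# API; paths and Galaxy endpoints are read from the YAML file given with
+# --ini_file.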
+import sys, os, argparse, logging, yaml, datetime, subprocess, stat
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='')
+
+ parser.add_argument('--ini_file', type=str, default="{0}/init_file.yaml".format(os.path.dirname(os.path.realpath(sys.argv[0]))),help='Configuration file (yaml)')
+
+ parser.add_argument('--host', type=str, required=True, help='omero host')
+ parser.add_argument('--user', type=str, required=True, help='omero user')
+ parser.add_argument('--passwd', type=str, required=True, help='omero passwd')
+
+ parser.add_argument('--galaxy_host', type=str, help='Galaxy Host (with port)')
+ parser.add_argument('--galaxy_api_key', type=str, help='Galaxy API key')
+
+ parser.add_argument('--operator', type=str, help='Galaxy user email')
+
+ parser.add_argument('--library', type=str, required=False, help='library name')
+ parser.add_argument('--folder', type=str, required=False, help='library folder')
+
+ parser.add_argument('--data_objects', type=str, required=True, help='comma-separated data object ids')
+
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS, help='logging level (default: INFO)', default='INFO')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+
+ return parser
+
+def main(argv):
+ global logger
+ global ini_file
+ global kb
+ global apiGalaxy
+
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ # Initializing logger
+ logger = init_logger(args,logging)
+
+ # Reading YAML configuration file
+ ini_file = init_config(args)
+
+ # Initializing python libraries
+ init_pythonpath(args,ini_file)
+
+ # Initializing connection to omero biobank
+ kb = init_omero_biobank(args)
+
+ # Initializing connection to apiGalaxy
+ apiGalaxy = init_api_galaxy(args)
+
+ # Getting library and folder id
+ library_id,folder_id = get_library_and_folder_ids(args)
+
+ # Getting data_objects
+ data_objects = get_data_objects(args)
+
+ import_data_objects(args,data_objects,library_id,folder_id)
+
+def import_data_objects(args,data_objects,library_id,folder_id):
+ user_import_dir = get_user_import_dir(args)
+
+ logger.info("copying datasets in user import dir")
+ files = copy_in_user_import_dir(data_objects,user_import_dir)
+
+ logger.info("wait while copiyng")
+ polling(files)
+
+ logger.info("ready to import in library {0} under folder {1}".format(args.library,args.folder))
+
+ logger.info('importing in library')
+ successful = 0
+ for file_type, folder in user_import_dir.iteritems():
+ if len(os.listdir(folder)) == 0: continue
+ if 'fastq' in file_type: file_type = 'fastqsanger'
+ status = apiGalaxy.libraries.upload_file_from_server(library_id, folder, folder_id, file_type=file_type, link_data_only='link_to_files')
+ successful += len(status)
+ if successful == len(files):
+ logger.info("SUCCESS")
+ else:
+ logger.critical("ERROR WHILE IMPORTING")
+
+ raise SystemExit
+
+def copy_in_user_import_dir(data_objects,user_import_dir):
+ files = list()
+
+ for dobj in data_objects:
+ if dobj.path.startswith('irods://'):
+ irods_path = dobj.path.replace('irods://','')
+ phys_path = irods.get_object_info(irods_path)['phys_path'].strip()
+
+ elif dobj.path.startswith('file://'):
+ irods_path = None
+ phys_path = dobj.path.replace('file://','')
+ else:
+ #continue
+ irods_path = dobj.path.replace('irods://','')
+ phys_path = irods.get_object_info(irods_path)['phys_path'].strip()
+
+ data_type = dobj.mimetype.split('/')[-1].replace('+64','')
+ dest_path = get_destination_path(irods_path,phys_path,dobj.sample,user_import_dir,data_type).strip()
+ #rsync_command = "qsub -b y /usr/bin/rsync -rcLPhv {0} {1}".format(phys_path,dest_path)
+ rsync_command = "rsync -rcLPhv {0} {1}".format(phys_path,dest_path)
+ logger.info('launching copy for {0} dataset'.format(os.path.basename(dest_path)))
+ subprocess.Popen(rsync_command.split(' '))
+
+ files.append(dest_path)
+
+ return files
+
+def polling(files):
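+ # Block until every destination file has appeared: gzipped files are
+ # gunzipped as soon as they show up and are considered done once the
+ # uncompressed path exists; plain files are done as soon as they exist.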
+ all_done = False
+ founds = list()
+ while not all_done:
+ done = True
+ for dest_path in files:
+ if dest_path.endswith('.gz'):
+ unzip_path = dest_path.replace('.gz','')
+ if not os.path.exists(dest_path) and not os.path.exists(unzip_path):
+ done = False
+ elif os.path.exists(dest_path):
+
+ done = False
+ logger.info("found {0}".format(os.path.basename(dest_path)))
+ logger.info("gunzipping {0}".format(os.path.basename(dest_path)))
+ cmd = "gunzip {0}".format(dest_path)
+ g_unzip = subprocess.check_output(cmd, stderr=subprocess.STDOUT,shell=True).strip()
+ logger.info(g_unzip)
+ else:
+ if not os.path.exists(dest_path):
+ done = False
+ elif os.path.exists(dest_path) and dest_path not in founds:
+ founds.append(dest_path)
+ logger.info("found {0}".format(os.path.basename(dest_path)))
+ all_done = done
+ return True
+
+def get_user_import_dir(args):
+ user_import_dir = dict()
+ subfolder = str(datetime.datetime.now()).split('.')[0].replace(' ','_').replace(':','-')
+ user_import_dir={'fastq' : "{0}/{1}/{2}_{3}".format(ini_file['LIBRARY_IMPORT_DIR_{0}'.format(args.host.split('.')[0].upper())],args.operator,subfolder,'fastq'),
+ 'vcf' : "{0}/{1}/{2}_{3}".format(ini_file['LIBRARY_IMPORT_DIR_{0}'.format(args.host.split('.')[0].upper())],args.operator,subfolder,'vcf'),
+ 'bam' : "{0}/{1}/{2}_{3}".format(ini_file['LIBRARY_IMPORT_DIR_{0}'.format(args.host.split('.')[0].upper())],args.operator,subfolder,'bam')
+ }
+ os.umask(0)
+ for k, folder in user_import_dir.iteritems():
+ if not os.path.exists(folder):
+ os.makedirs(folder,0775)
+ return user_import_dir
+
+def get_destination_path(irods_path,phys_path,data_sample,user_import_dir,data_type):
+
+ if isinstance(data_sample, kb.SeqDataSample) or isinstance(data_sample, kb.AlignedSeqDataSample):
+ if data_sample.sample.label == 'TRAINING_tube_1' : label = 'FATHER'
+ elif data_sample.sample.label == 'TRAINING_tube_2' : label = 'PROBAND'
+ elif data_sample.sample.label == 'TRAINING_tube_3' : label = 'MOTHER'
+ else: raise ValueError('unexpected sample label: %s' % data_sample.sample.label)
+ #label = data_sample.sample.label
+ if isinstance(data_sample, kb.GenomeVariationsDataSample):
+ label = data_sample.label
+
+ filename = "{0}/{1}".format(user_import_dir[data_type],label)
+
+ if irods_path:
+ attr = get_attributes(irods_path)
+
+ if attr.has_key('read'): filename = "{0}_R{1}".format(filename,attr['read'])
+ #if attr.has_key('lanes'): filename = "{0}_L{1}".format(filename,attr['lanes'])
+ if attr.has_key('compression') and attr['compression'] == 'gzip':
+ filename = "{0}.gz".format(filename)
+ else:
+ filename = "{0}_{1}".format(filename,os.path.basename(phys_path))
+ filename = filename.replace('.fq','')
+ return filename
+
+def get_data_objects(args):
+ logger.info("getting data objects")
+ data_objects = list()
+ data_object_ids = args.data_objects.split(',')
+ for dataobj in kb.get_objects(kb.DataObject):
+ if str(dataobj.omero_id) in data_object_ids:
+ data_objects.append(dataobj)
+ logging.info("found {0}".format(len(data_objects)))
+ return data_objects
+
+def get_library_and_folder_ids(args):
+ if args.library is None:
+ logger.critical("Library is a mandatory parameter")
+ sys.exit()
+ library_name = args.library.split('?')[0].replace('.',' ')
+ logger.info("searching for library")
+ orione_library = apiGalaxy.libraries.get_libraries(name="{0}".format(library_name))
+ if len(orione_library) == 0:
+ logger.critical("sorry, library {0} doesn't exist".format(library_name))
+ sys.exit()
+ library_id = orione_library[0].get('id',None)
+
+ if '?' in args.library and args.library == args.folder:
+ folder_name = args.library.split('?')[1].replace('.',' ')
+ else:
+ return library_id,None
+ logger.info("searching for folder {0}".format(folder_name))
+
+ folder = apiGalaxy.libraries.get_folders(library_id=library_id,name=u"/{0}".format(folder_name))
+ if len(folder) == 0:
+ logger.info("not found. creating it..")
+ try:
+ folder = apiGalaxy.libraries.create_folder(library_id,folder_name)
+ except Exception:
+ logger.critical("impossible to create folder {0}".format(folder_name))
+ sys.exit()
+
+
+ folder_id = folder[0].get('id',None)
+
+ return library_id,folder_id
+
+def get_attributes(irods_path):
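+ # Parse the output of `imeta ls -ld <path>`, which lists metadata as
+ # blocks separated by '----' containing 'attribute: <key>' and
+ # 'value: <value>' lines, into a flat {key: value} dictionary.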
+ cmd = ['imeta', 'ls', '-ld', irods_path]
+ imeta = [i.splitlines() for i in irods.__irods_check_output(cmd).split('----')]
+ attributes = {}
+ for i in imeta:
+ del i[0]
+ for a in i:
+ if 'attribute' in a:
+ key = a.split(':')[1].strip()
+ if 'value' in a:
+ value = a.split(':')[1].strip()
+ attributes[key] = value
+ return attributes
+
+def init_logger(args,logging):
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {
+ 'format' : LOG_FORMAT,
+ 'datefmt' : LOG_DATEFMT,
+ 'level' : log_level}
+
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+
+ logger = logging.getLogger( __name__ )
+ return logger
+
+def init_config(args):
+ # Load YAML configuration file
+ logger.info('loading YAML configuration file: %s' % args.ini_file)
+ try:
+ ini_file = yaml.load(open(args.ini_file))
+ except Exception:
+ logger.critical('%s is not a valid YAML configuration file' %args.ini_file)
+ sys.exit()
+
+ return ini_file
+
+def init_pythonpath(args,ini_file):
+ logger.info('exporting pythonpath')
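+ # reverse + append + reverse == prepend: these paths must take
+ # precedence over anything already on sys.path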
+ sys.path.reverse()
+ sys.path.append('/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/python2.7/site-packages/')
+ sys.path.append('/u/galaxy/.local/lib/python2.7/site-packages/poster-0.8.1-py2.7.egg')
+ sys.path.append('/SHARE/USERFS/els7/users/sequencing/usr-cluster/lib/python2.7/site-packages/automator-0.1-py2.7.egg')
+ sys.path.append("{0}/{1}".format(ini_file['PYTHPATH'],args.host.split('.')[0]))
+ sys.path.reverse()
+
+ global KB
+ from bl.vl.kb import KnowledgeBase as KB
+
+ global irods
+ import automator.agent.irods as irods
+
+ #global bioblend
+ #import bioblend
+
+ global GalaxyInstance
+ from bioblend.galaxy import GalaxyInstance
+
+def init_omero_biobank(args):
+ logger.info('opening kb connection to {0}'.format(args.host))
+
+ try:
+ kb = KB(driver='omero')(args.host, args.user, args.passwd)
+ return kb
+ except Exception:
+ logger.critical('connection refused or failed')
+ sys.exit()
+
+def init_api_galaxy(args):
+ try:
+ galaxy_host = args.galaxy_host or ini_file['GALAXY_HOST_{0}'.format(args.host.split('.')[0].upper())]
+ api_key = args.galaxy_api_key
+ except KeyError, ke:
+ msg = 'No argument passed and no global variable %s found' % ke
+ logger.critical(msg)
+ sys.exit(msg)
+
+
+ logger.info('opening connection to %s with key %s' %(galaxy_host,api_key) )
+ apiGalaxy = GalaxyInstance(galaxy_host, key=api_key)
+ return apiGalaxy
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/library/import_to_library.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/library/import_to_library.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,56 @@
+
+ import DataSet within galaxy library/vl
+
+ irods
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=import_to_library.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --galaxy_api_key=$__user_api_key__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --data_objects=${data_objects}
+ --library=${library_folder}
+ --folder=${library_folder}
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/library/init_file.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/library/init_file.yaml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+GALAXY_HOST_BIOBANK04: 'http://test.galaxy.crs4.it/develop/'
+GALAXY_HOST_BIOBANK18: 'http://orione.crs4.it'
+PYTHPATH: '/SHARE/USERFS/els7/users/biobank/lib'
+LIBRARY_IMPORT_DIR_BIOBANK04: '/SHARE/USERFS/els7/users/galaxy/develop/user_library_import_dir'
+LIBRARY_IMPORT_DIR_BIOBANK18: ''
+LOG_FOLDER: '/SHARE/USERFS/els7/users/galaxy/tmp/logs'
+TMP_DIR: '/SHARE/USERFS/els7/users/galaxy/tmp'
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/library/launcher.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/library/launcher.sh Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,56 @@
+#!/bin/bash
+
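+# Wrapper for the biobank tools: it extracts --interpreter and --runner from
+# the argument list, validates the OMERO connection parameters, builds a
+# per-host PYTHONPATH, sources the matching biobank profile and finally runs
+# the requested script with the remaining arguments.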
+CMD=""
+PYTH_PATH="PYTHONPATH=/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/p\
+ython2.7/site-packages/:/SHARE/USERFS/els7/users/biobank/lib/"
+runner="$(dirname ${BASH_SOURCE[0]})/"
+until [ -z "$1" ]
+ do
+
+ opt_host='--host='
+ opt_user='--user='
+ opt_passwd='--passwd='
+ opt_interpreter='--interpreter='
+ opt_runner='--runner='
+ if [[ $1 == $opt_host* ]]; then
+ host=`echo $1 | cut -d '=' -f2 | cut -d '.' -f1`
+ if [ -z "$host" ] || [ "$host" = 'None' ]; then
+ echo 'ERROR. Missing omero host parameter. Please, set Omero Host in your user preferences' >&2
+ exit 1
+ fi
+ PYTH_PATH+=$host
+ HOST=`echo $1 | cut -d '=' -f2`
+ CMD+=' '$1
+ elif [[ $1 == $opt_user* ]]; then
+ user=`echo $1 | cut -d '=' -f2`
+ if [ -z "$user" ] || [ "$user" = 'None' ]; then
+ echo 'ERROR. Missing omero user parameter. Please, set Omero User in your user preferences' >&2
+ exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_passwd* ]]; then
+ passwd=`echo $1 | cut -d '=' -f2`
+ if [ -z "$passwd" ] || [ "$passwd" = 'None' ]; then
+ echo 'ERROR. Missing omero password parameter. Please, set Omero Password in your user preferences' >&2
+ exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_runner* ]]; then
+ runner+=`echo $1 | cut -d '=' -f2`
+ elif [[ $1 == $opt_interpreter* ]]; then
+ interpreter=`echo $1 | cut -d '=' -f2`
+ else
+ CMD+=' '$1
+ fi
+ shift
+done
+export "${PYTH_PATH}/:${PYTHONPATH}"
+profile="/SHARE/USERFS/els7/users/biobank/lib/${HOST}.biobank.profile"
+if [ -f "$profile" ]; then
+ source $profile
+ CMD=$interpreter' '$runner$CMD
+ $CMD
+else
+ echo "ERROR. Biobank profile file doesn't exist. Please, check Omero Host in your user preferences" > /dev/null >&2
+ exit -1
+fi
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/all_enrollments.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/all_enrollments.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,89 @@
+import csv, os, sys, argparse
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='Retrieve all enrollments')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices = LOG_LEVELS,
+ help='logger level', default='INFO')
+ parser.add_argument('--host', type=str, help='omero hostname')
+ parser.add_argument('--user', type=str, help='omero user')
+ parser.add_argument('--passwd', type=str, help='omero password')
+ parser.add_argument('--ofile', type=str, help='output file path',
+ required=True)
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ # This is a temporary hack!!!
+ to_be_ignored = ['IMMUNOCHIP_DISCARDED', 'CASI_MS_CSM_TMP',
+ 'CASI_MS_CSM_CODES']
+
+ logger = get_logger('all_enrollments', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ try:
+ out_file_path = args.ofile
+ except IndexError:
+ logger.error('Mandatory field missing.')
+ parser.print_help()
+ sys.exit(2)
+
+ # Create the KnowledgeBase object
+ kb = KB(driver='omero')(host, user, passwd)
+
+ # Retrieve all studies from omero
+ studies = kb.get_objects(kb.Study)
+ studies = [s for s in studies if s.label not in to_be_ignored]
+ logger.info('Retrieved %d studies from database' % len(studies))
+
+ csv_header = ['individual_uuid']
+ enrolls_map = {}
+
+ # For each study, retrieve all enrollments
+ for s in studies:
+ logger.info('Retrieving enrollments for study %s' % s.label)
+ enrolls = kb.get_enrolled(s)
+ logger.info('%s enrollments retrieved' % len(enrolls))
+ if len(enrolls) > 0:
+ logger.debug('Building lookup dictionary....')
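+ # enrolls_map maps each individual's omero_id to a flat CSV row:
+ # 'individual_uuid', one '<study> #<n>' column per enrollment code,
+ # plus a transient 'studies' dict that is dropped before writing.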
+ for e in enrolls:
+ enrolls_map.setdefault(e.individual.omero_id, {})['individual_uuid'] = e.individual.id
+ enrolls_map[e.individual.omero_id].setdefault('studies', {})
+ enrolls_map[e.individual.omero_id]['studies'].setdefault(s.label,[])
+ enrolls_map[e.individual.omero_id]['studies'][s.label].append(e.studyCode)
+ label = "{0} #{1}".format(s.label,len(enrolls_map[e.individual.omero_id]['studies'][s.label]))
+ enrolls_map[e.individual.omero_id][label] = e.studyCode
+ if label not in csv_header:
+ csv_header.append(label) # Add study label to CSV header
+ else:
+ logger.debug('No enrollments found, skip study %s' % s.label)
+
+ # Write to CSV file
+ logger.debug('Writing CSV file %s' % out_file_path)
+ with open(out_file_path, 'w') as f:
+ writer = csv.DictWriter(f, csv_header,
+ delimiter='\t', quotechar='"',
+ restval = 'None')
+ writer.writeheader()
+ for k, v in enrolls_map.iteritems():
+ v.pop("studies",{})
+ writer.writerow(v)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/all_enrollments.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/all_enrollments.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,29 @@
+
+
+ Retrieve all enrollment codes from the Omero server
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=all_enrollments.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --ofile=${output1}
+ --logfile=${logfile}
+
+ It will output a tsv file with the following columns:
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/build_miniped.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/build_miniped.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,216 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+"""
+A rough example of basic pedigree info generation.
+"""
+
+import argparse
+import collections
+import csv
+import os
+import sys
+import yaml
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.kb.drivers.omero.ehr import EHR
+import bl.vl.individual.pedigree as ped
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+PLINK_MISSING = -9
+PLINK_UNAFFECTED = 1
+PLINK_AFFECTED = 2
+
+FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"]
+
+
+def load_config(config_file):
+    with open(config_file) as cfg:
+        # safe_load is enough here: the config contains only plain mappings
+        conf = yaml.safe_load(cfg)
+    return conf
+
+
+class Diagnosis(object):
+ def __init__(self, logger, yaml_file):
+ self.logger = logger
+ if os.path.isfile(yaml_file):
+ self.conf = load_config(yaml_file)
+ else:
+            self.logger.critical('The config file {} does not exist'.format(
+                yaml_file))
+            sys.exit(1)
+
+ def get_openehr_data(self):
+ return self.conf['openEHR']['archetype'], self.conf['openEHR']['field']
+
+ def get_diagnosis_label(self):
+        return self.get_diagnosis().keys()
+
+ def get_diagnosis(self):
+ results = collections.OrderedDict()
+ diagnosis = collections.OrderedDict(sorted(
+ self.conf['diagnosis'].items()))
+ for v in diagnosis.itervalues():
+ results[v['label']] = v['values']
+ return results
+
+    def get_unaffected_diagnosis_dictionary(self):
+        return dict((k, PLINK_UNAFFECTED)
+                    for k in self.get_diagnosis_label())
+
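+# Illustrative sketch (not executed here): with the build_miniped.yaml
+# shipped alongside this tool, a Diagnosis instance behaves like
+#
+#   dobj = Diagnosis(logger, 'build_miniped.yaml')
+#   dobj.get_openehr_data()
+#   # -> ('openEHR-EHR-EVALUATION.problem-diagnosis.v1', 'at0002.1')
+#   dobj.get_diagnosis()
+#   # -> OrderedDict([('t1d', ['icd10-cm:E10']), ('ms', ['icd10-cm:G35']),
+#   #                 ('nefro', ['icd10-cm:E23.2', 'icd10:N00-N08'])])
+#   dobj.get_unaffected_diagnosis_dictionary()
+#   # -> {'t1d': 1, 'ms': 1, 'nefro': 1}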
+
+def make_parser():
+ parser = argparse.ArgumentParser(
+ description='build the first columns of a ped file from VL')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--configfile', type=str, default=os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml'),
+ help='config file (yaml) with diagnosis dictionary')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('-S', '--study', type=str, required=True,
+ help="a list of comma separated studies used to "
+ "retrieve individuals that will be written to "
+ "ped file")
+ parser.add_argument('--ofile', type=str, help='output file path',
+ required=True)
+ parser.add_argument('--write_header', action='store_true', default=False,
+ help='Write header into the output file')
+ return parser
+
+
+def build_families(individuals, logger):
+ # Individuals with only one parent will be considered like founders
+ # for i in individuals:
+ # if ((i.mother is None) or (i.father is None)):
+ # i.mother = None
+ # i.father = None
+ logger.info("individuals: %d" % len(individuals))
+ # logger.info("individuals: with 0 or 2 parents: %d" % len(not_one_parent))
+ logger.info("analyzing pedigree")
+ founders, non_founders, dangling, couples, children = ped.analyze(
+ individuals
+ )
+ logger.info("splitting into families")
+ return ped.split_disjoint(individuals, children)
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('build_miniped', level=args.loglevel,
+ filename=args.logfile)
+
+ dobj = Diagnosis(logger, args.configfile)
+ logger.debug("l {}".format(dobj.get_diagnosis_label()))
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ logger.debug('Loading all individuals from omero')
+ all_inds = kb.get_objects(kb.Individual) # store all inds to cache
+ logger.debug('%d individuals loaded' % len(all_inds))
+ studies = [kb.get_study(s) for s in args.study.split(',')]
+    # Remove None entries (labels with no matching study), then sort by label
+    studies = sorted(set(s for s in studies if s is not None),
+                     key=lambda s: s.label.lower())
+ if len(studies) == 0:
+ logger.error(
+ 'No matches found for labels %s, stopping program' % args.study)
+ sys.exit(2)
+ enrolled_map = {}
+ for study in studies:
+ logger.info('Loading enrolled individuals for study %s' % study.label)
+ enrolled = kb.get_enrolled(study)
+ logger.debug('%d individuals loaded' % len(enrolled))
+ for en in enrolled:
+ if en.individual.id not in enrolled_map:
+ enrolled_map[en.individual.id] = (
+ '%s:%s' % (en.study.label, en.studyCode),
+ en.individual)
+ else:
+ logger.debug('Individual %s already mapped' % en.individual.id)
+ logger.debug('Loading EHR records')
+ ehr_records = kb.get_ehr_records()
+ logger.debug('%s EHR records loaded' % len(ehr_records))
+ ehr_records_map = {}
+ for r in ehr_records:
+ ehr_records_map.setdefault(r['i_id'], []).append(r)
+ affection_map = {}
+ arch, field = dobj.get_openehr_data()
+ for ind_id, ehr_recs in ehr_records_map.iteritems():
+ affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary()
+ ehr = EHR(ehr_recs)
+ for k, v in dobj.get_diagnosis().iteritems():
+ for d in v:
+ if ehr.matches(arch, field, d):
+ affection_map[ind_id][k] = PLINK_AFFECTED
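+    # Illustrative example: with the default YAML config, an individual
+    # whose EHR records match icd10-cm:G35 only ends up with
+    #   affection_map[ind_id] == {'t1d': 1, 'ms': 2, 'nefro': 1}
+    # where 1 = PLINK_UNAFFECTED and 2 = PLINK_AFFECTED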
+
+    immuno_inds = [i for (st_code, i) in enrolled_map.itervalues()]
+ families = build_families(immuno_inds, logger)
+ logger.info("found %d families" % len(families))
+
+ def resolve_label(i):
+ try:
+ return enrolled_map[i.id][0]
+ except KeyError:
+ return i.id
+
+ def resolve_pheno(i):
+ try:
+ immuno_affection = affection_map[i.id]
+ except KeyError:
+ return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()]
+ return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()]
+
+ kb.Gender.map_enums_values(kb)
+    def gender_map(x):
+        # ped convention: 1 = male, 2 = female
+        return (2 if x.enum_label() == kb.Gender.FEMALE.enum_label()
+                else 1)
+
+ for d in dobj.get_diagnosis_label():
+ FIELDS.append("_".join([d, "status"]))
+ with open(args.ofile, "w") as f:
+ writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n")
+ if args.write_header:
+ writer.writeheader()
+ families_data = []
+ logger.info("building families data")
+ for k, fam in enumerate(families):
+ fam_label = "FAM_%d" % (k + 1)
+ for i in fam:
+ r = {"fam_label": fam_label,
+ "ind_label": resolve_label(i),
+ "fat_label": 0 if (i.father is None or i.father not in fam)
+ else resolve_label(i.father),
+ "mot_label": 0 if (i.mother is None or i.mother not in fam)
+ else resolve_label(i.mother),
+ "gender": gender_map(i.gender)}
+ for p in resolve_pheno(i):
+ r["_".join([p[0], "status"])] = p[1]
+ families_data.append(r)
+ logger.info("writing miniped")
+ writer.writerows(families_data)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/build_miniped.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/build_miniped.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,58 @@
+
+
+ Build a reduced ped file from Omero server
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=build_miniped.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ #if $study
+ --study=${study}
+ #end if
+ --ofile=${output1}
+ --logfile=${logfile}
+ #if $write_header
+ --write_header
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It will output a tsv file with one row per individual.
+
+ The labels of the columns are:
+
+ * family
+ * individual enrollment code (STUDY:CODE)
+ * father enrollment code (STUDY:CODE)
+ * mother enrollment code (STUDY:CODE)
+ * gender
+ * T1D affection status
+ * MS affection status
+ * Nefro affection status
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/build_miniped.yaml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/build_miniped.yaml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,18 @@
+openEHR:
+ archetype: openEHR-EHR-EVALUATION.problem-diagnosis.v1
+ field: at0002.1
+diagnosis:
+ 1:
+ label: t1d
+ values:
+ - icd10-cm:E10
+ 2:
+ label: ms
+ values:
+ - icd10-cm:G35
+ 3:
+ label: nefro
+ values:
+ - icd10-cm:E23.2
+ - icd10:N00-N08
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/check_merge_individuals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/check_merge_individuals.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,104 @@
+import sys, csv, argparse
+from collections import Counter
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='check data that will be passed to the merge_individuals tool')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='input file')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file')
+ return parser
+
+
+def get_invalid_vids(records, logger):
+ records_map = {}
+ invalid_vids = []
+
+ for rec in records:
+ for k,v in rec.iteritems():
+ records_map.setdefault(k, []).append(v)
+ # Check for duplicated sources
+    ct = Counter(records_map['source'])
+ for k, v in ct.iteritems():
+ if v > 1:
+ logger.error('ID %s appears %d times as source, this ID has been marked as invalid' % (k, v))
+ invalid_vids.append(k)
+    # Check for VIDs that appear both in 'source' and 'target' fields
+ sources = set(records_map['source'])
+ targets = set(records_map['target'])
+ commons = sources.intersection(targets)
+ for c in commons:
+ logger.error('ID %s appears both in \'source\' and \'target\' columns, this ID has been marked as invalid' % c)
+ invalid_vids.append(c)
+
+ return set(invalid_vids)
+
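+# Illustrative example: given records [{'source': 'A', 'target': 'B'},
+# {'source': 'A', 'target': 'C'}, {'source': 'C', 'target': 'D'}],
+# get_invalid_vids returns set(['A', 'C']): 'A' appears twice as a
+# source and 'C' appears both as a source and as a target.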
+
+def check_row(row, individuals, logger):
+ try:
+ source = individuals[row['source']]
+ logger.debug('%s is a valid Individual ID' % source.id)
+ target = individuals[row['target']]
+ logger.debug('%s is a valid Individual ID' % target.id)
+ return True
+ except KeyError, ke:
+ logger.error('%s is not a valid Individual ID' % ke)
+ return False
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('check_merge_individuals', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Preloading all individuals')
+ inds = kb.get_objects(kb.Individual)
+ logger.info('Loaded %d individuals' % len(inds))
+ inds_map = {}
+ for i in inds:
+ inds_map[i.id] = i
+
+ with open(args.in_file) as infile, open(args.out_file, 'w') as outfile:
+ reader = csv.DictReader(infile, delimiter='\t')
+ records = [row for row in reader]
+ invalid_vids = get_invalid_vids(records, logger)
+
+ writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t')
+ writer.writeheader()
+
+ for record in records:
+ if record['source'] in invalid_vids or record['target'] in invalid_vids:
+ logger.error('Skipping record %r because at least one ID was marked as invalid' % record)
+ else:
+ if check_row(record, inds_map, logger):
+ writer.writerow(record)
+ logger.debug('Record %r written in output file' % record)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/check_merge_individuals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/check_merge_individuals.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,41 @@
+
+
+ Verify data that will be passed to the merge_individuals tool
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=check_merge_individuals.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --in_file=${input1}
+ --out_file=${output1}
+ --logfile=${logfile}
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+It will check the records for the merge_individuals tool using information from a file like this::
+
+ source target
+ V08E18411BC66F4987BCA43EFC6F636224 V0AE5660BF4A7149589BE9DB3308B50327
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/check_update_parents_data.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/check_update_parents_data.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,113 @@
+import sys, csv, argparse
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='check data that will be passed to the update_parents tool')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='input file')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file')
+ return parser
+
+
+def check_row(row, individuals_map, kb, logger):
+ logger.debug('Checking record %r' % row)
+ try:
+ ind = individuals_map[row['individual']]
+ logger.info('%s is a valid Individual ID' % ind.id)
+ if row['father'] != 'None':
+ father = individuals_map[row['father']]
+ logger.info('%s is a valid Individual ID' % father.id)
+ check_gender(father, kb.Gender.MALE)
+ logger.info('Gender check passed')
+ else:
+ logger.info('None value, no check required')
+ if row['mother'] != 'None':
+ mother = individuals_map[row['mother']]
+ logger.info('%s is a valid Individual ID' % mother.id)
+ check_gender(mother, kb.Gender.FEMALE)
+ logger.info('Gender check passed')
+ else:
+ logger.info('None value, no check required')
+ return True
+ except KeyError, ke:
+ logger.error('%s is not a valid Individual ID, rejecting row' % ke)
+ return False
+ except ValueError, ve:
+ logger.error(ve)
+ return False
+
+
+def check_gender(individual, gender):
+    if individual.gender.enum_label() != gender.enum_label():
+        raise ValueError(
+            'Gender for individual %s is %s, expected %s, rejecting row'
+            % (individual.id, individual.gender.enum_label(),
+               gender.enum_label()))
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('check_update_parents_data', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Preloading all individuals from the system')
+ inds = kb.get_objects(kb.Individual)
+ logger.info('%d individuals loaded' % len(inds))
+ inds_lookup = {}
+ for i in inds:
+ inds_lookup[i.id] = i
+
+ with open(args.in_file) as infile, open(args.out_file, 'w') as outfile:
+ reader = csv.DictReader(infile, delimiter='\t')
+ records = list(reader)
+        logger.info('Checking for duplicates in the \'individual\' column')
+        recs_by_ind = {}
+        for rec in records:
+            recs_by_ind.setdefault(rec['individual'], []).append(rec)
+        duplicated = [k for k, v in recs_by_ind.iteritems() if len(v) > 1]
+        for dupl in duplicated:
+            logger.info('Individual %s is duplicated' % dupl)
+            for r in recs_by_ind.pop(dupl):
+                logger.info('Removing record %r' % r)
+        good_records = sum(recs_by_ind.itervalues(), [])
+        logger.info('Duplicates check completed')
+ writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t')
+ writer.writeheader()
+ logger.info('Checking records')
+ for row in good_records:
+ if check_row(row, inds_lookup, kb, logger):
+ writer.writerow(row)
+ logger.debug('Record %r written in output file' % row)
+ logger.info('Records check completed')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/check_update_parents_data.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/check_update_parents_data.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,42 @@
+
+
+ Verify data that will be passed to the update_parents tool
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=check_update_parents_data.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --in_file=${input1}
+ --out_file=${output1}
+ --logfile=${logfile}
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+It will check parental info of individuals using information from a file like this::
+
+ individual father mother
+ V08E18411BC66F4987BCA43EFC6F636224 None None
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/convert_sam.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/convert_sam.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.snp_manager.main import main
+
+main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/convert_sam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/convert_sam.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,40 @@
+
+ converter
+
+ launcher.sh
+ --interpreter=python
+ --runner=convert_sam.py
+ --logfile ${log_file} convert_sam -i ${input_file}
+ -o ${output_file} --reftag ${dbkey} --output-format ${output_type}
+ ## FIXME: find a way to import the default from the relevant module
+ --flank-size 125
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts SAM alignment data to VL marker alignment or Galaxy
+extract genomic DNA input.
+
+Expects single-end BWA alignment data produced by the previous steps
+in the workflow (see markers_to_fastq).
+
+**NOTE:** if the marker_alignment output format is selected, the
+Database/Build property must be set in the input SAM file.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/enrollments_by_platewells.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/enrollments_by_platewells.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,77 @@
+"""
+From a list of platewells, retrieve the enrollment codes of the
+connected individuals
+"""
+import argparse
+import csv
+import sys
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='from platewells to enrollment codes')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices = LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('-S', '--study', type=str, required=True,
+ help='a study used to retrieve individuals')
+ parser.add_argument('--ifile', type=str, required=True,
+ help='list of platewells used to fetch data')
+ parser.add_argument('--ofile', type=str, help='output file path',
+ required=True)
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('pws2enrolls', level=args.loglevel,
+ filename=args.logfile)
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ study = kb.get_study(args.study)
+ enrolled_map = {e.individual.id:e for e in kb.get_enrolled(study)}
+ logger.info('Loaded {} enrolled individuals for study {}'.format(len(enrolled_map), study.label))
+ plates = kb.get_objects(kb.TiterPlate)
+ logger.info('Loaded {} plates'.format(len(plates)))
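+    # pws_map keys look like 'A9033P3B:C09' (plate barcode + ':' + well
+    # label), matching the 'platewell' column expected in the input file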
+ pws_map = {':'.join([w.container.barcode, w.label]):w for w in kb.get_objects(kb.PlateWell)
+ if w.container.barcode}
+ logger.info('Loaded {} platewells'.format(len(pws_map)))
+    with open(args.ofile, 'w') as of, open(args.ifile, 'r') as f:
+        writer = csv.DictWriter(of, ['platewell', 'status', 'enrollment'],
+                                delimiter='\t', quotechar='"', restval='None')
+        writer.writeheader()
+        reader = csv.DictReader(f, delimiter='\t')
+        logger.info('Searching individuals connected to the platewells')
+        for r in reader:
+            ind = kb.dt.get_connected(pws_map[r['platewell']], kb.Individual,
+                                      kb.dt.DIRECTION_INCOMING)
+            try:
+                record = {'platewell': r['platewell'],
+                          'status': pws_map[r['platewell']].status.enum_label(),
+                          'enrollment': ':'.join(
+                              [study.label, enrolled_map[ind[0].id].studyCode])}
+            except KeyError:
+                logger.warning('not enrolled {}'.format(r['platewell']))
+                record = {'platewell': r['platewell'],
+                          'status': pws_map[r['platewell']].status.enum_label(),
+                          'enrollment': ':'.join([study.label, 'not_enrolled'])}
+            writer.writerow(record)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/enrollments_by_platewells.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/enrollments_by_platewells.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,66 @@
+
+
+ From a list of platewells, retrieve the enrollment codes of the connected individuals
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=enrollments_by_platewells.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --logfile=${logfile}
+ --ifile=$input
+ --ofile=${output}
+ #if str($study_label) != 'no_study'
+ --study=${study_label}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Starting from a list of platewells, this tool retrieves the enrollment
+code of the individual connected to each well.
+
+Input file must be in TABULAR format and like::
+
+ platewell
+ A9033P3B:C09
+ A9033P3B:G09
+ A9033P3B:G10
+ A9033P3B:H05
+ A9033WRT:E08
+ A9033WRT:E10
+ A9033WRT:F03
+ A9033WRT:F04
+ ...
+
+Output file will be like::
+
+ platewell status enrollment
+ A9033P3B:C09 DISCARDED AUTOIMMUNITY:X3M6XP517
+ A9033P3B:G09 DISCARDED AUTOIMMUNITY:RYMRK2NLJ
+ A9033P3B:G10 DISCARDED AUTOIMMUNITY:OV13V99M9
+ A9033P3B:H05 DISCARDED AUTOIMMUNITY:OV13ZQK19
+ A9033WRT:E08 DISCARDED AUTOIMMUNITY:7GMWNX9M8
+ A9033WRT:E10 DISCARDED AUTOIMMUNITY:R3MKP0GL4
+ A9033WRT:F03 DISCARDED AUTOIMMUNITY:N1VD2Q915
+ A9033WZT:A04 CONTENTUSABLE AUTOIMMUNITY:210JRG4MW
+ ...
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/flowcell_samplesheet.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/flowcell_samplesheet.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,107 @@
+
+
+ Build the samplesheet for a given FlowCell
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --loglevel=$__app__.config.vl_loglevel
+ --logfile=${logfile}
+ --ofile=${outfile}
+ flowcell_samplesheet
+ #if $namespace.ignore_namespace
+ --flowcell=${flowcell}
+ --ignore_namespace
+ #else
+ --flowcell="${namespace.namespace_value}|${flowcell}"
+ #end if
+ #if $remove_namespaces
+ --remove_namespaces
+ #end if
+ #if $add_sample_label
+ --sample_label
+ #end if
+ --separator=${csv_separator}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Export data related to a FlowCell identified by the Flowcell ID field
+into a csv file like::
+
+  FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator
+  foofc1,1,v012aa2,hg19,ATCACG,,WG,john doe
+  foofc1,2,v0441a1,hg19,GATCAG,,EXOME,john doe
+  foofc1,2,v021441,hg19,TAGCTT,,WG,john doe
+  ...
+
+If the checkbox "Add sample labels" is enabled, the output file will
+have a new column at the end of each row with the label of the sample,
+like::
+
+  FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleLabel
+  foofc1,1,v012aa2,hg19,ATCACG,,WG,john doe,foosmp1
+  foofc1,2,v0441a1,hg19,GATCAG,,EXOME,john doe,foosmp2
+  foofc1,2,v021441,hg19,TAGCTT,,WG,john doe,foosmp3
+  ...
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/get_kinship_input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/get_kinship_input.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,86 @@
+
+
+ Build input files for kinship MR application
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ofile=${output_gen}
+ extract_gt
+ #if str($study) != 'no_study'
+ --study=${study}
+ #end if
+ --out_samples_list=${output_sl}
+ --marker_set=${mset}
+ #if $transpose_output
+ --transpose_output
+ #end if
+ --compress_output
+ --compression_level=${compression_level}
+ #if $ignore_duplicated
+ --ignore_duplicated
+ #end if
+ #if $enable_debug
+ --loglevel=DEBUG
+ #if str($data_collection) != 'no_collection'
+ --data_collection=${data_collection}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/get_parents_from_sibling.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/get_parents_from_sibling.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,89 @@
+'''
+From a file like this
+individual sibling
+V08E18411BC66F4987BCA43EFC6F636224 V0AE5660BF4A7149589BE9DB3308B50327
+V0FAE2B10F690041509739A3F4B314DC8F V00875417B31684EC2A62EE37717913445
+V0382EF862AA4B475697C95D3777043239 V08E376727ED8E4B369DAA3B62A9395E1B
+....
+
+retrieve each individual's parents using sibling information and build a file like
+
+individual father mother
+V08E18411BC66F4987BCA43EFC6F636224 V027DE334753424F07B81A70053EF5B873 V035222CAEE0474AFEBB9A161D4B64914E
+V0FAE2B10F690041509739A3F4B314DC8F V0E966B53BDCC942C09D6B6D96DE98F4F4 V0F7B6926C6FBE4F0BB38BBC6CFB13A825
+....
+
+'''
+
+import sys, csv, argparse
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='retrieve parents information using sibling')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero passwd')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='input file with individual-sibling couples')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file with parents information')
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('get_parents_from_sibling', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Retrieving individuals')
+ inds = kb.get_objects(kb.Individual)
+ logger.info('Retrieved %d individuals' % len(inds))
+ inds_lookup = {}
+ for ind in inds:
+ inds_lookup[ind.id] = ind
+
+ with open(args.in_file) as infile:
+ reader = csv.DictReader(infile, delimiter='\t')
+ records = []
+ for row in reader:
+ try:
+ sib = inds_lookup[row['sibling']]
+ rec = {'individual' : row['individual'],
+ 'father' : sib.father.id if sib.father else 'None',
+ 'mother' : sib.mother.id if sib.mother else 'None'}
+ logger.info('Individual %s, father: %s - mother: %s' % (row['individual'],
+ rec['father'],
+ rec['mother']))
+ records.append(rec)
+ except KeyError:
+ logger.error('Unable to find individual %s' % row['sibling'])
+
+ logger.info('Retrieved parents for %d individuals' % len(records))
+
+ with open(args.out_file, 'w') as outfile:
+ writer = csv.DictWriter(outfile, ['individual', 'father', 'mother'],
+ delimiter='\t')
+ writer.writeheader()
+ writer.writerows(records)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/get_parents_from_sibling.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/get_parents_from_sibling.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,50 @@
+
+
+ Retrieve individuals' parents using sibling information
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=get_parents_from_sibling.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --in_file=${input1}
+ --out_file=${output1}
+ --logfile=${logfile}
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+It will retrieve individuals' parents using sibling information from a file like this::
+
+ individual sibling
+ V08E18411BC66F4987BCA43EFC6F636224 V0AE5660BF4A7149589BE9DB3308B50327
+ V0FAE2B10F690041509739A3F4B314DC8F V00875417B31684EC2A62EE37717913445
+ V0382EF862AA4B475697C95D3777043239 V08E376727ED8E4B369DAA3B62A9395E1B
+
+and build a tsv file like this::
+
+ individual father mother
+ V08E18411BC66F4987BCA43EFC6F636224 None None
+ V0FAE2B10F690041509739A3F4B314DC8F V07282522B89FC4F7CA08094537A13C0D1 V09D459311D1254095AE9F00B45E5A101E
+ V0382EF862AA4B475697C95D3777043239 V04CD9561F753F4853838E2E96819AAAC0 V0382EF862AA4B475697C95D3777043239
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/get_studies_details.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/get_studies_details.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,59 @@
+'''
+Dump, for every study known to the biobank, its label, description and
+the number of enrolled individuals to a TSV file.
+'''
+
+import argparse, csv, sys
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='retrieve studies details')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero passwd')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file with studies details')
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('get_studies', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Retrieving studies')
+ studies = kb.get_objects(kb.Study)
+ logger.info('Retrieved {} studies'.format(len(studies)))
+ records = []
+ for s in studies:
+ enr = kb.get_enrolled(s)
+ rec = {'label': s.label,
+ 'description': s.description,
+ 'enrolled': len(enr)}
+ records.append(rec)
+
+ with open(args.out_file, 'w') as outfile:
+ writer = csv.DictWriter(outfile, ['label', 'description', 'enrolled'],
+ delimiter='\t')
+ # writer.writeheader()
+ writer.writerows(sorted(records, key=lambda key: key['label']))
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/get_studies_details.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/get_studies_details.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,29 @@
+
+
+ Retrieve studies details
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=get_studies_details.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --out_file=${output1}
+ --logfile=${logfile}
+
+
+
+
+
+
+
+
+
+
+
+It will produce a list of the studies known by the biobank server, one
+row per study with its label, description and number of enrolled
+individuals
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/global_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/global_stats.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,46 @@
+
+
+ Provide global statistics for a given study.
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ofile=${output1}
+ --logfile=${logfile}
+ global_stats
+ #if str($study) != 'all_known_studies'
+ --study=${study}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It will output a tsv file with the following columns::
+
+ study diagnosis technology gender counts
+ BSTUDY icd10-cm:G35 AffymetrixCelGENOMEWIDESNP_6 MALE 1
+ BSTUDY icd10-cm:E10 AffymetrixCelGENOMEWIDESNP_6 FEMALE 1
+ BSTUDY local:at0.3 AffymetrixCelGENOMEWIDESNP_6 MALE 2
+ BSTUDY icd10-cm:G35;icd10-cm:E10;icd10-cm:E10 AffymetrixCelGENOMEWIDESNP_6 MALE 1
+ BSTUDY icd10-cm:G35 AffymetrixCelGENOMEWIDESNP_6 FEMALE 1
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/gstudio_datasheet.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/gstudio_datasheet.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,46 @@
+
+
+ Build a Genome Studio datasheet for the given plate
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ofile=${outfile}
+ gstudio_datasheet
+ #if str($plate) != 'no_plate'
+ --plate=${plate}
+ --manifest=${manifest}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Build a Genome Studio datasheet for the given plate
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/kb_query.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/kb_query.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,9 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.kb_query.main import main as kb_query
+
+kb_query(sys.argv[1:])
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/launcher.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/launcher.sh Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,56 @@
+#!/bin/bash
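+# Usage sketch (as invoked by the XML tool wrappers in this repository):
+#   launcher.sh --interpreter=python --runner=<tool>.py \
+#     --host=<omero host> --user=<omero user> --passwd=<omero password> \
+#     [tool-specific arguments...]
+# --interpreter and --runner are consumed here; all remaining arguments
+# (including host/user/passwd) are forwarded to the runner script.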
+
+CMD=""
+PYTH_PATH="PYTHONPATH=/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/p\
+ython2.7/site-packages/:/SHARE/USERFS/els7/users/biobank/lib/"
+runner="$(dirname ${BASH_SOURCE[0]})/"
+until [ -z "$1" ]
+ do
+
+ opt_host='--host='
+ opt_user='--user='
+ opt_passwd='--passwd='
+ opt_interpreter='--interpreter='
+ opt_runner='--runner='
+ if [[ $1 == $opt_host* ]]; then
+ host=`echo $1 | cut -d '=' -f2 | cut -d '.' -f1`
+    if [ -z "$host" ] || [ "$host" = 'None' ]; then
+      echo 'ERROR. Missing omero host parameter. Please, set Omero Host in your user preferences' >&2
+      exit 1
+ fi
+ PYTH_PATH+=$host
+ HOST=`echo $1 | cut -d '=' -f2`
+ CMD+=' '$1
+ elif [[ $1 == $opt_user* ]]; then
+ user=`echo $1 | cut -d '=' -f2`
+    if [ -z "$user" ] || [ "$user" = 'None' ]; then
+      echo 'ERROR. Missing omero user parameter. Please, set Omero User in your user preferences' >&2
+      exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_passwd* ]]; then
+ passwd=`echo $1 | cut -d '=' -f2`
+    if [ -z "$passwd" ] || [ "$passwd" = 'None' ]; then
+      echo 'ERROR. Missing omero password parameter. Please, set Omero Password in your user preferences' >&2
+      exit 1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_runner* ]]; then
+ runner+=`echo $1 | cut -d '=' -f2`
+ elif [[ $1 == $opt_interpreter* ]]; then
+ interpreter=`echo $1 | cut -d '=' -f2`
+ else
+ CMD+=' '$1
+ fi
+ shift
+done
+# PYTH_PATH already contains the leading 'PYTHONPATH=' assignment
+export "$PYTH_PATH/:$PYTHONPATH"
+profile="/SHARE/USERFS/els7/users/biobank/lib/${HOST}.biobank.profile"
+if [ -f "$profile" ]; then
+  source "$profile"
+ CMD=$interpreter' '$runner$CMD
+ $CMD
+else
+  echo "ERROR. Biobank profile file doesn't exist. Please, check Omero Host in your user preferences" >&2
+  exit 1
+fi
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/lookup_index.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/lookup_index.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.snp_manager.main import main
+
+main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/lookup_index.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/lookup_index.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,29 @@
+
+ lookup dbSNP index
+
+ lookup_index.py --logfile ${log_file} lookup_index -i ${input_file}
+ -o ${output_file}
+ --index-file "${ filter( lambda x: str( x[0] ) == str( $indices ), $__app__.tool_data_tables[ 'dbsnp_indexes' ].get_fields() )[0][-1] }"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool reads a Galaxy genome segment extractor output in interval
+format and performs a lookup in the selected dbSNP index to get the
+true rs label. It outputs a new marker definitions file with the true
+rs labels and masks.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/map_to_collection.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/map_to_collection.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,70 @@
+
+
+ Map a list of objects (vessels or data samples) to the specified
+ collection.
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ofile=${outfile}
+ map_to_collection
+ --ifile=${infile}
+ #if str($field_label) != ''
+ --field_label=${field_label}
+ #end if
+ #if str($collection_selector.collection_type) != 'no_coll_selected'
+ --collection_type=${collection_selector.collection_type}
+ #if str($collection_selector.collection_type) == 'DataCollection'
+ #if str($collection_selector.dcoll_label) != 'no_label_selected'
+ --collection_label=${collection_selector.dcoll_label}
+ #end if
+ #elif str($collection_selector.collection_type) == 'VesselsCollection'
+ #if str($collection_selector.vcoll_label) != 'no_label_selected'
+ --collection_label=${collection_selector.vcoll_label}
+ #end if
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Select a Vessels Collection...
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/map_vid.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/map_vid.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,25 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.kb_query.main import main as kb_query
+
+def main(argv):
+ selected_column, new_column_name, input_file = argv[:3]
+ selected_column = int(selected_column) - 1
+ new_column_name = new_column_name.strip()
+
+    with open(input_file) as f:
+        l = f.readline().strip()
+
+ column_names = l.split('\t')
+ column_name = column_names[selected_column]
+
+ argv = argv[3:] + ['--column=%s,%s' % (column_name, new_column_name)]
+ kb_query(argv)
+
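+# Illustrative example (hypothetical file and labels): invoked as
+#   map_vid.py 3 source in.tsv --host=... map_vid --ifile=in.tsv ...
+# with a header line 'target<TAB>label<TAB>individual', column 3
+# ('individual') is forwarded to kb_query as --column=individual,source.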
+main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/map_vid.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/map_vid.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,145 @@
+
+
+ Map labels of objects known to OMERO.biobank to their VID
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=map_vid.py
+ ${selected_column}
+ ${new_column_name}
+ ${input1}
+ #if $omero_configuration.level == 'advanced'
+ --host=$omero_configuration.vl_host
+ --user=$omero_configuration.vl_user
+ --passwd=$omero_configuration.vl_passwd
+ #else
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ #end if
+ --operator=$__user_email__
+ --ofile=${output1}
+ --logfile=${logfile}
+ map_vid
+ --ifile=${input1}
+ --source-type=${source_type.source_type}
+ #if $source_type.source_type == 'Individual'
+ #if str($source_type.study) != 'use_provided'
+ --study=${source_type.study}
+ #end if
+ #end if
+ #if $strict_mapping
+ --strict-mapping
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+The tool resolves VIDs for the given column and renames the column
+itself with a new label. Usually the plain item label is enough to map
+an item's VID, but in some cases a special syntax is needed:
+
+* for Individual items, if no default study is provided, the pattern
+ to be used is **STUDY:STUDY_LABEL**. If a default study is provided,
+ the column must contain only the STUDY_LABEL
+
+* for PlateWell items the pattern is **PLATE_LABEL:WELL_LABEL**
+
+* for DataCollectionItem items the pattern is
+ **DATA_COLLECTION_LABEL:ITEM_LABEL**
+
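+For example (illustrative labels), with source type PlateWell a column
+containing::
+
+  MYPLATE01:A01
+
+is resolved to the VID of well A01 on plate MYPLATE01.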
+
+
+
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/markers_to_fastq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/markers_to_fastq.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.snp_manager.main import main
+
+main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/markers_to_fastq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/markers_to_fastq.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,36 @@
+
+ converter
+
+ launcher.sh
+ --interpreter=python
+ --runner=markers_to_fastq.py --logfile ${log_file} markers_to_fastq
+ -i ${input_file} -o ${output_file}
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts VL marker definitions to fastq data.
+
+VL marker definitions files have the following format (spaces are tabs)::
+
+ label rs_label mask
+ SNP_A-1780419 rs6576700 GGATACATTTTATTGC[A/G]CTTGCAGAGTATTTTT
+ SNP_A-1780418 rs17054099 GGATACATTACCCAAA[C/T]GGTCACAGGTCAAAGG
+ SNP_A-1780415 rs7730126 GGATACATCCCCCCCA[A/G]AAAATGAGAATAAAGC
+ ...
+
+Where "label" is a unique identifier, "rs_label" is the dbSNP label
+and "mask" is the SNP's mask in the
+LEFT_FLANK[ALLELE_A/ALLELE_B/...]RIGHT_FLANK format. One fastq record
+is generated for each allele in the mask. The string "None" in the
+rs_label column means there is no rs label for the marker.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/plate_dsamples_details.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/plate_dsamples_details.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,86 @@
+
+
+ Retrieve wells and connected data samples related to a known plate
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ofile=${output}
+ plate_data_samples
+ #if str($plate) != 'select_one'
+ --plate=${plate}
+ #end if
+ #if $fetch_all
+ --fetch_all
+ #end if
+ #if str($vcoll_label) != 'no_collection'
+ --vessels_collection=${vcoll_label}
+ #end if
+ #if $vessel_types
+ --ignore_types=${vessel_types}
+ #end if
+ #if str($study_label) != 'no_study'
+ --map_study=${study_label}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Using one of the selectable plate barcodes, the tool will generate a
+report file for the plate like::
+
+ PLATE_barcode PLATE_label WELL_label WELL_status DATA_SAMPLE_label
+ XXYYZZKK test_plate A01 CONTENTUSABLE a01_test_sample
+ XXYYZZKK test_plate A02 CONTENTUSABLE X
+ XXYYZZKK test_plate A03 UNKNOWN OR EMPTY X
+ XXYYZZKK test_plate A04 CONTENTUSABLE a04_test_sample
+ XXYYZZKK test_plate A05 DISCARDED X
+ ...
+
+For each plate, all wells will be reported in the output file, even
+the ones not actually recorded into the system; these wells will be
+marked with an 'UNKNOWN OR EMPTY' status.
+
+For each well, the tool performs a query in order to find whether at
+least one data sample is directly connected to the well itself; if at
+least one is found, the label of the data sample is placed in the
+DATA_SAMPLE_label column, otherwise an 'X' is placed.
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/query.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/query.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,79 @@
+
+
+ Provides a simplified environment to perform complex queries to
+ BIOBANK.
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ofile=${output1}
+ --logfile=${logfile}
+ query
+ --group=$study
+ --code-file=$code_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${str($query_code).replace('__sq__', "'").replace('__cr____cn__', '\n')}
+
+
+
+
+
+
+
+
+The following is an example of a query that will dump family relations
+within the group::
+
+ writeheader('study', 'id', 'gender', 'father', 'mother')
+ for i in Individuals(group):
+ writerow(group.id, enum_label(i.gender),
+ i.father.id if i.father else 'None',
+ i.mother.id if i.mother else 'None')
+
+
+The next example will prepare a file that could be used to define a
+data collection and then as the input for a genotyping run::
+
+ writeheader('dc_id', 'gender', 'data_sample',
+ 'path', 'mimetype', 'size', 'sha1')
+ for i in Individuals(group):
+ for d in DataSamples(i, 'AffymetrixCel'):
+ for o in DataObjects(d):
+ writerow(group.id, enum_label(i.gender), d.id,
+ o.path, o.mimetype, o.size, o.sha1)
+
+In the examples above, '''group''' (actually a study) corresponds to
+the group whose label is assigned by the '''--group''' flag.
+
+**Note** This is clearly an extremely dangerous tool.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/select_sub_group.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/select_sub_group.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,129 @@
+
+
+ Selects groups of individuals.
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --ofile=${output1}
+ --logfile=${logfile}
+ selector
+ #if str($study) != 'use_all'
+ --study=$study
+ #end if
+ --group-label=$group_label
+ --total-number=$total_number
+ --male-fraction=$male_fraction
+ --reference-disease=$reference_diagnosis
+ --control-fraction=$control_fraction
+ #if str($required_datasample) != 'unselect'
+ --required-datasample=$required_datasample
+ #end if
+ #if int($seed) != 0
+ --seed=$seed
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+It will select a group of individuals from a specific group (or from
+all available individuals, if no group is selected). The selection is
+controlled by the following parameters:
+
+ * total number of individuals selected
+ * male fraction
+ * reference disease
+ * control fraction
+ * presence of specific datasets
+
+The results will be presented as a file that can be used to generate a
+new group (actually a study). The file will have the following columns::
+
+ study label individual
+ XXX 0001 V20940239409
+ XXX 0002 V20940239509
+ XXX 0003 V20940239609
+ XXX 0004 V20940239709
+ ...
+
+ where study is the name of the new study
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/snp_manager.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/snp_manager.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,7 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+import sys
+from bl.vl.app.snp_manager.main import main as snp_manager
+
+snp_manager(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/snp_manager.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/snp_manager.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,26 @@
+
+ get true rs label and mask from dbSNP
+
+ launcher.sh
+ --interpreter=python
+ --runner=snp_manager.py
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/tools/vessels_by_individual.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/vessels_by_individual.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,74 @@
+
+
+ Retrieve all vessels related to individuals passed with the input
+ file. Vessel type and a Vessel Collection can be used as filters.
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=kb_query.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ofile=${outfile}
+ vessels_by_individual
+ --ifile=${infile}
+ #if str($collection_label) != 'no_collection'
+ --vessels_collection=${collection_label}
+ #end if
+ #if str($vessel_type) != 'no_type'
+ --vessel_type=${vessel_type}
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/change_source_item.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/change_source_item.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,258 @@
+# The tool changes the source of an object inside the system.
+# Expected input file format is
+#
+# target new_source
+# V1415515 V1241441
+# V1351124 V1511141
+# .....
+#
+# where target is the object whose source will be replaced by the
+# new_source object. The target and new-source types are specified via
+# the --target_type and --source_type command line options.
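+#
+# Illustrative invocation (argument values are placeholders):
+#   python change_source_item.py --operator=jdoe --in_file=changes.tsv \
+#     --target_type=PlateWell --source_type=Individual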
+
+import csv, argparse, sys, json, time
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+import omero
+import omero.model
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='change the source for given items')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logger level', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--operator', type=str, required=True,
+ help='operator username')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='list of items with new sources')
+ parser.add_argument('--target_type', type=str, required=True,
+ help='type of the target objects')
+ parser.add_argument('--source_type', type=str, required=True,
+ help='type of the new source objects')
+ return parser
+
+
+def do_check(records, targets, sources,
+ target_type, source_type,
+ kb, logger):
+ logger.info('Starting consistency checks')
+ src_map = dict([(s.id, s) for s in sources])
+ trg_map = dict([(t.id, t) for t in targets])
+ good_records = []
+ targets = {}
+ sources = {}
+ for i, r in enumerate(records):
+ if r['target'] not in trg_map:
+ logger.warning('No %s with ID %s, rejecting record %d' % (target_type,
+ r['target'], i))
+ continue
+ if r['new_source'] not in src_map:
+ logger.warning('No %s with ID %s, rejecting record %d' % (source_type,
+ r['new_source'], i))
+ continue
+ targets[r['target']] = trg_map[r['target']]
+ sources[r['new_source']] = src_map[r['new_source']]
+ good_records.append(r)
+ logger.info('Done with consistency checks')
+ return good_records, targets, sources
+
+
+def update_data(records, targets, sources, operator, act_conf,
+ kb, logger, batch_size = 500):
+ def get_chunk(batch_size, records):
+ offset = 0
+ while len(records[offset:]) > 0:
+ yield records[offset:offset+batch_size]
+ offset += batch_size
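+    # e.g. with the default batch_size of 500, 1200 records are
+    # processed in chunks of 500, 500 and 200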
+ dev = get_device(kb, logger)
+ for i, recs in enumerate(get_chunk(batch_size, records)):
+ logger.info('Updating batch %d' % i)
+ batch_to_save = []
+ edges_to_delete = []
+ for r in recs:
+ target = targets[r['target']]
+ # Build the ActionOnAction backup object
+ if not target.lastUpdate:
+ last_action = target.action
+ else:
+ last_action = target.lastUpdate
+ old_action = target.action
+ asconf = {'backup' : {'action' : old_action.id}}
+ aslabel = 'updater.update_source_item-%f' % time.time()
+ backup = build_action(operator, old_action.context,
+ dev, last_action, aslabel,
+ asconf, kb, logger)
+ target.lastUpdate = backup
+ # Build the Action in order to attach the new source to
+ # the target object
+ new_source = sources[r['new_source']]
+ if new_source.is_mapped:
+ new_source.unload()
+ asconf = act_conf
+ aslabel = 'updater.update_source_item-%f' % time.time()
+ new_act = build_action(operator, old_action.context,
+ dev, new_source, aslabel,
+ asconf, kb, logger)
+ target.action = new_act
+ if old_action.OME_TABLE == 'Action':
+ # no old source, just save the new action
+ batch_to_save.append(target)
+ else:
+ # check if the old target and the new one are different
+ if new_source != old_action.target:
+ batch_to_save.append(target)
+ edges_to_delete.append((old_action.target, target))
+ if len(batch_to_save) > 0:
+ kb.save_array(batch_to_save)
+ else:
+        logger.info('No records need to be updated')
+ for vert in edges_to_delete:
+ kb.dt.destroy_edge(*vert)
+
+
+def build_action(operator, context, device, target,
+ action_setup_label, action_setup_conf,
+ kb, logger):
+ if action_setup_label:
+ asetup = get_action_setup(action_setup_label, action_setup_conf,
+ kb, logger)
+ else:
+ asetup = None
+ aconf = {
+ 'device' : device,
+ 'actionCategory' : kb.ActionCategory.IMPORT,
+        'operator' : operator,
+ 'context' : context,
+ 'target' : target,
+ }
+ if asetup:
+ aconf['setup'] = asetup
+ action = kb.factory.create(retrieve_action_type(target, kb), aconf)
+ return action
+
+
+def retrieve_action_type(target, kb):
+ tklass = target.ome_obj.__class__.__name__
+ for i, k in enumerate(target.ome_obj.__class__.__mro__):
+ if k is omero.model.IObject:
+ tklass = target.ome_obj.__class__.__mro__[i-1].__name__
+ if tklass == 'Vessel':
+ return kb.ActionOnVessel
+ elif tklass == 'Individual':
+ return kb.ActionOnIndividual
+ elif tklass == 'DataSample':
+ return kb.ActionOnDataSample
+ elif tklass == 'DataCollectionItem':
+ return kb.ActionOnDataCollectionItem
+ elif tklass == 'Action':
+ return kb.ActionOnAction
+ # elif tklass == 'VLCollection':
+ # return kb.ActionOnCollection
+ else:
+ raise ValueError('No Action related to %s klass' % tklass)
+
+
+def get_action_setup(label, conf, kb, logger):
+ asetup_conf = {
+ 'label' : label,
+ 'conf' : json.dumps(conf),
+ }
+ asetup = kb.factory.create(kb.ActionSetup, asetup_conf)
+ return asetup
+
+
+def get_device(kb, logger):
+ dev_model = 'UPDATE'
+ dev_maker = 'CRS4'
+ dev_release = '0.1'
+ dev_label = 'updater-%s.update_source_item' % dev_release
+ device = kb.get_device(dev_label)
+ if not device:
+ logger.debug('No device with label %s, creating one' % dev_label)
+ conf = {
+ 'maker' : dev_maker,
+ 'model' : dev_model,
+ 'release' : dev_release,
+ 'label' : dev_label,
+ }
+ device = kb.factory.create(kb.Device, conf).save()
+ return device
+
+
+def find_action_setup_conf(args):
+ action_setup_conf = {}
+ for x in dir(args):
+ if not (x.startswith('_') or x.startswith('func')):
+ action_setup_conf[x] = getattr(args, x)
+ if 'passwd' in action_setup_conf:
+        # Storing passwords into an Omero object is not a great idea...
+        action_setup_conf.pop('passwd')
+ return action_setup_conf
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('change_source_item', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ logger.info('Loading data from input file')
+ with open(args.in_file) as f:
+ reader = csv.DictReader(f, delimiter='\t')
+ records = list(reader)
+ logger.info('Loaded %d records' % len(records))
+
+ logger.info('Loading %s type objects' % args.target_type)
+ targets = kb.get_objects(getattr(kb, args.target_type))
+ logger.info('Loaded %d objects' % len(targets))
+ if len(targets) == 0:
+ msg = 'No targets loaded from the system, nothing to do'
+ logger.critical(msg)
+ sys.exit(msg)
+
+ logger.info('Loading %s type objects' % args.source_type)
+ sources = kb.get_objects(getattr(kb, args.source_type))
+ logger.info('Loaded %d objects' % len(sources))
+ if len(sources) == 0:
+ msg = 'No sources loaded from the system, nothing to do'
+ logger.critical(msg)
+ sys.exit(msg)
+
+ logger.info('Loading Action type objects')
+ acts = kb.get_objects(kb.Action)
+ logger.info('Loaded %d objects' % len(acts))
+
+ records, targets, sources = do_check(records, targets, sources,
+ args.target_type, args.source_type,
+ kb, logger)
+ if len(records) == 0:
+ msg = 'No records passed consistency checks, nothing to do'
+ logger.critical(msg)
+ sys.exit(msg)
+
+ aconf = find_action_setup_conf(args)
+
+ update_data(records, targets, sources, args.operator,
+ aconf, kb, logger)
+
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/change_source_item.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/change_source_item.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,41 @@
+
+
+ Change source items for given objects
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=change_source_item.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --in_file=${infile}
+ --target_type=${target_type}
+ --source_type=${source_type}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/discard_from_collection.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/discard_from_collection.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,91 @@
+import csv, argparse, sys, os
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+COLLECTION_TYPES = {'VesselsCollection' : 'VesselsCollectionItem',
+ 'DataCollection' : 'DataCollectionItem'}
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='remove elements from a Vessels or Data Collection')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logger level', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('-I', '--ifile', type=str, required=True,
+ help='list of collection items that will be removed')
+ parser.add_argument('--collection_type', type=str, required=True,
+ choices=COLLECTION_TYPES.keys(),
+ help='type of the collection')
+ parser.add_argument('--collection_label', type=str, required=True,
+ help='label of the collection')
+
+ return parser
+
+def load_collection(coll_type, coll_label, kb):
+ query = 'SELECT coll FROM %s coll WHERE coll.label = :coll_label' % coll_type
+ coll = kb.find_all_by_query(query, {'coll_label' : coll_label})
+ return coll[0] if len(coll) > 0 else None
+
+def load_collection_items(collection, coll_type, kb):
+ if COLLECTION_TYPES[coll_type] == 'VesselsCollectionItem':
+ citems = kb.get_vessels_collection_items(collection)
+ elif COLLECTION_TYPES[coll_type] == 'DataCollectionItem':
+ citems = kb.get_data_collection_items(collection)
+ else:
+ raise ValueError('Unknown data collection type %s' % COLLECTION_TYPES[coll_type])
+ ci_map = {}
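+    # key the items by their VID so rows from the input file can be
+    # matched directly against the collection's content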
+ for ci in citems:
+ ci_map[ci.id] = ci
+ return ci_map
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('discard_from_collection', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ logger.info('Loading collection %s from %s' % (args.collection_label,
+ args.collection_type))
+ coll = load_collection(args.collection_type, args.collection_label, kb)
+ if not coll:
+ msg = 'No %s found with label %s' % (args.collection_type,
+ args.collection_label)
+ logger.error(msg)
+ sys.exit(msg)
+ logger.info('Loading items from collection')
+ coll_items = load_collection_items(coll, args.collection_type, kb)
+ logger.info('Fetched %d elements' % len(coll_items))
+
+ with open(args.ifile) as infile:
+ reader = csv.DictReader(infile, delimiter='\t')
+ to_be_deleted = [row['collection_item'] for row in reader]
+ logger.info('Found %d items to be deleted' % len(to_be_deleted))
+
+ for tbd in to_be_deleted:
+ try:
+ kb.delete(coll_items[tbd])
+ logger.info('%s with ID %s deleted' % (COLLECTION_TYPES[args.collection_type],
+ tbd))
+ except KeyError, ke:
+ logger.warning('No %s related to ID %s' % (COLLECTION_TYPES[args.collection_type],
+ ke))
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/discard_from_collection.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/discard_from_collection.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,79 @@
+
+ Discard input elements from the selected collection
+
+ launcher.sh
+ --interpreter=python
+ --runner=discard_from_collection.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --ifile=${ifile}
+ #if str($collection_selector.collection_type) != 'no_coll_selected'
+ --collection_type=${collection_selector.collection_type}
+ #if str($collection_selector.collection_type) == 'DataCollection'
+ #if str($collection_selector.dcoll_label) != 'no_label_selected'
+ --collection_label=${collection_selector.dcoll_label}
+ #end if
+ #elif str($collection_selector.collection_type) == 'VesselsCollection'
+ #if str($collection_selector.vcoll_label) != 'no_label_selected'
+ --collection_label=${collection_selector.vcoll_label}
+ #end if
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool discards one or more items from a DataCollection or a
+VesselsCollection.
+
+The expected input file looks like
+
++---------------+
+|collection_item|
++---------------+
+|V013AFF22311 |
++---------------+
+|V0ABB3451516 |
++---------------+
+|V012441AAEEC |
++---------------+
+
+Input file rows must be VIDs obtained using the **map_vid** tool.
+
+The collection must be selected using the dedicated selection lists,
+which only show the collections already imported into the system.
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/drop_parental_info.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/drop_parental_info.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,69 @@
+import sys, csv, argparse, os
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='set parents of the selected individuals to None')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='list of the individuals')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file')
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('drop_parental_info', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Retrieving individuals')
+ inds = kb.get_objects(kb.Individual)
+ logger.info('Retrieved %d individuals' % len(inds))
+ inds_lookup = {}
+ for i in inds:
+ inds_lookup[i.id] = i
+
+ with open(args.in_file) as in_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ records = []
+ for row in reader:
+ try:
+                # Mapping 'individual' to inds_lookup[row['individual']].id
+                # is redundant, but it is a useful check that filters
+                # out wrong VIDs
+ record = {'individual' : inds_lookup[row['individual']].id,
+ 'father' : 'None',
+ 'mother' : 'None'}
+ records.append(record)
+ except KeyError, ke:
+ logger.warning('Individual with VID %s does not exist, skipping line' % ke)
+
+ with open(args.out_file, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ['individual', 'father', 'mother'],
+ delimiter = '\t')
+ writer.writeheader()
+ writer.writerows(records)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/drop_parental_info.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/drop_parental_info.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,47 @@
+
+
+ Remove parental info of individuals
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=drop_parental_info.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --in_file=${input1}
+ --out_file=${output1}
+ --logfile=${logfile}
+
+
+
+
+
+
+
+
+
+
+
+
+It removes the parental info of individuals, using information from a file like this::
+
+ individual
+ V08E18411BC66F4987BCA43EFC6F636224
+
+and build a tsv file like this::
+
+ individual father mother
+ V08E18411BC66F4987BCA43EFC6F636224 None None
+
+-----
+
+.. class:: warningmark
+
+Note that Galaxy does not recognize a TSV file with a single column as a tabular file, so it needs to be converted by hand.
+
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/launcher.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/launcher.sh Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+CMD=""
+PYTH_PATH="PYTHONPATH=/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/p\
+ython2.7/site-packages/:/SHARE/USERFS/els7/users/biobank/lib/"
+runner="$(dirname ${BASH_SOURCE[0]})/"
+until [ -z $1 ]
+ do
+
+ opt_host='--host='
+ opt_user='--user='
+ opt_passwd='--passwd='
+ opt_interpreter='--interpreter='
+ opt_runner='--runner='
+ if [[ $1 == $opt_host* ]]; then
+ host=`echo $1 | cut -d '=' -f2 | cut -d '.' -f1`
+ if [ -z $host -o $host == 'None' ]; then
+ echo 'ERROR. Missing omero host parameter. Please, set Omero Host in your user preferences' > /dev/null >&2
+ exit -1
+ fi
+ PYTH_PATH+=$host
+ HOST=`echo $1 | cut -d '=' -f2`
+ CMD+=' '$1
+ elif [[ $1 == $opt_user* ]]; then
+ user=`echo $1 | cut -d '=' -f2`
+ if [ -z $user -o $user == 'None' ]; then
+ echo 'ERROR. Missing omero user parameter. Please, set Omero User in your user preferences' > /dev/null >&2
+ exit -1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_passwd* ]]; then
+ passwd=`echo $1 | cut -d '=' -f2`
+ if [ -z $passwd -o $passwd == 'None' ]; then
+ echo 'ERROR. Missing omero password parameter. Please, set Omero Password in your user preferences' > /dev/null >&2
+ exit -1
+ fi
+ CMD+=' '$1
+ elif [[ $1 == $opt_runner* ]]; then
+ runner+=`echo $1 | cut -d '=' -f2`
+ elif [[ $1 == $opt_interpreter* ]]; then
+ interpreter=`echo $1 | cut -d '=' -f2`
+ else
+ CMD+=' '$1
+ fi
+ shift
+done
+export $PYTH_PATH/:$PYTHONPATH
+profile="/SHARE/USERFS/els7/users/biobank/lib/${HOST}.biobank.profile"
+if [ -f $profile ]; then
+ source $profile
+ CMD=$interpreter' '$runner$CMD
+ $CMD
+else
+ echo "ERROR. Biobank profile file doesn't exist. Please, check Omero Host in your user preferences" > /dev/null >&2
+ exit -1
+fi
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/merge_individuals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/merge_individuals.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,223 @@
+#=======================================
+# This tool moves all information related to an individual (source) to
+# another (target). Moved information is:
+# * children (Individual objects)
+# * ActionOnIndividual
+# * Enrollments
+# * EHR records
+#
+# The tool expects as input a TSV file like this
+# source target
+# V0468D2D96999548BF9FC6AD24C055E038 V060BAA01C662240D181BB98A51885C498
+# V029CC0A614E2D42D0837602B15193EB58 V01B8122A7C75A452E9F80381CEA988557
+# V0B20C93E8A88D43EFB87A7E6911292A05 V0BED85E8E76A54AA7AB0AFB09F95798A8
+# ...
+#
+# NOTE WELL:
+# * Parents of the "source" individual WILL NOT BE ASSIGNED
+# to the "target" individual
+# * For the Enrollment objects, if the
+# "target" individual already has a code in the same study as the "source"
+# individual, the script will try to move the Enrollment to the
+# "duplicated" study (this will be fixed when proper ALIAS
+# management is introduced)
+# =======================================
+
+import sys, argparse, csv, time, json, os
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.kb import KBError
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='merge informations related to an individual ("source") to another one ("target")')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices = LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('-O', '--operator', type=str, help='operator',
+ required=True)
+ parser.add_argument('--in_file', type=str, required = True,
+ help='input TSV file')
+ return parser
+
+
+def update_object(obj, backup_values, operator, kb, logger):
+ logger.debug('Building ActionOnAction for object %s::%s' %
+ (obj.get_ome_table(),
+ obj.id)
+ )
+    act_setup = build_action_setup('merge-individuals-%f' % time.time(),
+                                   backup_values, kb, logger)
+ aoa_conf = {
+ 'setup': act_setup,
+ 'actionCategory' : kb.ActionCategory.UPDATE,
+ 'operator': operator,
+ 'target': obj.lastUpdate if obj.lastUpdate else obj.action,
+ 'context': obj.action.context
+ }
+ logger.debug('Updating object with new ActionOnAction')
+ obj.lastUpdate = kb.factory.create(kb.ActionOnAction, aoa_conf)
+
+
+def build_action_setup(label, backup, kb, logger):
+ logger.debug('Creating a new ActionSetup with label %s and backup %r' % (label, backup))
+ conf = {
+ 'label': label,
+ 'conf': json.dumps({'backup' : backup})
+ }
+ asetup = kb.factory.create(kb.ActionSetup, conf)
+ return asetup
+
+
+def update_children(source_ind, target_ind, operator, kb, logger):
+ if source_ind.gender.enum_label() == kb.Gender.MALE.enum_label():
+ parent_type = 'father'
+ elif source_ind.gender.enum_label() == kb.Gender.FEMALE.enum_label():
+ parent_type = 'mother'
+ else:
+ raise ValueError('%s is not a valid gender value' % (source_ind.gender.enum_label()))
+ query = '''
+ SELECT ind FROM Individual ind
+ JOIN ind.{0} AS {0}
+ WHERE {0}.vid = :parent_vid
+ '''.format(parent_type)
+ children = kb.find_all_by_query(query, {'parent_vid' : source_ind.id})
+ logger.info('Retrieved %d children for source individual' % len(children))
+ for child in children:
+ backup = {}
+ logger.debug('Changing %s for individual %s' % (parent_type,
+ child.id))
+ backup[parent_type] = getattr(child, parent_type).id
+ setattr(child, parent_type, target_ind)
+        update_object(child, backup, operator, kb, logger)
+ kb.save_array(children)
+
+
+def update_action_on_ind(source_ind, target_ind, operator, kb, logger):
+ query = '''SELECT act FROM ActionOnIndividual act
+ JOIN act.target AS ind
+ WHERE ind.vid = :ind_vid
+ '''
+ src_acts = kb.find_all_by_query(query, {'ind_vid' : source_ind.id})
+ logger.info('Retrieved %d actions for source individual' % len(src_acts))
+ connected = kb.dt.get_connected(source_ind, direction=kb.dt.DIRECTION_OUTGOING,
+ query_depth=1)
+ if source_ind in connected:
+ connected.remove(source_ind)
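+    # re-target the source individual's actions, then re-route the graph
+    # edges so that they originate from the target individual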
+ for sa in src_acts:
+ logger.debug('Changing target for action %s' % sa.id)
+ sa.target = target_ind
+ logger.debug('Action %s target updated' % sa.id)
+ kb.save_array(src_acts)
+ for conn in connected:
+ kb.dt.destroy_edge(source_ind, conn)
+ kb.dt.create_edge(conn.action, target_ind, conn)
+
+
+def update_enrollments(source_ind, target_ind, operator, kb, logger):
+ query = '''SELECT en FROM Enrollment en
+ JOIN en.individual AS ind
+ WHERE ind.vid = :ind_vid
+ '''
+ enrolls = kb.find_all_by_query(query, {'ind_vid' : source_ind.id})
+ logger.info('Retrieved %d enrollments for source individual' % len(enrolls))
+ for sren in enrolls:
+ try:
+ sren.individual = target_ind
+ logger.debug('Changing individual for enrollment %s in study %s' % (sren.studyCode,
+ sren.study.label))
+ kb.save(sren)
+ logger.info('Changed individual for enrollment %s (study code %s -- study %s)' % (sren.id,
+ sren.studyCode,
+ sren.study.label))
+ except KBError, kbe:
+ logger.warning('Unable to update enrollment %s (study code %s -- study %s)' % (sren.id,
+ sren.studyCode,
+ sren.study.label))
+ move_to_duplicated(sren, operator, kb, logger)
+
+
+def update_ehr_records(source_ind, target_ind, kb):
+ kb.update_table_rows(kb.eadpt.EAV_EHR_TABLE, '(i_vid == "%s")' % source_ind.id,
+ {'i_vid' : target_ind.id})
+
+
+# This function should be considered a temporary hack, to be used
+# until proper ALIAS management is introduced into the
+# system
+def move_to_duplicated(enrollment, operator, kb, logger):
+ old_st = enrollment.study
+ dupl_st = kb.get_study('%s_DUPLICATI' % old_st.label)
+ if not dupl_st:
+ logger.warning('No "duplicated" study ({0}_DUPLICATI) found for study {0}'.format(old_st.label))
+ return
+ enrollment.study = dupl_st
+ try:
+ kb.save(enrollment)
+        logger.info('Enrollment %s moved from study %s to study %s' % (enrollment.studyCode,
+ old_st.label, dupl_st.label))
+ except:
+ logger.error('An error occurred while moving enrollment %s from study %s to %s' % (enrollment.studyCode,
+ old_st.label,
+ dupl_st.label))
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('merge_individuals', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.debug('Retrieving Individuals')
+ individuals = kb.get_objects(kb.Individual)
+ logger.debug('Retrieved %d Individuals' % len(individuals))
+ ind_lookup = {}
+ for i in individuals:
+ ind_lookup[i.id] = i
+
+ with open(args.in_file) as in_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ for row in reader:
+ try:
+ source = ind_lookup[row['source']]
+ logger.info('Selected as source individual with ID %s' % source.id)
+ target = ind_lookup[row['target']]
+ logger.info('Selected as destination individual with ID %s' % target.id)
+ except KeyError, ke:
+ logger.warning('Unable to retrieve individual with ID %s, skipping row' % ke)
+ continue
+
+ logger.info('Updating children connected to source individual')
+ update_children(source, target, args.operator, kb, logger)
+ logger.info('Children update complete')
+
+ logger.info('Updating ActionOnIndividual related to source individual')
+ update_action_on_ind(source, target, args.operator, kb, logger)
+ logger.info('ActionOnIndividual update completed')
+
+ logger.info('Updating enrollments related to source individual')
+ update_enrollments(source, target, args.operator, kb, logger)
+ logger.info('Enrollments update completed')
+
+ logger.info('Updating EHR records related to source individual')
+ update_ehr_records(source, target, kb)
+ logger.info('EHR records update completed')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/merge_individuals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/merge_individuals.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,52 @@
+
+
+ Merge individuals' data
+
+
+ launcher.sh
+ --interpreter=python
+    --runner=merge_individuals.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+    --in_file=${ifile}
+
+
+
+
+
+
+
+
+
+
+
+
+This tool moves all information related to an individual (source) to
+another (target). Moved information is:
+
+ * children (Individual objects)
+ * ActionOnIndividual
+ * Enrollments
+ * EHR records
+
+The tool expects as input a TSV file like this::
+
+ source target
+ V0468D2D96999548BF9FC6AD24C055E038 V060BAA01C662240D181BB98A51885C498
+ V029CC0A614E2D42D0837602B15193EB58 V01B8122A7C75A452E9F80381CEA988557
+ V0B20C93E8A88D43EFB87A7E6911292A05 V0BED85E8E76A54AA7AB0AFB09F95798A8
+ ...
+
+NOTE WELL:
+ * Parents of the "source" individual WILL NOT BE ASSIGNED
+ to the "target" individual
+ * For the Enrollment objects, if the
+ "target" individual already has a code in the same study as the "source"
+ individual, the script will try to move the Enrollment to the
+ "duplicated" study
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/update_parents.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/update_parents.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,110 @@
+import sys, csv, argparse, time, json
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='update parents')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('-O', '--operator', type=str, help='operator',
+ required=True)
+ parser.add_argument('--in_file', type=str, required=True,
+ help='input file with individual, father and mother')
+ return parser
+
+
+def update_parents(individual, father, mother, operator, kb, logger):
+ backup = {}
+ logger.info('Updating parents for individual %s', individual.id)
+ if individual.father != father:
+ backup['father'] = individual.father.id if individual.father else None
+ logger.info('Setting father to %s (old value %s)' % (father.id if father else None,
+ backup['father']))
+ individual.father = father
+ if individual.mother != mother:
+ backup['mother'] = individual.mother.id if individual.mother else None
+ logger.info('Setting mother to %s (old value %s)' % (mother.id if mother else None,
+ backup['mother']))
+ individual.mother = mother
+ if len(backup.items()) > 0:
+ update_object(individual, backup, operator, kb, logger)
+ return individual
+ else:
+ logger.info('No update needed for individual %s' % individual.id)
+ return None
+
+
+def update_object(obj, backup_values, operator, kb, logger):
+ logger.debug('Building ActionOnAction for object %s' % obj.id)
+ act_setup = build_action_setup('update-parents-%f' % time.time(),
+ backup_values, kb, logger)
+ aoa_conf = {
+ 'setup': act_setup,
+ 'actionCategory': kb.ActionCategory.UPDATE,
+ 'operator': operator,
+ 'target': obj.lastUpdate if obj.lastUpdate else obj.action,
+ 'context': obj.action.context
+ }
+ logger.debug('Updating object with new ActionOnAction')
+ obj.lastUpdate = kb.factory.create(kb.ActionOnAction, aoa_conf)
+
+
+def build_action_setup(label, backup, kb, logger):
+ logger.debug('Creating a new ActionSetup with label %s and backup %r' % (label,
+ backup))
+ conf = {
+ 'label': label,
+ 'conf': json.dumps({'backup': backup})
+ }
+ asetup = kb.factory.create(kb.ActionSetup, conf)
+ return asetup
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('update_parents', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Retrieving individuals')
+ inds = kb.get_objects(kb.Individual)
+ logger.info('Retrieved %d individuals' % len(inds))
+ inds_lookup = {}
+ for i in inds:
+ inds_lookup[i.id] = i
+
+ with open(args.in_file) as in_file:
+ to_be_updated = []
+ reader = csv.DictReader(in_file, delimiter='\t')
+ for row in reader:
+ ind = inds_lookup[row['individual']]
+ father = inds_lookup[row['father']] if row['father'] != 'None' else None
+ mother = inds_lookup[row['mother']] if row['mother'] != 'None' else None
+ ind = update_parents(ind, father, mother, args.operator, kb, logger)
+ if ind:
+ to_be_updated.append(ind)
+
+ logger.info('%d individuals are going to be updated' % len(to_be_updated))
+ kb.save_array(to_be_updated)
+ logger.info('Update complete')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/updater/update_parents_data.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/updater/update_parents_data.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,35 @@
+
+
+ Update parental info of individuals
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=update_parents.py
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+ --operator=$__user_email__
+ --logfile=${logfile}
+ --in_file=${input1}
+
+
+
+
+
+
+
+
+
+
+
+It will update parental info of individual using informations from a file like this::
+
+ individual father mother
+ V4C5363 V0A3AC5 V0CF6C8
+ V0EE642 V0A3AC5 V0CF6C8
+ V027BA1 V0DE514 V0C3A91
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/build_enrollments_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/build_enrollments_import.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,187 @@
+"""
+Prepare a TSV to be imported, with a study code for each individual not
+already present in a specified study.
+
+A second study can also be specified: every individual enrolled in it
+will be ignored.
+
+The report file contains the enrollment codes in the other studies.
+
+Codes are short hashes of numbers, generated using Hashids (hashids.org)
+with the study label as the salt parameter.
+
+ex:
+source study label
+V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5
+V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N
+V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV
+...
+
+"""
+
+import argparse
+import csv
+import string
+import sys
+
+from hashids import Hashids
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.utils import LOG_LEVELS, get_logger
+import bl.vl.utils.ome_utils as vlu
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='Retrieve all individuals not enrolled in the specified project')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logger level', default='INFO')
+ parser.add_argument('--study', type=str, help='Study label', required=True)
+ parser.add_argument('--study_to_be_ignored', type=str,
+ help='Study label to be ignored')
+ parser.add_argument('--host', type=str, help='Omero hostname')
+ parser.add_argument('--user', type=str, help='Omero user')
+ parser.add_argument('--passwd', type=str, help='Omero password')
+ parser.add_argument('--ofile', type=str, help='output file path',
+ required=True)
+ parser.add_argument('--reportfile', type=str, help='report file',
+ default='report.tsv')
+ return parser
+
+
+def init_hashids(study):
+ hashids = Hashids(salt=study, min_length=9,
+ alphabet=string.ascii_uppercase + string.digits)
+ return hashids
+
+
+def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
+ highest_id=0):
+ csv_header = ['source', 'study', 'label']
+ study_id = highest_id
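+    # ids continue from the highest value already in use, so newly
+    # generated Hashids labels cannot collide with existing codes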
+
+ # Write to CSV file
+ logger.debug('Writing CSV file %s' % path)
+ with open(path, 'w') as f:
+ writer = csv.DictWriter(f, csv_header,
+ delimiter='\t', quotechar='"',
+ restval='None')
+ writer.writeheader()
+ for k, v in inds_map.iteritems():
+ study_id += 1
+ v['label'] = hashids.encrypt(study_id)
+ writer.writerow(v)
+ return
+
+
+def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
+ logger.debug('Writing CSV file %s' % filename)
+ with open(filename, 'w') as f:
+ writer = csv.DictWriter(f, csv_header,
+ delimiter='\t', quotechar='"',
+ restval='None')
+ writer.writeheader()
+ for k, v in enrolls_map.iteritems():
+ writer.writerow(v)
+ return
+
+
+def get_enrollments_codes(logger, kb, inds_map):
+ """Retrieve enrollments codes in other studies for the individuals
+ to be enrolled into the specified study"""
+ # Retrieve all studies from omero
+ studies = kb.get_objects(kb.Study)
+ logger.info('Retrieved %d studies from database' % len(studies))
+
+ csv_header = ['individual_uuid']
+ enrolls_map = {}
+ # For each study, retrieve all enrollments
+ for s in studies:
+ logger.info('Retrieving enrollments for study %s' % s.label)
+ enrolls = kb.get_enrolled(s)
+ logger.info('%s enrollments retrieved' % len(enrolls))
+ if len(enrolls) > 0:
+ logger.debug('Building lookup dictionary....')
+ csv_header.append(s.label) # Add study label to CSV header
+ for e in enrolls:
+ if e.individual.id in inds_map:
+ enrolls_map.setdefault(e.individual.omero_id,
+ {})['individual_uuid'] = e.individual.id
+ enrolls_map[e.individual.omero_id][s.label] = e.studyCode
+ else:
+ logger.debug('No enrollments found, skip study %s' % s.label)
+
+ return csv_header, enrolls_map
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('inds_not_enrolled', level=args.loglevel,
+ filename=args.logfile)
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ out_file_path = args.ofile
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ inds = kb.get_objects(kb.Individual)
+ logger.info('Retrieved {} individuals'.format(len(inds)))
+
+ inds_map = {}
+
+ for i in inds:
+ inds_map.setdefault(i.id, {})['source'] = i.id
+ inds_map[i.id]['study'] = args.study
+
+ study = kb.get_by_label(kb.Study, args.study)
+ if study:
+ logger.info('{} present in the database'.format(study.label))
+ else:
+ logger.critical('{} not present in the database'.format(args.study))
+ sys.exit()
+
+ hashids = init_hashids(study.label)
+ enrolls = kb.get_enrolled(study)
+    logger.info("{} enrollments found in {}".format(len(enrolls),
+ study.label))
+ highest_id = 0
+
+    for e in enrolls:
+        if e.individual.id in inds_map:
+            del inds_map[e.individual.id]
+        # decrypt returns a tuple of numeric ids; track the highest one so
+        # that new codes can be generated starting from it
+        decoded = hashids.decrypt(e.studyCode)
+        if decoded and decoded[0] > highest_id:
+            highest_id = decoded[0]
+
+ if args.study_to_be_ignored and kb.get_by_label(kb.Study,
+ args.study_to_be_ignored):
+ to_be_removed = [args.study_to_be_ignored]
+ else:
+ to_be_removed = []
+
+ for tbr_study in to_be_removed:
+ enr = kb.get_enrolled(kb.get_by_label(kb.Study, tbr_study))
+ logger.info('Retrieved {} enrollments from {}'.format(len(enr),
+ tbr_study))
+ for e in enr:
+ if e.individual.id in inds_map:
+ del inds_map[e.individual.id]
+
+ logger.info('{} individuals to be enrolled'.format(len(inds_map)))
+
+ write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map, highest_id)
+
+ csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
+ write_csv_enrollment_codes(logger, args.reportfile, csv_header, enrolls_map)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/delete_flowcell_results.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/delete_flowcell_results.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,97 @@
+import argparse, sys
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.utils import get_logger, LOG_LEVELS
+import bl.vl.utils.ome_utils as vlu
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='remove datasamples connected to a specific sample of a flowcell')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero server hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--flowcell-label', type=str, required=True,
+ help='flowcell label')
+ parser.add_argument('--sample-label', type=str, required=True,
+ help='sample label')
+ parser.add_argument('--dry-run', action='store_true',
+ help='run a simulation, don\'t delete anything')
+ return parser
+
+
+def get_flowcell_samples_map(flowcell, kb, logger):
+ fc = kb.get_by_label(kb.FlowCell, flowcell)
+ if not fc:
+ logger.info('No flowcell with label %s', flowcell)
+ sys.exit(0)
+ logger.info('Loading data for flowcell %s', flowcell)
+ dsamples = kb.dt.get_connected(fc, kb.SeqDataSample)
+ dsamples_map = {}
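+    # group the flowcell's SeqDataSamples by the label of their source
+    # sample; datasamples without a sample go under the 'NO_SAMPLE' key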
+ for ds in dsamples:
+ if ds.sample:
+ dsamples_map.setdefault(ds.sample.label, []).append(ds)
+ else:
+ dsamples_map.setdefault('NO_SAMPLE', []).append(ds)
+ return dsamples_map
+
+
+def print_report(dsamples_map, sample_label, kb, logger):
+ dsamples = dsamples_map.get(sample_label)
+ if not dsamples:
+ logger.info('No sample with label %s is related to the flowcell', sample_label)
+ sys.exit(0)
+ for ds in dsamples:
+ dobjs = kb.get_data_objects(ds)
+ logger.info('## data sample: %s', ds.label)
+ for dob in dobjs:
+ logger.info('### data object: %s --- mimetype: %s', dob.path, dob.mimetype)
+
+
+def delete(dsamples_map, sample_label, kb, logger):
+ for ds in dsamples_map[sample_label]:
+ # this is a hack specific for the automator workflow
+ if not ds.label.startswith('stage1'):
+ logger.info('Deleting data for %s', ds.label)
+ dobjs = kb.get_data_objects(ds)
+ for d in dobjs:
+ kb.delete(d)
+ a = ds.action
+ kb.delete(ds)
+ try:
+ kb.delete(a)
+ except:
+ pass
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('delete_flowcell_results', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ dsamples_map = get_flowcell_samples_map(args.flowcell_label, kb, logger)
+ print_report(dsamples_map, args.sample_label, kb, logger)
+ if not args.dry_run:
+        delete(dsamples_map, args.sample_label, kb, logger)
+ else:
+ logger.debug('SIMULATION, exit now')
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/delete_flowcell_results.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/delete_flowcell_results.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,36 @@
+
+
+
+ Delete results produced from a flowcell related to a specific sample
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=delete_flowcell_results.py
+ --logfile=${log_file}
+ --flowcell-label=${flowcell_label}
+ --sample-label=${sample_label}
+ #if $simulate
+ --dry-run
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+Delete data samples and related data objects produced from the given flowcell and related to the given sample.
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/drop_flowcell_related_items.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/drop_flowcell_related_items.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,124 @@
+import argparse, sys
+from collections import Counter
+
+from bl.vl.kb import KnowledgeBase as KB
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import get_logger, LOG_LEVELS
+from bl.vl.kb.drivers.omero.sequencing import SeqDataSample, SequencerOutput
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='delete all items related to the given flowcell')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='omero hostname')
+ parser.add_argument('-U', '--user', type=str, help='omero user')
+ parser.add_argument('-P', '--passwd', type=str, help='omero password')
+ parser.add_argument('--flowcell-label', type=str, required=True,
+ help='flowcell label')
+ parser.add_argument('--datasets-only', action='store_true',
+ help='delete only data samples and data objects related to the given flowcell')
+ return parser
+
+
+def get_sources(objs):
+ sources = set()
+ for x in objs:
+ try:
+ sources.add(x.action.target)
+ except AttributeError:
+ # Action has no "target" attribute, no source for item x
+ pass
+ return sources
+
+
+def delete_objects(objs, kb, logger, max_retries = 3):
+ retry_ct = Counter()
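+    # objects that cannot be deleted yet (e.g., still referenced by other
+    # objects) are re-queued and retried up to max_retries times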
+ while len(objs) > 0:
+ o = objs.pop(0)
+ if type(o) in [SeqDataSample, SequencerOutput]:
+ logger.info('Loading DataObjects for %s:%s' % (o.__class__.__name__,
+ o.label))
+ dobjs = kb.get_data_objects(o)
+ logger.info('%d DataObjects loaded' % len(dobjs))
+ for d in dobjs:
+ logger.info('Deleting %s:%s' % (d.__class__.__name__,
+ d.path))
+ kb.delete(d)
+ try:
+ logger.info('Deleting %s:%s' % (o.__class__.__name__,
+ o.id))
+ act = o.action
+ kb.delete(o)
+ try:
+ logger.info('Deleting source action %s:%s' % (act.__class__.__name__,
+ act.id))
+ kb.delete(act)
+ except:
+ logger.info('Can\'t delete action')
+ except:
+ logger.info('Can\'t delete, putting back into objects list')
+ if retry_ct['%s:%s' % (type(o), o.id)] < max_retries:
+ objs.append(o)
+ retry_ct['%s:%s' % (type(o), o.id)] += 1
+ else:
+ logger.info('Reached maximum retry limit for the object, skipping')
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('drop_flowcell_related_items', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ logger.info('Retrieving flowcell with label %s' % args.flowcell_label)
+ query = 'SELECT fc FROM FlowCell fc WHERE fc.label = :fc_label'
+ results = kb.find_all_by_query(query, {'fc_label' : args.flowcell_label})
+ if len(results) == 0:
+ logger.info('No FlowCell with label %s in the database' % args.flowcell_label)
+ sys.exit(0)
+ fc = results[0]
+ logger.info('Loaded FlowCell with ID %s' % fc.id)
+ logger.info('Loading related Lanes')
+ lanes = list(kb.get_lanes_by_flowcell(fc))
+ logger.info('%d Lanes loaded' % len(lanes))
+ logger.info('Loading related LaneSlots')
+ lane_slots = []
+ for l in lanes:
+ lane_slots += list(kb.get_laneslots_by_lane(l))
+ logger.info('%d LaneSlots loaded' % len(lane_slots))
+ logger.info('Loading related Tubes')
+ sub_samples = get_sources(lane_slots)
+ samples = get_sources(sub_samples)
+ logger.info('%d Tubes loaded' % (len(sub_samples) + len(samples)))
+ logger.info('Loading related SequencerOutputs')
+ seq_out = kb.dt.get_connected(fc, kb.SequencerOutput, kb.dt.DIRECTION_OUTGOING)
+ logger.info('%d SequencerOutputs loaded' % len(seq_out))
+ logger.info('Loading related SeqDataSamples')
+ seq_dsamples = kb.dt.get_connected(fc, kb.SeqDataSample, kb.dt.DIRECTION_OUTGOING)
+ logger.info('%d SeqDataSamples loaded' % len(seq_dsamples))
+
+ if args.datasets_only:
+ delete_items = [seq_dsamples]
+ else:
+ delete_items = [seq_dsamples, seq_out, lane_slots, lanes,
+ [fc], list(sub_samples), list(samples)]
+ for items in delete_items:
+ delete_objects(items, kb, logger)
+
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/format_vessels_by_individual_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/format_vessels_by_individual_output.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,91 @@
+# This tool formats output files from kb_query vessels_by_individual
+# into a tabular format, with all data related to an individual grouped
+# in each row. The tool needs as input a mapping file like
+#
+# individual_id label
+# V12311 A_STUDY:A_CODE
+# V135115 A_STUDY:B_CODE
+#
+# in order to use a known label and not VIDs for each row
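+#
+# The output (a sketch; vessel labels are illustrative) has one row per
+# individual, with as many vessel_N columns as the longest vessel list:
+#
+# individual_label  vessel_1  vessel_2
+# A_STUDY:A_CODE    VESSEL_X  VESSEL_Y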
+
+import csv, sys, argparse, logging
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='format kb_query vessels_by_individual output file to tabular format')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in_file', type=str, required=True,
+ help='input file (obtained using kb_query vessels by individual tool)')
+ parser.add_argument('--map_file', type=str, required=True,
+ help='mapping file')
+ parser.add_argument('--out_file', type=str, required=True,
+ help='output file')
+ return parser
+
+def get_mapping(records, grouper_field, grouped_field):
+ mapping = {}
+ for rec in records:
+ mapping.setdefault(rec[grouper_field], []).append(rec[grouped_field])
+ return mapping
+
+def get_labels_mapping(reader, logger):
+ rows = [r for r in reader]
+ lmap = get_mapping(rows, 'individual', 'label')
+ logger.info('%d labels grouped for %d individuals' % (len(rows),
+ len(lmap)))
+ return lmap
+
+def get_vessels_mapping(reader, logger):
+ rows = [r for r in reader]
+ vmap = get_mapping(rows, 'individual', 'vessel_label')
+ logger.info('%d vessels grouped for %d individuals' % (len(rows),
+ len(vmap)))
+ return vmap
+
+def build_record(label, vessels):
+ record = {'individual_label' : '--'.join(label)}
+    # enumerate instead of list.index: index() always returns the first
+    # occurrence, so duplicate vessel labels would collide
+    for i, v in enumerate(vessels):
+        record['vessel_%d' % (i + 1)] = v
+ return record
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format' : LOG_FORMAT,
+ 'datefmt' : LOG_DATEFMT,
+ 'level' : log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+ logger = logging.getLogger()
+
+ with open(args.map_file) as mf:
+ reader = csv.DictReader(mf, delimiter='\t')
+ labels_map = get_labels_mapping(reader, logger)
+
+ with open(args.in_file) as inf:
+ reader = csv.DictReader(inf, delimiter='\t')
+ vessels_map = get_vessels_mapping(reader, logger)
+
+ max_vessels_count = max([len(v) for v in vessels_map.values()])
+ csv_fields = ['individual_label']
+ for x in xrange(max_vessels_count):
+ csv_fields.append('vessel_%d' % (x+1))
+
+ with open(args.out_file, 'w') as ofile:
+ writer = csv.DictWriter(ofile, csv_fields, delimiter='\t')
+ writer.writeheader()
+ for ind, vessels in vessels_map.iteritems():
+ writer.writerow(build_record(labels_map[ind], vessels))
+
+ logger.info('Job completed')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/format_vessels_by_individual_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/format_vessels_by_individual_output.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,31 @@
+
+
+ Format the output from VLU.vessels_by_individual into a tabular
+ format
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=format_vessels_by_individual_output.py
+ --loglevel=$__app__.config.vl_loglevel
+ --logfile=${logfile}
+ --in_file=${in_file}
+ --map_file=${map_file}
+ --out_file=${out_file}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/from_well_to_illumina_measures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/from_well_to_illumina_measures.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,95 @@
+import sys, argparse, csv
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.utils import get_logger, LOG_LEVELS
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.graph.drivers.neo4j import Neo4JDriver
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='map wells label to illumina bead chip measures')
+    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level (default=INFO)', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='OMERO server hostname')
+ parser.add_argument('-U', '--user', type=str, help='OMERO user')
+ parser.add_argument('-P', '--passwd', type=str, help='OMERO password')
+ parser.add_argument('--in_file', type=str, help='input file',
+ required=True)
+ parser.add_argument('--out_file', type=str, help='output file',
+ required=True)
+ parser.add_argument('--well_column', type=str, help='label of the column that will be mapped',
+ default='source')
+ return parser
+
+
+def get_wells_map(plate, kb, logger):
+ logger.info('Mapping wells for plate %s', plate.label)
+ wells_map = {}
+ for w in kb.get_wells_by_plate(plate):
+ wells_map[w.label] = w
+ logger.info('Mapped %d wells', len(wells_map))
+ return wells_map
+
+
+def get_plates_map(plates_list, kb, logger):
+ logger.info('Loading TiterPlates map')
+ plates_map = {}
+ for pl in kb.get_objects(kb.TiterPlate):
+ if isinstance(pl, kb.TiterPlate) and pl.barcode in plates_list:
+ plates_map[pl.barcode] = get_wells_map(pl, kb, logger)
+ logger.info('Mapped %d plates', len(plates_map))
+ return plates_map
+
+
+def get_connected_illumina_measures(well, kb, logger):
+ logger.debug('Loading connected IlluminaBeadChipMeasures for well %s:%s', well.label,
+ well.container.label)
+ return kb.dt.get_connected(well, kb.IlluminaBeadChipMeasures,
+ direction = Neo4JDriver.DIRECTION_OUTGOING)
+
+
+def wells_to_illumina(in_file, out_file, column_label, kb, logger):
+ with open(in_file) as ifile, open(out_file, 'w') as ofile:
+ reader = csv.DictReader(ifile, delimiter='\t')
+ in_records = [r for r in reader]
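+        # the well column holds PLATE_BARCODE:WELL_LABEL pairs; collect the
+        # barcodes so only the plates actually referenced get mapped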
+ plates_barcodes = set([x[column_label].split(':')[0] for x in in_records])
+ plates_map = get_plates_map(plates_barcodes, kb, logger)
+ writer = csv.DictWriter(ofile, reader.fieldnames, delimiter='\t')
+ writer.writeheader()
+ logger.info('Mapping wells to illumina bead chip measures')
+ for rec in in_records:
+ barcode, well = rec[column_label].split(':')
+ measures = get_connected_illumina_measures(plates_map[barcode][well], kb,
+ logger)
+ if len(measures) != 1:
+ logger.warning('Found %d measures for well %s:%s, skipping line', len(measures),
+ barcode, well)
+ continue
+ rec[column_label] = measures[0].label
+ writer.writerow(rec)
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('from_well_to_illumina_measures', level=args.loglevel,
+ filename=args.logfile)
+
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ kb = KB(driver='omero')(host, user, passwd)
+ wells_to_illumina(args.in_file, args.out_file, args.well_column,
+ kb, logger)
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/from_well_to_illumina_measures.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/from_well_to_illumina_measures.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,34 @@
+
+
+
+ Map well labels to illumina bead chip measures
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=from_well_to_illumina_measures.py
+ --logfile=${logfile}
+ --in_file=${infile}
+ --out_file=${out_file}
+ --well_column=${well_column}
+
+
+
+
+
+
+
+
+
+
+
+
+
+Map a TSV file's column with PlateWell labels in format PLATE_BARCODE:WELL_LABEL to the label of
+the connected IlluminaBeadChipMeasures
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/gdoize_ms.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/gdoize_ms.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,46 @@
+
+
+ Build missing GDOs for the selected markers set
+
+
+ py_protobuff_cpp
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=gdoize_ms
+ --logfile=${logfile}
+ #if str($mset_label) != 'select_one'
+ --markers-set-label=$mset_label
+ #end if
+ #if str($study) != 'select_one'
+ --study-label=$study
+ #end if
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Iterate over all genotype data samples corresponding to the given
+marker set; create a GDO table row for each genotype data sample that
+does not already have one.
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/launcher.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/launcher.sh Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+CMD=""
+PYTH_PATH="PYTHONPATH=/SHARE/USERFS/els7/users/galaxy/develop/usr-cluster/lib/p\
+ython2.7/site-packages/:/SHARE/USERFS/els7/users/biobank/lib/"
+runner="$(dirname ${BASH_SOURCE[0]})/"
+until [ -z $1 ]
+ do
+
+ opt_host='--host='
+ opt_interpreter='--interpreter='
+ opt_runner='--runner='
+ if [[ $1 == $opt_host* ]]; then
+ PYTH_PATH+=`echo $1 | cut -d '=' -f2 | cut -d '.' -f1`
+ CMD+=' '$1
+ elif [[ $1 == $opt_runner* ]]; then
+ runner+=`echo $1 | cut -d '=' -f2`
+ elif [[ $1 == $opt_interpreter* ]]; then
+ interpreter=`echo $1 | cut -d '=' -f2`
+ else
+ CMD+=' '$1
+ fi
+ shift
+done
+export $PYTH_PATH/:$PYTHONPATH
+CMD=$interpreter' '$runner$CMD
+$CMD
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_aligned_seq_dsample_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_aligned_seq_dsample_import.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,76 @@
+import csv, sys, argparse, logging
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='build aligned seq data sample import files')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--study', type=str, required=True,
+ help='study')
+ parser.add_argument('--label', type=str, required=True,
+ help='label')
+ parser.add_argument('--source', type=str, required=True,
+ help='source')
+ parser.add_argument('--device', type=str, required=True,
+ help='device')
+ parser.add_argument('--path', type=str, required=True,
+ help='path')
+ parser.add_argument('--sample', type=str, required=True,
+ help='sample')
+ parser.add_argument('--genome_reference', type=str, required=True,
+ help='genome reference')
+ parser.add_argument('--dsample_ofile', type=str, default='./genome_variations_dsample.tsv',
+ help='output file containing data samples definitions')
+    parser.add_argument('--dobject_ofile', type=str, default='./genome_variations_dobject.tsv',
+                        help='output file containing data object definitions')
+
+ return parser
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format': LOG_FORMAT,
+ 'datefmt': LOG_DATEFMT,
+ 'level': log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+    logger = logging.getLogger('prepare_aligned_seq_dsample_import')
+
+
+ with open(args.dsample_ofile, 'w') as ofile:
+ out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type', 'status', 'device', 'sample', 'genome_reference']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ writer.writerow({'study': args.study,
+ 'label': args.label,
+ 'source': args.source,
+ 'device': args.device,
+ 'source_type' : 'Tube',
+ 'seq_dsample_type': 'AlignedSeqDataSample',
+ 'status': 'USABLE',
+ 'sample' : args.sample,
+ 'genome_reference': args.genome_reference})
+ logger.info('Done writing file %s' % args.dsample_ofile)
+
+ with open(args.dobject_ofile, 'w') as ofile:
+ out_file_header = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ writer.writerow({'study': args.study,
+ 'path': args.path,
+ 'data_sample': args.label,
+ 'mimetype': 'x-vl/bam',
+ 'size': '-1',
+ 'sha1': 'N.A.'})
+ logger.info('Done writing file %s' % args.dobject_ofile)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_aligned_seq_dsample_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_aligned_seq_dsample_import.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,58 @@
+
+
+ Prepare Tabular file to Import Aligned SeqDatasample
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_aligned_seq_dsample_import.py
+ --study=${study}
+ --label=${label}
+ --source=${source}
+ --device=${device}
+ --genome_reference=${genome_reference}
+ --path=${path}
+ --sample=${sample}
+ --logfile=${log_file}
+ --dsample_ofile=${dsample_ofile}
+ --dobject_ofile=${dobject_ofile}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_enrollments_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_enrollments_import.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,97 @@
+"""
+Split a file like::
+
+source enrollment
+V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141
+V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390
+
+into a new TSV file::
+
+source study label
+V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141
+V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390
+
+"""
+
+import sys, argparse, csv
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description='Prepare input files for enrollments import workflow')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in-file', type=str, required=True,
+ help='input TSV file')
+ parser.add_argument('--out-enrollments', type=str, required=True,
+                        help='output file with Enrollments definitions')
+ return parser
+
+
+def get_enrollments_definitions(records, logger):
+ logger.info('Creating enrollment definitions')
+ enr_defs = []
+ for rec in records:
+ try:
+ edef = {}
+ edef['source'] = rec['source']
+ try:
+ edef['study'], edef['label'] = rec['enrollment'].split(':')
+ except ValueError:
+ logger.error('Skipped record %r, wrong label format for %s', rec, rec['enrollment'])
+ continue
+ except KeyError, ke:
+ logger.error('Skipped record %r, missing key %s', rec, ke)
+ continue
+ enr_defs.append(edef)
+ logger.info('Retrieved %d enrollment definitions', len(enr_defs))
+ return enr_defs
+
+
+def main(argv):
+ parser = get_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('prepare_enrollments_import', level=args.loglevel,
+ filename=args.logfile)
+
+ logger.info('Start processing file %s', args.in_file)
+
+ with open(args.in_file) as in_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ records = [row for row in reader]
+ logger.info('Loaded %d records', len(records))
+
+ enrollment_defs = get_enrollments_definitions(records, logger)
+ with open(args.out_enrollments, 'w') as enr_out:
+ enr_writer = csv.DictWriter(enr_out,
+ ['source', 'study', 'label'],
+ delimiter='\t')
+ enr_writer.writeheader()
+ enr_writer.writerows(enrollment_defs)
+
+ logger.info('Job completed')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_enrollments_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_enrollments_import.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,45 @@
+
+
+
+ Prepare input files for enrollments import workflow
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_enrollments_import.py
+ --logfile=${log_file}
+ --in-file=${in_file}
+ --out-enrollments=${enrs_out}
+
+
+
+
+
+
+
+
+
+
+
+
+
+Split a file like::
+
+ source enrollment
+ V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141
+ V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390
+
+into a new TSV file::
+
+ source study label
+ V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141
+ V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_genome_variations_dsample_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_genome_variations_dsample_import.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,75 @@
+import csv, sys, argparse, logging
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='build genome variations import files')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--study', type=str, required=True,
+ help='study')
+ parser.add_argument('--label', type=str, required=True,
+ help='label')
+ parser.add_argument('--source', type=str, required=True,
+ help='source')
+ parser.add_argument('--device', type=str, required=True,
+ help='device')
+ parser.add_argument('--path', type=str, required=True,
+ help='path')
+ parser.add_argument('--genome_reference', type=str, required=True,
+ help='genome reference')
+ parser.add_argument('--dsample_ofile', type=str, default='./genome_variations_dsample.tsv',
+ help='output file containing data samples definitions')
+ parser.add_argument('--dobject_ofile', type=str, default='./genome_variations_dobject.tsv',
+                        help='output file containing data objects definitions')
+
+ return parser
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format': LOG_FORMAT,
+ 'datefmt': LOG_DATEFMT,
+ 'level': log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+    logger = logging.getLogger('prepare_genome_variations_dsample_import')
+
+
+ with open(args.dsample_ofile, 'w') as ofile:
+ out_file_header = ['study', 'label', 'source', 'device', 'device_type', 'source_type','data_sample_type',
+ 'status','genome_reference']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ writer.writerow({'study': args.study,
+ 'label': args.label,
+ 'source': args.source,
+ 'device': args.device,
+ 'device_type': 'Device',
+ 'source_type' : 'Tube',
+ 'data_sample_type': 'GenomeVariationsDataSample',
+ 'status': 'USABLE',
+ 'genome_reference': args.genome_reference})
+ logger.info('Done writing file %s' % args.dsample_ofile)
+
+ with open(args.dobject_ofile, 'w') as ofile:
+ out_file_header = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ writer.writerow({'study': args.study,
+ 'path': args.path,
+ 'data_sample': args.label,
+ 'mimetype': 'x-vl/vcf',
+ 'size': '-1',
+ 'sha1': 'N.A.'})
+ logger.info('Done writing file %s' % args.dobject_ofile)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_genome_variations_dsample_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_genome_variations_dsample_import.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,55 @@
+
+
+ Prepare tabular files to import a GenomeVariationsDataSample
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_genome_variations_dsample_import.py
+ --study=${study}
+ --label=${label}
+ --source=${source}
+ --device=${device}
+ --genome_reference=${genome_reference}
+ --path=${path}
+ --logfile=${log_file}
+ --dsample_ofile=${dsample_ofile}
+ --dobject_ofile=${dobject_ofile}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_illumina_import_inputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_illumina_import_inputs.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,159 @@
+from bl.core.io.illumina import GenomeStudioSampleSheetReader as gsr
+from bl.vl.utils import LOG_LEVELS, get_logger
+import csv, argparse, sys, re
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='Split a GenomeStudio samplesheet into TSV files to import data within OMERO')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--input-file', type=str, required=True,
+ help='GenomeStudio samplesheet')
+ parser.add_argument('--arrays-out-file', type=str,
+ help='output file containing IlluminaArrayOfArrays definitions',
+ default='./array_of_arrays.tsv')
+ parser.add_argument('--bead-chip-out-file', type=str,
+ help='output file containing IlluminaBeadChipArray definitions',
+ default='./bead_chip.tsv')
+ parser.add_argument('--array-measure-out-file', type=str,
+ help='output file containing IlluminaBeadChipMeasure definitions',
+ default='./array_measure.tsv')
+ parser.add_argument('--array-measures-out-file', type=str,
+ help='output file containing IlluminaBeadChipMeasures definitions',
+ default='./array_measures.tsv')
+ parser.add_argument('--study', type=str, required=True,
+ help='Study label that will be used in the import procedure')
+ return parser
+
+
+def get_assay_type_enum(manifest_file):
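+    # For example, a manifest name like 'HumanExome-12v1 A.bpm' (hypothetical)
+    # would be mapped to the enum label 'HUMANEXOME_12V1_A'.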
+ return manifest_file.strip().replace('.bpm', '').replace('-', '_').replace(' ', '_').upper()
+
+
+def prepare_array_of_arrays_input(barcode, study, elements):
+ ICHIPCORDS_PATTERN = re.compile(r'^r(\d{2})c(\d{2})$', re.IGNORECASE)
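+    # e.g. an array label like 'R01C02' yields row 1, column 2 (case-insensitive)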
+ rows = []
+ cols = []
+ for x in elements:
+ m = re.match(ICHIPCORDS_PATTERN, x['array_label'])
+ rows.append(int(m.groups()[0]))
+ cols.append(int(m.groups()[1]))
+ return {
+ 'barcode': barcode,
+ 'rows': max(rows),
+ 'columns': max(cols),
+ 'label': barcode,
+ 'study': study,
+ }
+
+
+def barcodes_to_labels(elements, wells_map, strict_mapping, logger):
+ from copy import deepcopy
+
+ mapped_elements = []
+ for e in elements:
+ if e['source'] in wells_map:
+ new_el = deepcopy(e)
+ new_el['source'] = wells_map[e['source']]
+ mapped_elements.append(new_el)
+ else:
+ logger.warning('Unable to map well %s' % e['source'])
+
+ if strict_mapping and len(mapped_elements) < len(elements):
+        msg = 'Mapped %d records of %d' % (len(mapped_elements), len(elements))
+ logger.critical(msg)
+ sys.exit(msg)
+ return mapped_elements
+
+
+def prepare_bead_chip_array_input(array_barcode, assay_type, study, elements):
+ return [{
+ 'illumina_array': array_barcode,
+ 'label': x['array_label'],
+ 'source': x['source'],
+ 'bead_chip_assay_type': assay_type,
+ 'study': study,
+ } for x in elements]
+
+
+def prepare_bead_chip_measure_input(array_barcode, study, elements,
+ device='generic_illumina_scanner',
+ status='USABLE'):
+ records = []
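+    # Build one measure per scanner channel (Grn and Red) for each array element.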
+ for channel in ['Grn', 'Red']:
+ records.extend(
+ [
+ {
+ 'label': '%s_%s_%s' % (array_barcode, x['array_label'], channel),
+ 'source': '%s:%s' % (array_barcode, x['array_label']),
+ 'scanner': device,
+ 'status': status,
+ 'study': study,
+ } for x in elements
+ ]
+ )
+ return records
+
+
+def prepare_bead_chip_array_measures_input(array_barcode, study, elements):
+ return [{
+ 'study': study,
+ 'label': '%s_%s' % (array_barcode, x['array_label']),
+ 'red_channel': '%s_%s_Red' % (array_barcode, x['array_label']),
+ 'green_channel': '%s_%s_Grn' %(array_barcode, x['array_label']),
+ 'source': '%s:%s' % (array_barcode, x['array_label']),
+ } for x in elements]
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('prepare_illumina_import_inputs', level=args.loglevel,
+ filename=args.logfile)
+
+ logger.info('Processing file %s', args.input_file)
+ with open(args.input_file) as in_file:
+ reader = gsr(in_file)
+ assay_type = get_assay_type_enum(reader.header['A'])
+ arrays_map = {}
+ for r in reader:
+ arrays_map.setdefault(r['SentrixBarcode_A'], []).append({'source': r['Sample_ID'],
+ 'array_label': r['SentrixPosition_A']})
+ with open(args.arrays_out_file, 'w') as array_file,\
+ open(args.bead_chip_out_file, 'w') as chip_file,\
+ open(args.array_measures_out_file, 'w') as measures_file,\
+ open(args.array_measure_out_file, 'w') as measure_file:
+ arrays_writer = csv.DictWriter(array_file,
+ ['study', 'label', 'barcode', 'rows', 'columns'],
+ delimiter='\t')
+ arrays_writer.writeheader()
+ chip_writer = csv.DictWriter(chip_file,
+ ['study', 'illumina_array', 'label', 'source',
+ 'bead_chip_assay_type'],
+ delimiter='\t')
+ chip_writer.writeheader()
+        measure_writer = csv.DictWriter(measure_file,
+                                        ['study', 'label', 'source', 'scanner', 'status'],
+                                        delimiter='\t')
+        measure_writer.writeheader()
+        measures_writer = csv.DictWriter(measures_file,
+                                         ['study', 'label', 'red_channel', 'green_channel',
+                                          'source'],
+                                         delimiter='\t')
+        measures_writer.writeheader()
+ for k, v in arrays_map.iteritems():
+ arrays_writer.writerow(prepare_array_of_arrays_input(k, args.study, v))
+ chip_writer.writerows(prepare_bead_chip_array_input(k, assay_type, args.study, v))
+ measure_writer.writerows(prepare_bead_chip_measure_input(k, args.study, v))
+ measures_writer.writerows(prepare_bead_chip_array_measures_input(k, args.study, v))
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_illumina_import_inputs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_illumina_import_inputs.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,45 @@
+
+
+
+ Prepare input files used to import Illumina data
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_illumina_import_inputs.py
+ --input-file=${infile}
+ --logfile=${log_file}
+ --study=${study}
+ --arrays-out-file=${arrays_out_file}
+ --bead-chip-out-file=${bead_chip_out_file}
+ --array-measure-out-file=${measure_out_file}
+ --array-measures-out-file=${measures_out_file}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Prepare files to import IlluminaArrayOfArrays and IlluminaBeadChipArray objects, reading data
+from a GenomeStudio samplesheet
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_individuals_import.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_individuals_import.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,109 @@
+"""
+Split a file like::
+
+ individual gender father mother
+ ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
+ ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
+
+into two separate TSV files: the first one will be used to import new individuals and enrollments,
+the second one will be used to update father and mother information for the individuals in the first
+file.
+"""
+
+import sys, argparse, csv
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description='Prepare input files for individuals import workflow')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in-file', type=str, required=True,
+ help='input TSV file')
+ parser.add_argument('--out-individuals', type=str, required=True,
+                        help='output file with Individuals definitions')
+ parser.add_argument('--out-parents', type=str, required=True,
+                        help='output file with parents definitions')
+ return parser
+
+
+def get_individual_definitions(records, logger):
+ logger.info('Creating individual definitions')
+ ind_defs = []
+ for rec in records:
+ try:
+ idef = {'father': 'None', 'mother': 'None'}
+ idef['gender'] = rec['gender']
+ try:
+ idef['study'], idef['label'] = rec['individual'].split(':')
+ except ValueError:
+ logger.error('Skipped record %r, wrong label format for %s', rec, rec['individual'])
+ continue
+ except KeyError, ke:
+ logger.error('Skipped record %r, missing key %s', rec, ke)
+ continue
+ ind_defs.append(idef)
+ logger.info('Retrieved %d individual definitions', len(ind_defs))
+ return ind_defs
+
+
+def get_parents_definitions(records, logger):
+ logger.info('Creating parents definitions')
+ parents_defs = []
+ for rec in records:
+ try:
+ pdef = dict()
+ pdef['individual'] = rec['individual']
+ if rec['father'] != 'None' or rec['mother'] != 'None':
+ pdef['father'] = rec['father']
+ pdef['mother'] = rec['mother']
+ parents_defs.append(pdef)
+ else:
+ continue
+ except KeyError, ke:
+ logger.error('Skipped record %r, missing key %s', rec, ke)
+ continue
+ logger.info('Retrieved %d parents definitions', len(parents_defs))
+ return parents_defs
+
+
+def main(argv):
+ parser = get_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('prepare_individuals_import', level=args.loglevel,
+ filename=args.logfile)
+
+ logger.info('Start processing file %s', args.in_file)
+
+ with open(args.in_file) as in_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ records = [row for row in reader]
+ logger.info('Loaded %d records', len(records))
+
+ individual_defs = get_individual_definitions(records, logger)
+ with open(args.out_individuals, 'w') as inds_out:
+ inds_writer = csv.DictWriter(inds_out,
+ ['study', 'label', 'gender', 'father', 'mother'],
+ delimiter='\t')
+ inds_writer.writeheader()
+ inds_writer.writerows(individual_defs)
+
+ parents_defs = get_parents_definitions(records, logger)
+ with open(args.out_parents, 'w') as parents_out:
+ parents_writer = csv.DictWriter(parents_out, ['individual', 'father', 'mother'],
+ delimiter='\t')
+ parents_writer.writeheader()
+ parents_writer.writerows(parents_defs)
+
+ logger.info('Job completed')
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_individuals_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_individuals_import.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,44 @@
+
+
+
+ Prepare input files for individuals import/parents update workflow
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_individuals_import.py
+ --logfile=${log_file}
+ --in-file=${in_file}
+ --out-individuals=${inds_out}
+ --out-parents=${parents_out}
+
+
+
+
+
+
+
+
+
+
+
+
+
+Split a file like::
+
+ individual gender father mother
+ ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
+ ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
+
+into two separate TSV files: the first one will be used to import new individuals and enrollments,
+the second one will be used to update father and mother information for the individuals in the first
+file.
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,293 @@
+"""
+This tool produces files that can be used as input to import
+* samples
+* flowcells
+* lanes
+* laneslots
+within OMERO.biobank using import applications.
+If the optional 'study-output-file' parameter is given as input, the
+script will produce the input file for a new study definition.
+If the optional 'tubes-subsamples-output-file' is given, the script
+will generate another file with tubes definitions where each tube is
+produced applying a specific laboratory protocol to an existing
+tube. Existing tubes are the ones in tubes-out-file; new tubes' labels
+are created using the pattern tube_label::protocol.
+The config_parameters field must point to a YAML configuration file
+with the following structure:
+
+ config_parameters:
+ study_label: study_label
+ namespace: namespace
+
+where study_label is mandatory and namespace is optional.
+"""
+
+import csv, sys, argparse, logging, yaml
+# Needed to import flowcell data
+from bioblend.galaxy import GalaxyInstance
+import nglimsclient, os
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='split sequencing samplesheet')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in-file', '-i', type=str, required=True,
+ help='input file')
+ parser.add_argument('--tubes-out-file', type=str,
+ help='output file containing tube definitions',
+ default='./tubes_def.tsv')
+ parser.add_argument('--flowcells-out-file', type=str,
+ help='output file containing flowcell definitions',
+ default='./flowcells_def.tsv')
+ parser.add_argument('--lanes-out-file', type=str,
+ help='output file containing lane definitions',
+ default='./lanes_def.tsv')
+ parser.add_argument('--laneslots-out-file', type=str,
+ help='output file containing laneslot definitions',
+ default='./laneslots_def.tsv')
+ parser.add_argument('--config-parameters', type=str, required=True,
+ help='a YAML configuration file containing study label and labels namespace, '
+ 'namespace is optional')
+ parser.add_argument('--study-output-file', type=str,
+ help='output file containing study definition')
+ parser.add_argument('--tubes-subsamples-output-file', type=str,
+ help='output file containing tubes subsamples (samples produced applying a '
+ 'laboratory protocol to existing samples)')
+ return parser
+
+
+def get_samplesheet_translator(samplesheet_type='default'):
+ translator = {'default': {'flowcell_id': 'FCID',
+ 'tube_id': 'SampleID',
+ 'lane_id': 'Lane',
+ 'sample_tag': 'Index',
+ 'protocol': 'Recipe',
+ 'operator': 'Operator',
+ 'sample_project': 'SampleProject'}
+ }
+ return translator[samplesheet_type]
+
+def add_namespace(namespace, label, separator='|'):
+ return separator.join([namespace, label])
+
+def write_tubes_file(records, study_label, translator, ofile,
+ namespace = None, logger = None):
+ ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
+ 'vessel_status', 'source', 'source_type']
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+ writer.writeheader()
+        tubes_def = set([r[translator['tube_id']].strip() for r in records])
+ for x in tubes_def:
+ writer.writerow({'study' : study_label,
+ 'label' : x if not namespace else add_namespace(namespace, x),
+ 'vessel_type' : 'Tube',
+ 'vessel_content' : 'DNA',
+ 'vessel_status' : 'UNKNOWN',
+ 'source' : 'None',
+ 'source_type' : 'NO_SOURCE'})
+
+
+def write_subsamples_file(records, study_label, translator, ofile,
+ namespace = None, logger = None):
+ ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
+ 'vessel_status', 'source', 'source_type', 'options']
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+ writer.writeheader()
+ subsamples_def = set([('%s::%s' % (r[translator['tube_id']].strip(), r[translator['protocol']].strip()),
+ r[translator['tube_id']].strip(),
+ r[translator['protocol']].strip()) for r in records])
+ for x in subsamples_def:
+ writer.writerow({'study' : study_label,
+ 'label' : x[0] if not namespace else add_namespace(namespace, x[0]),
+ 'vessel_type' : 'Tube',
+ 'vessel_content' : 'DNA',
+ 'vessel_status' : 'UNKNOWN',
+ 'source' : x[1] if not namespace else add_namespace(namespace, x[1]),
+ 'source_type' : 'Tube',
+ 'options' : 'protocol=%s' % x[2]})
+
+
+def write_flowcells_file(records, study_label, translator, ofile,
+ namespace = None, logger=None):
+ ofile_fields = ['study', 'label', 'barcode', 'container_status',
+ 'number_of_slots']
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+ writer.writeheader()
+ flowcells_def = set([(r[translator['flowcell_id']].strip()) for r in records])
+ for x in flowcells_def:
+ writer.writerow({'study' : study_label,
+ 'label' : x if not namespace else add_namespace(namespace, x),
+ 'barcode' : x if not namespace else add_namespace(namespace, x),
+ 'container_status' : 'INSTOCK',
+ 'number_of_slots' : '8'})
+
+
+def write_lanes_file(records, study_label, translator, ofile,
+ namespace = None, logger=None):
+ ofile_fields = ['study', 'flow_cell', 'slot', 'container_status']
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+ writer.writeheader()
+ lanes_def = set([(r[translator['flowcell_id']].strip(),
+ r[translator['lane_id']].strip())
+ for r in records])
+ for x in lanes_def:
+ writer.writerow({'study' : study_label,
+ 'flow_cell' : x[0] if not namespace else add_namespace(namespace, x[0]),
+ 'slot' : x[1],
+ 'container_status' : 'INSTOCK'})
+
+
+def write_laneslots_file(records, study_label, translator, ofile,
+ subsamples_enabled=False,
+ namespace = None, logger=None):
+    logger.debug('subsamples_enabled: %r' % subsamples_enabled)
+ ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
+ 'source_type', 'options']
+ # Get NGLIMS host and key
+ try:
+ galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
+ api_key = os.environ['NGLIMS_GALAXY_API_KEY']
+ except KeyError as ke:
+ msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
+ sys.exit(msg)
+ # Get flowcell label (assuming label is the same for all records)
+ fc_id = records[0][translator['flowcell_id']].strip()
+ # Get flowcell details from nglims
+ gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
+    if gi.nglims.exists_flowcell_id(fc_id):
+        fc_data = gi.nglims.flowcell_complete_details(fc_id)
+    else:
+        msg = 'Flowcell %s not found in nglims' % fc_id
+        sys.exit(msg)
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+ writer.writeheader()
+ laneslots_def = set()
+ for r in records:
+ fc_id = r[translator['flowcell_id']].strip() if not namespace else \
+ add_namespace(namespace, r[translator['flowcell_id']]).strip()
+ if subsamples_enabled:
+ source_tube_id = '%s::%s' % (r[translator['tube_id']].strip(),
+ r[translator['protocol']].strip())
+ else:
+ source_tube_id = r[translator['tube_id']].strip()
+ # Identify adapter
+            adapter = [i['adapter'] for i in fc_data['details']
+                       if i['name'] == r[translator['tube_id']].strip()
+                       and i['lane'] == int(r[translator['lane_id']].strip())]
+ laneslots_def.add(('%s:%s' % (fc_id, r[translator['lane_id']].strip()),
+ r[translator['sample_tag']].strip(),
+ source_tube_id,
+ r[translator['protocol']].strip(),
+ r[translator['operator']].strip(),
+ r[translator['sample_project']].strip(),
+ adapter[0]))
+ for x in laneslots_def:
+ writer.writerow({'study' : study_label,
+ 'lane' : x[0],
+ 'tag' : x[1],
+ 'content' : 'DNA',
+ 'source' : x[2] if not namespace else \
+ add_namespace(namespace, x[2]),
+ 'source_type' : 'Tube',
+ 'options' : 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
+ (x[3], x[4], x[5], x[6])})
+
+
+def write_study_file(study_label, records, translator, ofile, logger=None):
+ ofile_fields = ['label', 'description']
+ with open(ofile, 'w') as out_file:
+ writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t', )
+ writer.writeheader()
+ writer.writerow({'label': study_label})
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format' : LOG_FORMAT,
+ 'datefmt' : LOG_DATEFMT,
+ 'level' : log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+ logger = logging.getLogger('prepare_seq_dsample_inputs')
+
+ with open(args.in_file, 'rU') as f:
+ logger.info('Loading data from file %s' % args.in_file)
+ reader = csv.DictReader(f, delimiter='\t')
+ recs = [r for r in reader]
+ translator = get_samplesheet_translator()
+
+ with open(args.config_parameters) as cfgf:
+ conf = yaml.load(cfgf)
+ if not conf.has_key('config_parameters'):
+ raise RuntimeError('Bad configuration file')
+ else:
+ try:
+ study_label = conf['config_parameters']['study_label']
+ except KeyError:
+ raise RuntimeError('No study_label provided')
+ if conf['config_parameters'].has_key('namespace'):
+ namespace = conf['config_parameters']['namespace']
+ else:
+ namespace = None
+
+ if args.study_output_file:
+ logger.info('Writing Study definition file %s' % args.study_output_file)
+ write_study_file(study_label, recs, translator, args.study_output_file, logger)
+ logger.info('Done writing file %s' % args.study_output_file)
+
+ logger.info('Writing Tube definitions file %s' % args.tubes_out_file)
+ write_tubes_file(recs, study_label, translator,
+ args.tubes_out_file, namespace,
+ logger)
+ logger.info('Done writing file %s' % args.tubes_out_file)
+
+ if args.tubes_subsamples_output_file:
+ logger.info('Writing Tubes\' subsamples definitions file %s' \
+ % args.tubes_subsamples_output_file)
+ write_subsamples_file(recs, study_label, translator,
+ args.tubes_subsamples_output_file,
+ namespace, logger)
+ logger.info('Done writing file %s' % args.tubes_subsamples_output_file)
+
+ logger.info('Writing FlowCell definitions file %s' % args.flowcells_out_file)
+ write_flowcells_file(recs, study_label, translator,
+ args.flowcells_out_file, namespace,
+ logger)
+ logger.info('Done writing file %s' % args.flowcells_out_file)
+
+ logger.info('Writing Lane definitions file %s' % args.lanes_out_file)
+ write_lanes_file(recs, study_label, translator,
+ args.lanes_out_file, namespace,
+ logger)
+ logger.info('Done writing file %s' % args.lanes_out_file)
+
+ logger.info('Writing LaneSlot definitions file %s' % args.laneslots_out_file)
+ write_laneslots_file(recs, study_label, translator,
+ args.laneslots_out_file,
+                         bool(args.tubes_subsamples_output_file), # True when subsamples have been created
+ namespace,
+ logger)
+ logger.info('Done writing file %s' % args.laneslots_out_file)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,77 @@
+
+
+ Extract OMERO.biobank objects from a sequencing samplesheet
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_seq_dsample_inputs.py
+ --in-file=${infile}
+ --logfile=${log_file}
+ --tubes-out-file=${tubes_ofile}
+ --flowcells-out-file=${flowcells_ofile}
+ --lanes-out-file=${lanes_ofile}
+ --laneslots-out-file=${laneslots_ofile}
+ --config-parameters=${config_params}
+ --study-output-file=${study_ofile}
+ --tubes-subsamples-output-file=${subsamples_ofile}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool produces files that can be used as input to import
+ * samples
+ * flowcells
+ * lanes
+ * laneslots
+
+within OMERO.biobank using import applications.
+
+If the optional 'study-output-file' parameter is given as input, the
+script will produce the input file for a new study definition.
+
+If the optional 'tubes-subsamples-output-file' is given, the script
+will generate another file with tubes definitions where each tube is
+produced applying a specific laboratory protocol to an existing
+tube. Existing tubes are the ones in tubes-out-file; new tubes' labels
+are created using the pattern **tube_label::protocol**.
+The config_parameters field must point to a YAML configuration file
+with the following structure:
+
+ config_parameters:
+ study_label: study_label
+
+ namespace: namespace
+
+where study_label is mandatory and namespace is optional.
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_seq_out_inputs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_out_inputs.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,102 @@
+"""
+This tool produces files that can be used as input to import
+ * SequencerOutput data samples
+ * SequencerOutput data objects
+within OMERO.biobank using import applications.
+
+Input file must be like::
+
+ run_directory path
+ 130418_SN194_0303_BC1NYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
+ 160418_SN194_0304_BCAZYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
+ ....
+"""
+
+import csv, sys, argparse, logging
+
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='build sequencer output import files')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in-file', '-i', type=str, required=True,
+ help='input file')
+ parser.add_argument('--dsamples-out-file', type=str, default='./seq_out_dsamples.tsv',
+ help='output file containing data samples definitions')
+ parser.add_argument('--dobjects-out-file', type=str, default='./seq_out_dobjects.tsv',
+ help='output file containing data objects definitions')
+ parser.add_argument('--study', '-s', type=str, required=True)
+ return parser
+
+
+def write_dsamples_file(records, out_file, study_label):
+
+    def parse_run_directory(run_dir):
+ _, device, _, flowcell = run_dir.split('_')
+ return device, flowcell[1:]
+
+ with open(out_file, 'w') as ofile:
+ out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type',
+ 'status', 'device']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ for r in records:
+            device, flowcell = parse_run_directory(r)
+ writer.writerow({'study': study_label,
+ 'label': r,
+ 'source': flowcell,
+ 'source_type': 'FlowCell',
+ 'seq_dsample_type': 'SequencerOutput',
+ 'status': 'USABLE',
+ 'device': device})
+
+
+def write_dobjects_file(records, out_file, study_label):
+ with open(out_file, 'w') as ofile:
+ out_file_header = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
+ writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+ writer.writeheader()
+ for r in records:
+ writer.writerow({'study': study_label,
+ 'path': r['path'],
+ 'data_sample': r['run_directory'],
+ 'mimetype': 'x-vl/illumina-run-folder',
+ 'size': '-1',
+ 'sha1': 'N.A.'})
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ log_level = getattr(logging, args.loglevel)
+ kwargs = {'format': LOG_FORMAT,
+ 'datefmt': LOG_DATEFMT,
+ 'level': log_level}
+ if args.logfile:
+ kwargs['filename'] = args.logfile
+ logging.basicConfig(**kwargs)
+    logger = logging.getLogger('prepare_seq_out_inputs')
+
+ with open(args.in_file) as f:
+ logger.info('Loading data from file %s', args.in_file)
+ reader = csv.DictReader(f, delimiter='\t')
+ recs = [r for r in reader]
+
+ logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
+ write_dsamples_file(set([r['run_directory'] for r in recs]),
+ args.dsamples_out_file, args.study)
+ logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
+ write_dobjects_file(recs, args.dobjects_out_file, args.study)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/prepare_seq_out_inputs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_out_inputs.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,46 @@
+
+
+ Extract OMERO.biobank objects that can be used to import SequencerOutput data
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=prepare_seq_out_inputs.py
+ --in-file=${infile}
+ --logfile=${log_file}
+ --dsamples-out-file=${dsamples_ofile}
+ --dobjects-out-file=${dobjects_ofile}
+ --study=${study}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool produces files that can be used as input to import
+ * SequencerOutput data samples
+ * SequencerOutput data objects
+within OMERO.biobank using import applications.
+
+Input file must be like::
+
+ run_directory path
+ 130418_SN194_0303_BC1NYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
+ 160418_SN194_0304_BCAZYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
+ ....
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/split_by_study.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/split_by_study.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,90 @@
+"""
+Split a file like::
+
+ individual gender father mother
+ ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
+ ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
+ BSTUDY:12515 FEMALE BSTUDY:3512 BSTUDY:124
+
+into multiple files based on the STUDY value of the label stored in the "individual" column.
+Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line
+will be skipped.
+"""
+
+import sys, argparse, csv, os
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description='Split a file containing pedigree information into multiple files using the study as split criterion')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('--in-file', type=str, required=True,
+ help='input TSV file')
+ parser.add_argument('--record-id', type=int,
+ help='Output ID record given by Galaxy')
+ parser.add_argument('--out-path', type=str, help='Output directory',
+ default='.')
+ return parser
+
+
+def split_element(element, logger):
+ try:
+ study, code = element.split(':')
+ return study, code
+ except ValueError:
+ logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
+ return None, None
+
+
+def map_by_study(records, logger):
+ records_map = {}
+ for rec in records:
+ study, code = split_element(rec['individual'], logger)
+ if not study and not code:
+ logger.debug('Skipping record %r', rec)
+ continue
+ records_map.setdefault(study, []).append(rec)
+    logger.info('Records split across %d studies', len(records_map))
+ return records_map
+
+
+def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
+
+ def get_file_name(study, out_path, galaxy_id=None):
+ if not galaxy_id:
+ file_name = '%s_individuals.tsv' % study
+ else:
+ file_name = 'primary_%d_%s_visible_tabular' % (galaxy_id, study.replace('_', '-'))
+ return os.path.join(out_path, file_name)
+
+ fname = get_file_name(study_label, output_path, galaxy_record_id)
+ with open(fname, 'w') as ofile:
+ logger.info('Dumping %d records to file %s', len(records), fname)
+ writer = csv.DictWriter(ofile, header, delimiter='\t')
+ writer.writeheader()
+ writer.writerows(records)
+
+
+def main(argv):
+ parser = get_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)
+
+ logger.info('Start processing file %s', args.in_file)
+ with open(args.in_file) as in_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ records = [row for row in reader]
+
+ records_map = map_by_study(records, logger)
+ # Force the header of the output files in order to prevent problems when running the workflow later
+ header = ['individual', 'gender', 'father', 'mother']
+ for study, records in records_map.iteritems():
+ dump_records(study, records, header, args.out_path, logger, args.record_id)
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/split_by_study.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/split_by_study.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,42 @@
+
+
+
+ Split a file containing pedigree information into multiple files using the study as split criterion
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=split_by_study.py
+ --in-file=${in_file}
+ --logfile=${log_file}
+ --record-id=$log_file.id
+ --out-path=$__new_file_path__
+
+
+
+
+
+
+
+
+
+
+
+Split a file like::
+
+ individual gender father mother
+ ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
+ ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
+ BSTUDY:12515 FEMALE BSTUDY:3512 BSTUDY:124
+
+into multiple files based on the STUDY value of the label stored in the "individual" column.
+Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line
+will be skipped.
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/wells_barcode_to_label.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/wells_barcode_to_label.py Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,114 @@
+"""
+Map PlateWell labels written as PLATE_BARCODE:WELL_LABEL to labels written as
+PLATE_LABEL:WELL_LABEL which is the PlateWell label format required by the map_vid
+application.
+The inputs are a TSV file and the label of the column of this file containing the
+PlateWell labels that are going to be mapped.
+"""
+
+import csv, argparse, sys, copy
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.utils import LOG_LEVELS, get_logger
+import bl.vl.utils.ome_utils as vlu
+
+
+def get_wells_map(kb, plate_barcodes, logger):
+ wells_map = {}
+ logger.info('Start building PlateWells map')
+ res = kb.get_by_field(kb.TiterPlate, 'barcode', plate_barcodes)
+ logger.debug('Plates %r --- Results: %r', plate_barcodes, res)
+ for _, pl in res.iteritems():
+ if pl.OME_TABLE == 'TiterPlate':
+ if pl.barcode:
+ for w in kb.get_wells_by_plate(pl):
+ logger.debug('Mapping well %s of plate %s', w.label, w.container.label)
+ wells_map['%s:%s' % (w.container.barcode, w.label)] = '%s:%s' % (w.container.label,
+ w.label)
+ else:
+ logger.debug('TiterPlate %s has no barcode', pl.label)
+ else:
+ logger.debug('Object is a %r, skipping it', pl.OME_TABLE)
+ logger.info('Mapped %d PlateWells', len(wells_map))
+ return wells_map
+
+
+def get_plates_list(records, plates_column, logger):
+ plates = set()
+ logger.info('Retrieving TiterPlate barcodes from %d records', len(records))
+ for r in records:
+ plates.add(r[plates_column].split(':')[0])
+ logger.info('Found %d TiterPlate objects', len(plates))
+ return list(plates)
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(description='Map barcodes in PlateWell labels to TiterPlate labels')
+ parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+ parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+ help='logging level', default='INFO')
+ parser.add_argument('-H', '--host', type=str, help='OMERO host')
+ parser.add_argument('-U', '--user', type=str, help='OMERO user')
+ parser.add_argument('-P', '--passwd', type=str, help='OMERO password')
+ parser.add_argument('--in-file', type=str, required=True,
+ help='input TSV file')
+ parser.add_argument('--column-label', type=str, required=True,
+                        help='the label of the column containing the values that will be mapped')
+ parser.add_argument('--out-file', type=str, required=True,
+ help='output TSV file')
+ parser.add_argument('--strict-mapping', action='store_true',
+ help='if output records are less than the input ones, raise an error')
+ return parser
+
+
+def main(argv):
+ parser = make_parser()
+ args = parser.parse_args(argv)
+
+ logger = get_logger('wells_barcode_to_label', level=args.loglevel,
+ filename=args.logfile)
+ try:
+ host = args.host or vlu.ome_host()
+ user = args.user or vlu.ome_user()
+ passwd = args.passwd or vlu.ome_passwd()
+ except ValueError, ve:
+ logger.critical(ve)
+ sys.exit(ve)
+
+ logger.info('Starting job')
+
+ kb = KB(driver='omero')(host, user, passwd)
+
+ with open(args.in_file) as in_file, open(args.out_file, 'w') as out_file:
+ reader = csv.DictReader(in_file, delimiter='\t')
+ if args.column_label not in reader.fieldnames:
+ msg = 'No column %s in file %s' % (args.column_label, args.in_file)
+ logger.critical(msg)
+ raise RuntimeError(msg)
+ records = [row for row in reader]
+ plates = get_plates_list(records, args.column_label, logger)
+ wells_map = get_wells_map(kb, plates, logger)
+ logger.info('Mapping %d records', len(records))
+ writer = csv.DictWriter(out_file, reader.fieldnames, delimiter='\t')
+ writer.writeheader()
+ mapped_records = []
+ for rec in records:
+ mapped = copy.deepcopy(rec)
+ logger.debug('Mapping value %s', mapped[args.column_label])
+ if mapped[args.column_label] in wells_map:
+ mapped[args.column_label] = wells_map[mapped[args.column_label]]
+ mapped_records.append(mapped)
+ if args.strict_mapping and len(mapped_records) < len(records):
+            msg = 'Mapped %d records of %d' % (len(mapped_records), len(records))
+ logger.critical(msg)
+ sys.exit(msg)
+ logger.info('%d records mapped', len(mapped_records))
+ writer.writerows(mapped_records)
+ logger.info('Job completed')
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank/utils/wells_barcode_to_label.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/wells_barcode_to_label.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,59 @@
+
+
+
+ Map plate barcodes in well labels to plate labels
+
+
+
+ launcher.sh
+ --interpreter=python
+ --runner=wells_barcode_to_label.py
+ --in-file=${in_file}
+ --logfile=${log_file}
+ --column-label=${column_label}
+ #if $strict_mapping
+ --strict-mapping
+ #end if
+ --out-file=${out_file}
+ --host=$__user_omero_host__
+ --user=$__user_omero_user__
+ --passwd=$__user_omero_password__
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Map PlateWell labels written as PLATE_BARCODE:WELL_LABEL to labels written as
+PLATE_LABEL:WELL_LABEL which is the PlateWell label format required by the map_vid
+application.
+The inputs are a TSV file and the label of the column of this file containing the
+PlateWell labels that are going to be mapped.
+
+
+
\ No newline at end of file
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/biobank_tool_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank_tool_conf.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,81 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r ba6cf6ede027 galaxy-tools/orione_biobank_tool_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/orione_biobank_tool_conf.xml Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+