Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/MAGE.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/MAGE.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1038 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Funcgen::MAGE + +=head1 SYNOPSIS + +my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); +$imp->register_experiment(); + + +=head1 DESCRIPTION + +B<This program> is a base main class for all MAGE type array importers(e.g. Nimblegen). + +=cut + +################################################################################ + +package Bio::EnsEMBL::Funcgen::Parsers::MAGE; + +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(get_date open_file run_system_cmd); +use Bio::EnsEMBL::Utils::Exception qw( throw deprecate ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::MAGE::XMLUtils; + + +use File::Path; +use strict; +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); + + + +################################################################################ + +=head2 new + + Description : Constructor method + + Arg [1] : hash containing optional attributes: + + ReturnType : Bio::EnsEMBL::Funcgen::MAGE + Example : my $Exp = Bio::EnsEMBL::Nimblegen->new(%params); + Exceptions : throws if mandatory params are not set or DB connect fails + Caller : General + Status : Medium - potential for %params names to change, remove %attrdata? + +=cut + +################################################################################ + +sub new{ + my ($caller) = shift; + my $class = ref($caller) || $caller; + my $self = $class->SUPER::new(@_); + + #This needs to be an Importer! + throw("This is base class for the experiment Bio::EnsEMBL::Funcgen::Parsers, needs to inherit from Bio::EnsEMBL::Funcgen::Importer") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); + + + #Are we not passing any Helper params? + #Log file etc is set in the run script + + my ($write_mage, $no_mage, $vendor) + = rearrange(['WRITE_MAGE', 'NO_MAGE', 'VENDOR'], @_); + + + #$self->{'update_xml'} = $update_xml || 0; + $self->{'write_mage'} = $write_mage || 0; + $self->{'no_mage'} = $no_mage || 0; + $self->{'vendor'} = $vendor; + + + if ($self->vendor ne 'NIMBLEGEN'){ + $self->{'no_mage'} = 1; + warn "Hardcoding no_mage for non-NIMBLEGEN imports"; + } + + + if($self->{'no_mage'} && $self->{'write_mage'}){ + throw('-no_mage and -write_mage options are mutually exclusive, please select just one'); + } + + return $self; +} + +=head2 process_experiment_config + + Example : $self->init_experiment_import(); + Description: Initialises import by creating working directories + and by storing the Experiemnt + Returntype : none + Exceptions : warns and throws depending on recover and Experiment status + Caller : general + Status : at risk - merge with register exeriment + +=cut + +#This is actually processing the tab2mage file & writing XML + +sub process_experiment_config{ + my $self = shift; + + #Here, this is where we need to call the a Parser from Importer to do this for only MAGE experiments + #validate_import? + + #This is only used for the first test below. + my $exp_adaptor = $self->db->get_ExperimentAdaptor(); + my $xml = $exp_adaptor->fetch_mage_xml_by_experiment_name($self->name);# if $self->{'write_xml'}; + + #DO NOT CHANGE THIS LOGIC! + #write mage if we specify or we don't have a the final xml or the template + #recovery is turned on to stop exiting when previously stored chips are found from the 'write_mage' run. + #This does mean that if you import without running the write_mage step + #you could potentially be overwriting someone elses experiment info + #No way of getting around this, need to make warning obvious, add to end of log!!! + #We always want to write and update xml and ResultSets if we're running the 2nd stage of the import + #Why would we ever want to skip the validate process? + #Leave for now as this is working as we want it + #But propose to remove skip functionality + + if( ! $self->{'no_mage'}){ + + if($self->{'write_mage'} || !( -f $self->get_config('tab2mage_file') || $xml)){ + $self->{'write_mage'} = 1; + $self->backup_file($self->get_config('tab2mage_file')); + } + #elsif($xml && (! $self->{'update_xml'})){#Changed this so we always update + #elsif(! $self->{'update_xml'}){ + + + + #Here, we need to always update_xml + #If we are doing the 2nd stage + #Currently this is skipping as we haven't explicitly set it + #To remove this... + #what we need to do is check that we don't test for update_xml, + # i.e. assuming that we're running the second stage of the import. + # Therefore we need a boolean to set whether it is the first stage..else update_xml implicit + # write mage is explicit flag + # Or if we have not tab2mage file? + # we can then override this explicitly with update_xml? + # WE're never likely edit the xml directly, so we always want to validate and update + # so write mage flag become update_experiment? No this is no obvious behaviour + # We need to warn about removing the write_mage flag after we have updated it + # Otherwise we will never get to 2nd stage + + + #No mage is still valid as we may want to jus import and experiment + #Before receiving correct meta data + #When we can then rerun the import with -write_mage to update the resultsets + + # $self->{'recover'} = 1; + # $self->{'skip_validate'} = 1; + #} + elsif( -f $self->get_config('tab2mage_file')){#Run Tab2Mage + + $self->backup_file($self->get_config('mage_xml_file')); + my $cmd = 'tab2mage.pl -e '.$self->get_config('tab2mage_file').' -k -t '.$self->get_dir('output'). + ' -c -d '.$self->get_dir('results'); + + $self->log('Reading tab2mage file'); + my $t2m_exit_code = run_system_cmd($cmd, 1);#no exit flag due to non-zero exit codes + warn "tab2mage exit code is $t2m_exit_code"; + + if(! ($t2m_exit_code > -1) && ($t2m_exit_code <255)){ + throw("tab2mage failed. Please check and correct:\t".$self->get_config('tab2mage_file')."\n...and try again"); + } + + $self->{'recover'} = 1; + } + } + + return; +} + +=heead init_tab2mage_export + + Example : $self->init_tab2mage_export; + Description: Writes the standard experiment section of the tab2mage file + Returntype : FileHandle + Exceptions : ??? + Caller : general + Status : at risk + +=cut + +sub init_tab2mage_export{ + my $self = shift; + + $self->backup_file($self->get_config('tab2mage_file')) if(-f $self->get_config('tab2mage_file')); + + my $t2m_file = open_file($self->get_config('tab2mage_file'), '>'); + + #reformat this + my $exp_section = "experiment section\ndomain\t".(split/@/, $self->contact())[1]."\naccession\t\n". + "quality_control\tbiological_replicate\nexperiment_design_type\tbinding_site_identification\n". + "name\t".$self->name()."\nrelease_date\t\nsubmission_date\t\nsubmitter\t???\n". + "submitter_email\t???\ninvestigator\t???\ninvestigator_email\t???\norganization\t???\naddress\t". + "???\npublication_title\t\nauthors\t\njournal\t\nvolume\t\nissue\t\npages\t\nyear\t\npubmed_id\t\n"; + + my $protocol_section = "Protocol section\naccession\tname\ttext\tparameters\n"; + + foreach my $protocol(sort (keys %{$self->get_config('protocols')})){ + $protocol_section .= $self->get_config('protocols')->{$protocol}->{'accession'}. + "\t".$self->get_config('protocols')->{$protocol}->{'name'}. + "\t".$self->get_config('protocols')->{$protocol}->{'text'}."\t"; + + $protocol_section .= (defined $self->get_config('protocols')->{$protocol}->{'parameters'}) ? + $self->get_config('protocols')->{$protocol}->{'parameters'}."\t\n" : "\t\n"; + } + + #File[raw] Array[accession] Array[serial] Protocol[grow] Protocol[treatment] Protocol[extraction] Protocol[labeling] Protocol[hybridization] Protocol[scanning] BioSource Sample Extract LabeledExtract Immunoprecipitate Hybridization BioSourceMaterial SampleMaterial ExtractMaterial LabeledExtractMaterial Dye BioMaterialCharacteristics[Organism] BioMaterialCharacteristics[BioSourceType] BioMaterialCharacteristics[StrainOrLine] BioMaterialCharacteristics[CellType] BioMaterialCharacteristics[Sex] FactorValue[StrainOrLine] FactorValue[Immunoprecipitate] + + + #Need to do this bit better? + #have array of fields. We can then populate a hash in the read method based on field names, then use the array to print in order + + my $hyb_header = "\nHybridization section\n".join("\t", @{$self->hybridisation_fields()}); + + print $t2m_file $exp_section."\n".$protocol_section."\n".$hyb_header."\n"; + + return $t2m_file; +} + + +#Move to MAGE package? + +sub hybridisation_fields{ + my $self = shift; + + return ['File[raw]', 'Array[accession]', 'Array[serial]', + (map 'Protocol['.$_.']', (sort (keys %{$self->get_config('protocols')}))), + 'BioSource', 'Sample', 'Extract', 'LabeledExtract', 'Immunoprecipitate', 'Hybridization', + 'BioSourceMaterial', 'SampleMaterial', 'ExtractMaterial', 'LabeledExtractMaterial', + 'Dye', 'BioMaterialCharacteristics[Organism]', 'BioMaterialCharacteristics[BioSourceType]', + 'BioMaterialCharacteristics[StrainOrLine]', 'BioMaterialCharacteristics[CellType]', + 'BioMaterialCharacteristics[Sex]', 'FactorValue[StrainOrLine]', 'FactorValue[Immunoprecipitate]']; +} + + + +#=head2 register_experiment +# +# Example : $imp->register_experiment() +# Description: General control method, performs all data import and normalisations +# Arg [1] : optional - dnadb DBAdaptor +# Returntype : none +# Exceptions : throws if arg is not Bio::EnsEMBL::DBSQL::DBAdaptor +# Caller : general +# Status : Medium +# +#=cut + +#write/validate_mage + +sub write_validate_experiment_config{ + my $self = shift; + + + if($self->{'write_mage'} || $self->{'no_mage'}){ + $self->read_data("array"); + + if(! $self->{'no_mage'}){ + $self->log("PLEASE CHECK AND EDIT AUTOGENERATED TAB2MAGE FILE:\t".$self->get_config('tab2mage_file')); + #we could make this print only if it was set by the user, not by the Importer + $self->log('REMEMBER TO REMOVE -write_mage FLAG BEFORE UPDATING'); + exit; + } + } + elsif(! $self->{'no_mage'}){#This should be a no_channel flag, set dependent on import mode(gff_chip, gff_chan) + #Need to accomodate chip level imports in validate? + + if (! $self->{'skip_validate'}){ + + $self->log("Validating mage file:\t".$self->get_config('mage_xml_file')); + + + #Updating ResultSets: + #Given that we might want to add a chip to an experiment we will also need to update the tab2MAGE + #mage_xml and ResultSets accordingly. + + #This should happen if we specify update_xml + #Should recovery also always force update? + #Considering the two run modes, write tab2mage & validate and import + #There is a subtle difference between recovery and update mage_xml + #Do we always run in recovery mode for the validate&import step? + #Yes we do, so can't guarantee the this means we want to update. + #So we need to change update_xml to update to reflect the changed functionality on ResultSets + + #If we run an update without on then chips will be loaded but xml and ResultSets will not be altered :( + #If we're running the 2nd stage we should always be updating the xml anyway!!!! + #As there is no reason to rerun the validate & import step without it.(unless we're debugging maybe) + #So why should we ever run without it? + + #To update ResultSets we validate as normal and then update where appropriate + #What has precedence? Replicate name? + #Update echip types as appropriate + #What if this invalidates original rsets? + #Then list sets not covered for removal by script? + + + + my (%echips, @log); + my $rset_adaptor = $self->db->get_ResultSetAdaptor; + my $chan_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name('RawValue'); + + #need to change this to default analysis + #There we issues with setting VSN_GLOG as an env var + #as this is tested for and the norm was skipped for some reason? + my $chip_anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->norm_method()); + + #Try import sets like this first, so we know ther is new data + my $chan_rset = $self->get_import_ResultSet($chan_anal, 'channel'); + my $rset = $self->get_import_ResultSet($chip_anal, 'experimental_chip'); + + + #Else get them anyway and log + if(! $rset){ + + if($chan_rset){ + $self->log('Identified partial Channel only import, updating MAGE-XML'); + } + else{ + ($chan_rset) = @{$rset_adaptor->fetch_all_by_name_Analysis($self->experiment->name.'_IMPORT', $chan_anal)}; + #Don't need to test for >1 here as this has already been done in get_import_ResultSet + $self->log('All ExperimentalChips imported, updating MAGE-XML only'); + } + + ($rset) = @{$rset_adaptor->fetch_all_by_name_Analysis($self->experiment->name.'_IMPORT', $chip_anal)}; + } + + + #This will never happen now due to the change tab2mage rules in init_experiment + #Remove? + if(! $rset){ + throw('Cannot find ResultSet, are you trying to import a new experiment which already has a tab2mage file present? Try removing the file, or specifying the -write_mage flag to parse_and_import.pl'); + } + + if(! -l $self->get_dir('output').'/MAGE-ML.dtd'){ + system('ln -s '.$ENV{'EFG_DATA'}.'/MAGE-ML.dtd '.$self->get_dir('output').'/MAGE-ML.dtd') == 0 || + throw('Failed to link MAGE-ML.dtd'); + } + + + $self->log('VALIDATING MAGE XML'); + my $reader = Bio::MAGE::XML::Reader->new(); + my $mage_xml ||= $self->get_config('mage_xml_file'); + $self->{'mage'} = $reader->read($mage_xml); + + #this should only ever return 1 for an import + foreach my $mage_exp(@{$self->{'mage'}->getExperiment_package->getExperiment_list()}){ + + if($mage_exp->getName() ne $self->name()){ + $self->log('MAGE experiment name ('.$mage_exp->getName().') does not match import name ('.$self->name().')'); + } + + #add more experiment level validation here? + + foreach my $assay (@{$mage_exp->getBioAssays()}){ + + if($assay->isa('Bio::MAGE::BioAssay::PhysicalBioAssay')){#channel + $self->log('Validating PhysicalBioAssay "'.$assay->getName()."'\n");#hyb name(this is the file name for measured assays + + my $bioassc = $assay->getBioAssayCreation();#This is a Hybridisation + my $array = $bioassc->getArray();#this is an ArrayChip + my $design_id = $array->getArrayDesign->getIdentifier(); + my $chip_uid = $array->getArrayIdentifier(); + + + foreach my $echip(@{$rset->get_ExperimentalChips()}){ + + + if($echip->unique_id() eq $chip_uid){ + $self->log("Found ExperimentalChip:\t".$chip_uid); + + if(! exists $echips{$chip_uid}){ + $echips{$chip_uid} = {( + total_biorep => undef, + total_biotechrep => undef, + experimental_biorep => undef, + experimental_biotechrep => undef, + total_dye => undef, + experimental_dye => undef, + cell_type => undef, + feature_type => undef, + )}; + } + + #Validate ArrayChip + my ($achip) = @{$self->db->get_ArrayChipAdaptor->fetch_all_by_ExperimentalChips([$echip])}; + + if($achip->design_id() ne $design_id){ + push @log, "ArrayDesign Identifier (${design_id}) does not match ArrayChip design ID (". + $achip->design_id().")\n\tSkipping channel and replicate validation"; + #skip the channel/replicate validation here? + } + else { #validate channels and replicate names + + foreach my $src_biomat (@{$bioassc->getSourceBioMaterialMeasurements()}) { #Channel materials(X1)? + my $biomat = $src_biomat->getBioMaterial(); #LabelledExtract (IP/Control) + #we could sub this passing $echip and biomat? + #messy to pass regexs and populate correct echip hash attrs + #also messy to populate log + #keeping nested loop also prevents further obfuscation + #do we need to do all the defined checks, or maybe just the first one? + #Then we can skip all following warning? + + foreach my $treat (@{$biomat->getTreatments()}) { + #As there is effectively one more level of material extraction for the IP channel + #this loop will returns materials an iteration out of sync for each channel + + foreach my $ssrc_biomat (@{$treat->getSourceBioMaterialMeasurements()}) { #Channel measurement(x1) + my $sbiomat = $ssrc_biomat->getBioMaterial(); + #This will either be techrep name for control of IP name for experimental channel + #SOM0035_BR1_TR2 IP #Immunoprecicpitate + #SOM0035_BR1_TR2 #Extract + + if ($sbiomat->getName() =~ /BR[0-9]+_TR[0-9]+$/o) { #Total + + if (! defined $echips{$chip_uid}{'total_biotechrep'}) { + $echips{$chip_uid}{'total_biotechrep'} = $sbiomat->getName(); + } + else{ + push @log, "Found two TOTAL Channels on same chip with biotechreps:\t".$sbiomat->getName(). + " and ".$echips{$chip_uid}{'total_biotechrep'}; + } + }else{#Experimental + + #get feature type from assay + my $fv_ref = $assay->getBioAssayFactorValues(); + if(! defined $fv_ref){ + throw('No FactorValues found, you must populate the "Immunoprecipitate" field. Maybe you forgot to specify -feature_type?'); + } + + my ($feature_type); + + foreach my $fvalue(@{$fv_ref}){ + + if($fvalue->getValue()->getCategory() eq 'Immunoprecipitate'){ + $feature_type = $fvalue->getName(); + $feature_type =~ s/anti\s*-\s*//; + $feature_type =~ s/\s*antibody\s*//; + } + } + $echips{$chip_uid}{'feature_type'} = $feature_type; + } + + foreach my $ttreat (@{$sbiomat->getTreatments()}) { + + foreach my $tsrc_biomat (@{$ttreat->getSourceBioMaterialMeasurements()}) { + my $tbiomat = $tsrc_biomat->getBioMaterial(); + #SOM0035_BR1_TR2 #Extract (exp) + #SOM0035_BR1 #Sample (total) + + if ($tbiomat->getName() =~ /BR[0-9]+_TR[0-9]+$/o) { #experimental + + if (! defined $echips{$chip_uid}{'experimental_biotechrep'}) { + $echips{$chip_uid}{'experimental_biotechrep'} = $tbiomat->getName(); + } + else{ + push @log, "Found two EXPERIMENTAL Channels on same chip with biotechreps:\t".$tbiomat->getName(). + " and ".$echips{$chip_uid}{'experimental_biotechrep'}; + } + + my $dye = $biomat->getLabels()->[0]->getName(); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq 'EXPERIMENTAL') { + + if (uc($dye) ne uc($chan->dye())) { + push @log, "EXPERIMENTAL channel dye mismatch:\tMAGE = ".uc($dye).' vs DB '.uc($chan->dye); + } else { + $echips{$chip_uid}{'experimental_dye'} = uc($dye); + } + } + } + } + else { #control + + if (! defined $echips{$chip_uid}{'total_biorep'}) { + $echips{$chip_uid}{'total_biorep'} = $tbiomat->getName(); + } + else{ + push @log, "Found two TOTAL Channels on same chip with biotechreps:\t".$tbiomat->getName(). + " and ".$echips{$chip_uid}{'total_biorep'}; + } + + my $dye = $biomat->getLabels()->[0]->getName(); + + foreach my $chan (@{$echip->get_Channels()}) { + + if ($chan->type() eq 'TOTAL') { + + if (uc($dye) ne uc($chan->dye())) { + push @log, "TOTAL channel dye mismatch:\tMAGE = ".uc($dye).' vs DB '.uc($chan->dye); + } + else { + $echips{$chip_uid}{'total_dye'} = uc($dye); + } + } + } + } + #could do one more iteration and get Source and FeatureType? + #we should really extend this, and then update the EC cell_type and feature_types + #these features might not be biotmats tho...need to check + + + foreach my $ftreat (@{$tbiomat->getTreatments()}) { + + foreach my $fsrc_biomat (@{$ftreat->getSourceBioMaterialMeasurements()}) { + my $fbiomat = $fsrc_biomat->getBioMaterial(); + #EXPERIMENTAL - biorep + #TOTAL - source/cell type + my $cell_type; + + if($fbiomat->getName() =~ /BR[0-9]+$/o){#EXPERIMETNAL + + if(! defined $echips{$chip_uid}{'experimental_biorep'}){ + $echips{$chip_uid}{'experimental_biorep'} = $fbiomat->getName(); + } + else{ + push @log, "Found two Experimental Channels on same chip with bioreps:\t".$fbiomat->getName(). + " and ".$echips{$chip_uid}{'experimental_biorep'}; + } + + + #last treatment/measurement/biomat level should go here + #as TOTAL channel does not have another level and will fail + foreach my $xtreat (@{$fbiomat->getTreatments()}) { + + foreach my $xsrc_biomat (@{$xtreat->getSourceBioMaterialMeasurements()}) { + my $xbiomat = $xsrc_biomat->getBioMaterial(); + + foreach my $char(@{$xbiomat->getCharacteristics()}){ + $cell_type = $char->getValue() if($char->getCategory() eq 'CellType'); + } + } + } + + }else{#this should be BioSource + #which should have CellType as characteristic + #we could change tab2mage and have this as a factor value, + #but don't want to start messing with "standard" format + + foreach my $char(@{$fbiomat->getCharacteristics()}){ + $cell_type = $char->getValue() if($char->getCategory() eq 'CellType'); + } + } + + #can have cell_type validation here + if(! defined $echips{$chip_uid}{'cell_type'}){ + $echips{$chip_uid}{'cell_type'} = $cell_type; + } + elsif( $echips{$chip_uid}{'cell_type'} ne $cell_type){ + push @log, "Found Channels on same chip (${chip_uid}) with different cell types:\t". + $cell_type." and ".$echips{$chip_uid}{'cell_type'}; + } + } + } + } + } + } + } + } + } + } #end of echip + } #end of foreach echip + } #end of physbioassay + } #end of foreach assay + } #end of foreach exp + + + + #we should fail here with log before we update the result sets + + #we need to build rep names + #we're currently using sample labels, in the tab2mage file + #altho' previous sets have been using exp name + #these have been manually patched afterwards + + #More desirable to have exp name as rset name, but no way of doing BR validation + #based on sample label, if we don't have it in the tab2mage + #if we change it in the DB then we need to update the tab2mage + + #no way to do this when generating tab2mage as the user hasn't yet defined the reps + #we could just make reps based on sample labels + #then we just assume that alterations made by the user are correct + #as we can no longer validate using sample labels + #can still validate using cell/feature type + + #no longer need vendor specific validation as this will be done in tab2mage generation + + + #We need to validate reps here + #then update ec records as appropriate and then create rsets + + my (%bio_reps, %tech_reps); + my $ct_adaptor = $self->db->get_CellTypeAdaptor(); + my $ft_adaptor = $self->db->get_FeatureTypeAdaptor(); + +#select rs.*, ec.*, c.* from result_set rs, chip_channel cc, channel c, experimental_chip ec where rs.result_set_id=cc.result_set_id and cc.table_name='experimental_chip' and cc.table_id=ec.experimental_chip_id and cc.table_id=c.experimental_chip_id order by name; + + foreach my $echip (@{$rset->get_ExperimentalChips()}) { + + my ($biorep, $biotechrep); + + if (! exists $echips{$echip->unique_id()}) { + push @log, "No MAGE entry found for ExperimentalChip:\t".$echip->unique_id(); + } + else { + + foreach my $chan_type('total', 'experimental'){ + + $biorep = $echips{$echip->unique_id()}{$chan_type.'_biorep'}; + $biotechrep = $echips{$echip->unique_id()}{$chan_type.'_biotechrep'}; + + if (! defined $biotechrep) { + push @log, 'ExperimentalChip('.$echip->unique_id().') Extract field do not meet naming convention(SAMPLE_BRN_TRN)'; + } #! defined biorep? will never occur at present + elsif ($biotechrep !~ /\Q$biorep\E/) { + push @log, "Found Extract(techrep) vs Sample(biorep) naming mismatch\t${biotechrep}\tvs\t$biorep"; + } + + if ( ! $echips{$echip->unique_id()}{$chan_type.'_dye'}) { + push @log, "No ".uc($chan_type)." channel found for ExperimentalChip:\t".$echip->unique_id(); + } + + } + + #Is this is really implicit in the test above + if($echips{$echip->unique_id()}{'experimental_biorep'} ne $echips{$echip->unique_id()}{'total_biorep'}){ + push @log, "Found biorep mismatch between channels of ExperimentalChip ".$echip->unique_id().":\n". + "\tEXPERIMENTAL\t".$echips{$echip->unique_id()}{'experimental_biorep'}."\tTOTAL\t". + $echips{$echip->unique_id()}{'total_biorep'}; + } + + #Is this is really implicit in the test above + if($echips{$echip->unique_id()}{'experimental_biotechrep'} ne $echips{$echip->unique_id()}{'total_biotechrep'}){ + push @log, "Found biotechrep mismatch between channels of ExperimentalChip ".$echip->unique_id().":\n". + "\tEXPERIMENTAL\t".$echips{$echip->unique_id()}{'experimental_biotechrep'}."\tTOTAL\t". + $echips{$echip->unique_id()}{'total_biotechrep'}; + } + + + } + + + #Now we need to validate ec has same feature/cell type as other ecs in this br + #this does not handle import sets which ARE allowed to have same name but different types + + #warn "Processing ".$echip->unique_id()." $biorep $biotechrep"; + + + if(exists $bio_reps{$biorep}){ + + + if(! defined $bio_reps{$biorep}{'cell_type'}){ + push @log, "Found undefined CellType for biorep $biorep"; + } + elsif($bio_reps{$biorep}{'cell_type'}->name() ne $echips{$echip->unique_id()}{'cell_type'}){ + push @log, "Found CellType mismatch between $biorep and ExperimentalChip ".$echip->unique_id(); + } + + + if(! defined $bio_reps{$biorep}{'feature_type'}){ + push @log, "Found undefined FeatureType for biorep $biorep"; + } + elsif($bio_reps{$biorep}{'feature_type'}->name() ne $echips{$echip->unique_id()}{'feature_type'}){ + push @log, "Found FeatureType mismatch between $biorep and ExperimentalChip ".$echip->unique_id(); + } + + #warn "$biorep exists with\t".$bio_reps{$biorep}{'cell_type'}->name().' '.$bio_reps{$biorep}{'feature_type'}->name(); + + #We need to set the tech rep here too! + #Do we need to validate this also, as above. + #This would be overkill due to the inherant nature of the TR to BR relationship + + if(! exists $tech_reps{$biotechrep}){ + $tech_reps{$biotechrep}{'cell_type'} = $bio_reps{$biorep}{'cell_type'}; + $tech_reps{$biotechrep}{'feature_type'} = $bio_reps{$biorep}{'feature_type'}; + } + + + }else{ + + #warn "Creating new BR $biorep and TR $biotechrep"; + + if(defined $echips{$echip->unique_id()}{'cell_type'}){ + + my $cell_type = $ct_adaptor->fetch_by_name($echips{$echip->unique_id()}{'cell_type'}); + + if(! defined $cell_type){ + push @log, 'CellType '.$echips{$echip->unique_id()}{'cell_type'}.' does not exist in the database, please use the import_type.pl script'; + }else{ + $bio_reps{$biorep}{'cell_type'} = $cell_type; + $tech_reps{$biotechrep}{'cell_type'} = $cell_type; + # warn "Setting ".$echip->unique_id()." $biorep $biotechrep ".$cell_type->name; + } + }else{ + warn "No CellType specified for ExperimentalChip:\t".$echip->unique_id()."\n"; + } + + + if(defined $echips{$echip->unique_id()}{'feature_type'}){ + my $feature_type = $ft_adaptor->fetch_by_name($echips{$echip->unique_id()}{'feature_type'}); + + if(! defined $feature_type){ + push @log, 'FeatureType '.$echips{$echip->unique_id()}{'feature_type'}.' does not exist in the database, please use the import_type.pl script'; + } + else{ + $bio_reps{$biorep}{'feature_type'} = $feature_type; + $tech_reps{$biotechrep}{'feature_type'} = $feature_type; + + #warn "Setting ".$echip->unique_id()." $biorep $biotechrep ".$feature_type->name; + } + }else{ + warn "No FeatureType specified for ExperimentalChip:\t".$echip->unique_id()."\n"; + } + } + + push @{$tech_reps{$biotechrep}{'echips'}}, $echip->unique_id(); + push @{$bio_reps{$biorep}{'echips'}}, $echip->unique_id(); + } + + + + + if (@log) { + $self->log("MAGE VALIDATION REPORT\n::\t".join("\n::\t", @log)); + throw("MAGE VALIDATION FAILED\nPlease correct tab2mage file and try again:\t".$self->get_config('tab2mage_file')); + } else { + $self->log('MAGE VALDIATION SUCCEEDED'); + } + + + #we also need to build the tech rep results sets(not displayable) + #do we need to have result sets for each biorep too? + #update ExperimentalChip replicate info + my (%rsets); + my %types = ( + feature => {}, + cell => {}, + ); + + + + #This needs to update and split the import/top level sets so they are of same types + #update ec type here as we have ec context + #careful not to update multiple times, just once for each ec + + my $eca = $self->db->get_ExperimentalChipAdaptor(); + + + foreach my $echip (@{$rset->get_ExperimentalChips()}) { + my ($cell_type, $feature_type); + + #Set biorep info and rset + foreach my $biorep (keys %bio_reps){ + + foreach my $chip_uid(@{$bio_reps{$biorep}{'echips'}}){ + + if($chip_uid eq $echip->unique_id()){ + $echip->biological_replicate($biorep); + $cell_type = $bio_reps{$biorep}{'cell_type'}; + $feature_type = $bio_reps{$biorep}{'feature_type'}; + + if(! defined $rsets{$biorep}){ + + $rsets{$biorep} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $biorep,#this may not be unique, prepend with exp name? Force method to use Experiment_and_name? + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $feature_type, + -CELL_TYPE => $cell_type, + ); + + #record cell and feature types + $types{'feature'}{$feature_type->name()} = $feature_type; + $types{'cell'}{$cell_type->name()} = $cell_type; + $self->log("Created BioRep ResultSet:\t".$rsets{$biorep}->log_label); + } + + $rsets{$biorep}->add_table_id($echip->dbID(), $rset->get_chip_channel_id($echip->dbID())); + } + } + } + + #reset echip types + $echip->feature_type($feature_type); + $echip->cell_type($cell_type); + + + #set tech rep info and rset + foreach my $techrep(keys %tech_reps){ + + foreach my $chip_uid(@{$tech_reps{$techrep}{'echips'}}){ + + if($chip_uid eq $echip->unique_id()){ + $echip->technical_replicate($techrep); + + if(! defined $rsets{$techrep}){ + $rsets{$techrep} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $techrep,#this may not be unique, prepend with exp name? Force method to use Experiment_and_name? + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $tech_reps{$techrep}{'feature_type'}, + -CELL_TYPE => $tech_reps{$techrep}{'cell_type'}, + ); + + $self->log("Created TechRep ResultSet:\t".$rsets{$techrep}->log_label); + } + $rsets{$techrep}->add_table_id($echip->dbID(), $rset->get_chip_channel_id($echip->dbID())); + } + } + } + + $echip->adaptor->update_replicate_types($echip);#store rep info + } + + + ### Reset/Update/Clean import sets type fields + my $sql; + + if(scalar keys %{$types{'feature'}} >1){ + $self->log('Resetting IMPORT FeatureType to NULL for multi-FeatureType Experiment'); + $sql = "UPDATE result_set set feature_type_id='NULL' where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + + }else{ + my ($ftype) = values %{$types{'feature'}}; + + if(! defined $rset->feature_type()){ + $self->log('Updating IMPORT FeatureType to '.$ftype->name()); + $sql = "UPDATE result_set set feature_type_id=".$ftype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + elsif($rset->feature_type->dbID ne $ftype->dbID()){ + $self->log('WARNING: FeatureType mismatch. Updating IMPORT FeatureType('.$rset->feature_type->name().') to match meta('.$ftype->name.')'); + $sql = "UPDATE result_set set feature_type_id=".$ftype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + + } + } + + $self->db->dbc->do($sql) if $sql; + + undef $sql; + + if(scalar keys %{$types{'cell'}} >1){ + $self->log('Resetting IMPORT CellType to NULL for multi-CellType Experiment'); + my $sql = "UPDATE result_set set cell_type_id='NULL' where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + }else{ + my ($ctype) = values %{$types{'cell'}}; + + if(! defined $rset->cell_type()){ + $self->log('Updating IMPORT CellType to '.$ctype->name()); + $sql = "UPDATE result_set set cell_type_id=".$ctype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + elsif($rset->cell_type->dbID ne $ctype->dbID()){ + $self->log('WARNING: CellType mismatch. Updating IMPORT CellType('.$rset->cell_type->name().') to match meta('.$ctype->name.')'); + $sql = "UPDATE result_set set cell_type_id=".$ctype->dbID()." where result_set_id in (".$rset->dbID().', '.$chan_rset->dbID().')'; + } + } + + $self->db->dbc->do($sql) if $sql; + + ### Generate new top level sets here based on br type combos + #we risk duplicating sets here if import set is set to one cell/featuretype + #duplicate anyway, as import is really just for easy tracking of all chips during import + + my %toplevel_sets; + my $toplevel_cnt = 1; + #could tidy up toplevel_sets implmentation + + foreach my $new_rset(values %rsets){ + + my $ftype_name = (defined $new_rset->{'feature_type'}) ? $new_rset->{'feature_type'}->name() : undef; + my $ctype_name = (defined $new_rset->{'cell_type'}) ? $new_rset->{'cell_type'}->name() : undef; + + if(! exists $toplevel_sets{$ftype_name}){ + $toplevel_sets{$ftype_name} = {}; + $toplevel_sets{$ftype_name}{'feature_type'} = $new_rset->{'feature_type'}; + } + + + + if(! exists $toplevel_sets{$ftype_name}{$ctype_name}){ + $toplevel_sets{$ftype_name}{$ctype_name}{'cell_type'} = $new_rset->{'cell_type'}; + $toplevel_sets{$ftype_name}{$ctype_name}{'rsets'} = [$new_rset]; + }else{ + push @{$toplevel_sets{$ftype_name}{$ctype_name}{'rsets'}}, $new_rset; + } + } + + + + #build toplevel sets for each feature/cell type combo using constituent rsets + foreach my $ftype_name(keys %toplevel_sets){ + + foreach my $ctype_name(keys %{$toplevel_sets{$ftype_name}}){ + + next if $ctype_name eq 'feature_type';#skip feature type + + #we need to give these a different key so we're not overwriting in the rset hash + $rsets{$self->experiment->name().'_'.$toplevel_cnt} = Bio::EnsEMBL::Funcgen::ResultSet->new + ( + -NAME => $self->experiment->name(), + -ANALYSIS => $rset->analysis(), + -TABLE_NAME => 'experimental_chip', + -FEATURE_TYPE => $toplevel_sets{$ftype_name}{'feature_type'}, + -CELL_TYPE => $toplevel_sets{$ftype_name}{$ctype_name}{'cell_type'}, + ); + + $self->log("Created toplevel ResultSet for:\t". $rsets{$self->experiment->name().'_'.$toplevel_cnt}->log_label); + + #add consituent table ids + foreach my $new_rset(@{$toplevel_sets{$ftype_name}{$ctype_name}{'rsets'}}){ + + foreach my $ec_id(@{$new_rset->table_ids()}){ + + #Only add it if it has not already been added + if(! $rsets{$self->experiment->name().'_'.$toplevel_cnt}->get_chip_channel_id($ec_id)){ + $rsets{$self->experiment->name().'_'.$toplevel_cnt}->add_table_id($ec_id, $new_rset->get_chip_channel_id($ec_id)); + } + } + } + $toplevel_cnt++; + } + } + + #ResultSet update strategy + #To avoid messyness in resolving result_set differences + #Simply delete all that are not used as supporting sets + #and load new ones, log old supporting rsets for manual + #reassignment and rollback. + #If we have clash between an old set and a new set, rename old + #set and log + #We might not always have the previous data files. + #But we might want to maintain all the previous rsets and just add a new one + #At present this would require acquiring the previous Tab2Mage file + #and adding the new data to it. + #We could do with a way to merge data already in the DB with new meta data to form a new Tab2Mage file + #and validate that + + + my @previous_rep_sets; + my @supporting_rset_dsets; + + + #Get non-import Sets + map {push @previous_rep_sets, $_ if $_->name !~ /_IMPORT$/} + @{$rset_adaptor->fetch_all_by_Experiment_Analysis($self->experiment, $chip_anal)}; + + + #rollback_ResultSet if possible? + #This is just checking if they are supporting, not actually rolling them back + if(@previous_rep_sets){ + $self->log('Found previously stored ResultSets'); + + foreach my $prev_rset(@previous_rep_sets){ + #This should not rollback anything, just return skipped sets + #i.e. sets which have a product feature set + #It also used to delete the supporting set records which maybe important for redefining the DataSet below + my $rset_dset = $self->rollback_ResultSet($prev_rset); + push @supporting_rset_dsets, $rset_dset if @$rset_dset; + } + } + + #Note: If we remove chips from an experiment, they are only removed from the non-import sets + #To fully remove them, you need to use the rollback_experiment.pl script with -chip_ids + #can we log this in get_import_ResultSet? + + $self->log('Storing ResultSets'); + + #Store new tech, biol and toplevel type rsets + foreach my $new_rset(values %rsets){ + my $replace_txt; + + #Rename old set if we have a name/anal/type clash + foreach my $prs(@supporting_rset_dsets){ + + my ($pset, $dset) = @$prs; + + if($pset->log_label eq $new_rset->log_label){ + my $new_name = "OLD_".$rset->log_label; + $self->log("Found update supporting ResultSet clash, renaming to:\t${new_name}"); + $self->unlink_ResultSet_DataSet($rset, $dset, $new_name); + + #This pset dbID has already been removed + #Will get updated with new rset dbID when updating DataSet + $replace_txt = 'Proposed ResultSet(dbID) replacement for DataSet('.$dset->name."):\t".$pset->dbID.' > '; + } + } + + + $new_rset->add_status('DAS_DISPLAYABLE'); + my ($new_rset) = @{$rset_adaptor->store($new_rset)}; + + if(defined $replace_txt){ + $self->log($replace_txt.$new_rset->dbID); + } + } + + my $xml_file = open_file($self->get_config('mage_xml_file')); + + #slurp in changing separator to null so we get it all in one string. + $self->experiment->mage_xml(do{ local ($/); <$xml_file>}); + close($xml_file); + + $self->experiment($self->db->get_ExperimentAdaptor->update_mage_xml_by_Experiment($self->experiment())); + } + } + + return; +} + + +1;