# Source code for datacatalog.managers.sampleset.processor

import bacanora
import json
import importlib
import inspect
import itertools
import os
import sys
from pprint import pprint

from datacatalog.agavehelpers import from_agave_uri

from ...identifiers.typeduuid import get_uuidtype
from ...utils import dynamic_import
from ..common import Manager
from ...linkedstores.basestore.exceptions import CatalogError
from ... import jsonschemas

class UnknownReference(CatalogError):
    """Raised when a document referenced by a sample set cannot be found.

    ``SampleSetProcessor.get`` raises this when the store lookup for the
    requested identifier returns ``None``.
    """

class SampleSetProcessorError(CatalogError):
    """Raised when loading or processing a sample set document fails.

    The ``process*`` methods of ``SampleSetProcessor`` catch any exception
    raised while writing records and re-raise it wrapped in this type, with
    the original exception passed as the second argument.
    """

class SampleSetProcessor(Manager):
    """Manager class to process and load sample set JSON documents

    A sample set document is a JSON object holding a list of ``samples``;
    each sample may hold ``measurements`` and each measurement may hold
    ``files``. ``setup()`` loads the document and resolves its references,
    then ``process()`` walks the hierarchy and writes one record per level
    to the corresponding store, linking each record to its parent through
    ``child_of``. ``setup()`` returns the processor, so the two calls can be
    chained.
    """

    def __init__(self,
                 mongodb,
                 agave=None,
                 samples_file=None,
                 samples_uri=None,
                 path_prefix='/uploads', *args, **kwargs):
        """Create a processor; no document is loaded until ``setup()``.

        Args:
            mongodb: Passed through to ``Manager.__init__``.
            agave (optional): Passed through to ``Manager.__init__``.
            samples_file (str, optional): Path to a local sample set JSON file.
            samples_uri (str, optional): URI of the sample set file, in the
                form understood by ``from_agave_uri``.
            path_prefix (str, optional): Prefix that ``contextualize()`` joins
                onto file names found in the document.
        """
        Manager.__init__(self, mongodb, agave=agave, *args, **kwargs)
        self.prefix = path_prefix
        self.stats = {'samples': {'count': 0, 'elapsed': 0.0},
                      'measurements': {'count': 0, 'elapsed': 0.0},
                      'files': {'count': 0, 'elapsed': 0.0}}
        self.samples_file = samples_file
        self.samples_uri = samples_uri
        # setup() is deliberately not invoked here; callers run it explicitly.

    def setup(self, samples_file=None, samples_uri=None):
        """Load the sample set document and resolve its references.

        Values given to the constructor take precedence; the arguments here
        are used only when the corresponding constructor value is ``None``.

        When a URI is known it is indexed in the ``file`` store and the
        resulting UUID is kept as ``samples_file_uuid``. When no local file
        is known, the URI is downloaded and the downloaded file is loaded.

        Args:
            samples_file (str, optional): Path to a local sample set JSON file.
            samples_uri (str, optional): URI of the sample set file.

        Returns:
            SampleSetProcessor: This instance, ready for ``process()``.

        Raises:
            SampleSetProcessorError: Neither a file nor a URI is available.
            UnknownReference: The document's challenge problem or experiment
                design is not present in the catalog.
        """
        self.logger.debug('Initializing SampleSetProcessor')

        # __init__ always sets these attributes (possibly to None), so only
        # let them override the arguments when they carry a real value.
        configured_file = getattr(self, 'samples_file', None)
        if configured_file is not None:
            samples_file = configured_file
        configured_uri = getattr(self, 'samples_uri', None)
        if configured_uri is not None:
            samples_uri = configured_uri

        if samples_file is None and samples_uri is None:
            raise SampleSetProcessorError(
                'Either samples_file or samples_uri must be provided')

        abs_file_path = None
        file_name = None
        samples_file_uuid = None
        system_id = None

        # Index the URI and get its UUID
        if samples_uri is not None:
            system_id, file_path, file_name = from_agave_uri(samples_uri)
            abs_file_path = os.path.join(file_path, file_name)
            resp = self.stores['file'].index(abs_file_path,
                                             storage_system=system_id)
            samples_file_uuid = resp.get('uuid', None)

        # No samples file was provided, which means we need to download the
        # URI. The guard above guarantees a URI was parsed in this case.
        if samples_file is None:
            bacanora.download(self.client, abs_file_path,
                              system_id=system_id)
            samples_file = file_name
        self.samples_file_uuid = samples_file_uuid

        # The file is now expected to be readable locally
        with open(samples_file, 'r') as handle:
            document = json.load(handle)
        # sys.getsizeof is shallow; this is the container size only
        self.logger.debug('Document.size: {} bytes'.format(sys.getsizeof(document)))

        # Challenge Problem
        doc_cp = document.get('challenge_problem', 'UNKNOWN')
        self.challenge_problem = self.get('challenge_problem', 'id', doc_cp)
        self.logger.debug('Challenge_problem: {}'.format(self.challenge_problem))

        # Experiment ID
        self.experiment_id = document.get('experiment_id', 'UNKNOWN')
        self.logger.debug('Experiment_id: {}'.format(self.experiment_id))

        # Experiment Design
        doc_exd = document.get('experiment_reference_url', 'UNKNOWN')
        self.experiment_design = self.get('experiment_design', 'uri', doc_exd)
        self.logger.debug('experiment_design: {}'.format(self.experiment_design))

        # Samples
        self._samples = document.get('samples', [])
        self.logger.debug('count.samples: {}'.format(len(self._samples)))

        self.logger.debug('ready ({})'.format(samples_file))
        return self

    def get(self, doctype, identifier, identifier_value):
        """Fetch one document from a store by a named identifier.

        Args:
            doctype (str): Key of the store in ``self.stores``.
            identifier (str): Name of the identifying field.
            identifier_value: Value the field must have.

        Returns:
            The response of the store's ``find_one_by_id`` call.

        Raises:
            UnknownReference: The store returned ``None``.
        """
        query = {identifier: identifier_value}
        resp = self.stores[doctype].find_one_by_id(**query)
        if resp is None:
            raise UnknownReference('Unable to get {}.{}={}'.format(doctype, identifier, identifier_value))
        return resp

    def _update_param(self, strategy):
        """Shim in case we need to validate or add new strategy to LinkedStore"""
        return strategy

    def _link_to_parent(self, record, parent_uuid):
        """Stamp provenance and parentage onto a record, in place.

        ``derived_from`` is set to the sample set file's UUID when one is
        known; ``parent_uuid`` is appended to (or starts) ``child_of``.
        """
        source_uuid = getattr(self, 'samples_file_uuid', None)
        if source_uuid is not None:
            record['derived_from'] = [source_uuid]
        if 'child_of' in record:
            record['child_of'].append(parent_uuid)
        else:
            record['child_of'] = [parent_uuid]
        return record

    @staticmethod
    def _require_uuid_type(uuid, expected):
        """Raise ``ValueError`` unless ``uuid`` is typed as ``expected``.

        An explicit raise is used instead of ``assert`` so the check is not
        removed when Python runs with optimizations enabled.
        """
        if get_uuidtype(uuid) != expected:
            raise ValueError('{} is mistyped'.format(uuid))

    def process_experiment(self, parent_uuid=None, strategy='merge'):
        """Write the experiment record, then process all samples beneath it.

        Args:
            parent_uuid (str, optional): UUID recorded in the experiment's
                ``child_of``.
            strategy (str, optional): Update strategy applied to samples and
                their descendants. The experiment record itself is always
                written with ``'replace'``.

        Raises:
            SampleSetProcessorError: Any failure while writing records.
        """
        try:
            # For now, this is a dummy experimental record
            expt_doc = {
                'experiment_id': self.experiment_id,
                'child_of': [parent_uuid]
            }
            source_uuid = getattr(self, 'samples_file_uuid', None)
            if source_uuid is not None:
                expt_doc['derived_from'] = [source_uuid]
            # For now, ALWAYS replace lab-specific experiment record
            resp = self.stores['experiment'].add_update_document(
                expt_doc, strategy=self._update_param('replace'))
            experiment_uuid = resp['uuid']
            self._require_uuid_type(experiment_uuid, 'experiment')
            self.process_samples(parent_uuid=experiment_uuid,
                                 strategy=self._update_param(strategy))
        except Exception as exc:
            raise SampleSetProcessorError('Failed to process experiment', exc) from exc

    def process_samples(self, parent_uuid=None, strategy='merge'):
        """Write every cached sample and process its measurements.

        Reads the list cached as ``self._samples`` by ``setup()``. Each
        sample's ``measurements`` sub-document is removed before the sample
        is stored and handed to ``process_measurements`` via
        ``self._measurements``.

        Args:
            parent_uuid (str, optional): UUID added to each sample's
                ``child_of``.
            strategy (str, optional): Update strategy passed to the stores.

        Raises:
            SampleSetProcessorError: Any failure while writing records.
        """
        try:
            if not isinstance(self._samples, list):
                raise TypeError('"samples" must be a list')
            for sample in self._samples:
                self.logger.debug('processing.sample: {}'.format(sample['sample_id']))
                self._link_to_parent(sample, parent_uuid)
                # Don't propagate measurements subdocument in sample record.
                # That's what the linkages are for!
                measurements = sample.pop('measurements', None) \
                    if 'measurements' in sample else None
                if 'measurements' in sample or measurements is not None:
                    self.logger.debug('count.measurements: {}'.format(len(measurements)))
                self._measurements = measurements
                resp = self.stores['sample'].add_update_document(
                    sample, strategy=self._update_param(strategy))
                sample_uuid = resp['uuid']
                self._require_uuid_type(sample_uuid, 'sample')
                if self._measurements is not None:
                    self.process_measurements(
                        sample_uuid, strategy=self._update_param(strategy))
        except Exception as exc:
            raise SampleSetProcessorError('Failed to process sample(s)', exc) from exc

    def process_measurements(self, parent_uuid=None, strategy='merge'):
        """Write every measurement of the current sample and process its files.

        Reads the list placed in ``self._measurements`` by
        ``process_samples``. Each measurement's ``files`` sub-document is
        removed before the measurement is stored and handed to
        ``process_files`` via ``self._files``.

        Args:
            parent_uuid (str, optional): UUID added to each measurement's
                ``child_of``.
            strategy (str, optional): Update strategy passed to the stores.

        Raises:
            SampleSetProcessorError: Any failure while writing records.
        """
        try:
            if not isinstance(self._measurements, list):
                raise TypeError('"measurements" must be a list')
            for meas in self._measurements:
                self.logger.debug('processing.measurement: {}'.format(meas['measurement_id']))
                self._link_to_parent(meas, parent_uuid)
                if 'files' in meas:
                    files = meas.pop('files')
                    self.logger.debug('count.files: {}'.format(len(files)))
                else:
                    files = None
                self._files = files
                resp = self.stores['measurement'].add_update_document(
                    meas, strategy=self._update_param(strategy))
                measurement_uuid = resp['uuid']
                self._require_uuid_type(measurement_uuid, 'measurement')
                if self._files is not None:
                    self.process_files(
                        measurement_uuid, strategy=self._update_param(strategy))
        except Exception as exc:
            raise SampleSetProcessorError('Failed to process measurement(s)', exc) from exc

    def process_files(self, parent_uuid=None, strategy='merge'):
        """Write every file record of the current measurement.

        Reads the list placed in ``self._files`` by ``process_measurements``
        and rewrites each file's ``name`` with ``contextualize()`` before
        storing it.

        Args:
            parent_uuid (str, optional): UUID added to each file's
                ``child_of``.
            strategy (str, optional): Update strategy passed to the store.

        Raises:
            SampleSetProcessorError: Any failure while writing records.
        """
        try:
            if not isinstance(self._files, list):
                raise TypeError('"files" must be a list')
            for ffile in self._files:
                self.logger.debug('processing.file: {}'.format(ffile['file_id']))
                ffile['name'] = self.contextualize(ffile['name'])
                self._link_to_parent(ffile, parent_uuid)
                self.stores['file'].add_update_document(
                    ffile, strategy=self._update_param(strategy))
        except Exception as exc:
            raise SampleSetProcessorError('Failed to process file(s)', exc) from exc

    def contextualize(self, filename):
        """Return ``filename`` re-rooted under the configured path prefix.

        All leading slashes are removed first; ``os.path.join`` would
        otherwise treat the name as absolute and drop the prefix.
        """
        return os.path.join(self.prefix, filename.lstrip('/'))

    def process(self, strategy='merge'):
        """Recursively loads contents of a sample set into the catalog

        ``setup()`` must have been called first so that the experiment
        design and samples are available.

        Args:
            strategy (str, optional): Update strategy for sample, measurement
                and file records. Default is to merge.

        Returns:
            bool: Returns `True` on success

        Raises:
            SampleSetProcessorError: Any failure during processing.
        """
        # NOTE: an earlier revision carried a disabled workaround here that
        # raised the interpreter recursion limit to avoid a RecursionError.
        try:
            expt_design_uuid = self.experiment_design.get('uuid')
            self.process_experiment(parent_uuid=expt_design_uuid, strategy=strategy)
            return True
        except Exception as exc:
            raise SampleSetProcessorError('Failed to process file', exc) from exc