# Source code for datacatalog.linkedstores.basestore.diff

import base64
import json
from copy import copy
from jsondiff import diff
from pprint import pprint
from ...settings import MONGO_DELETE_FIELD
from datacatalog import linkages, settings
from datacatalog.extensible import ExtensibleAttrDict
from datacatalog.jsonschemas import DateTimeEncoder
from datacatalog.utils import time_stamp, current_time, msec_precision

# Names of the update-log actions a diff record can be tagged with
CREATE = 'create'
DELETE = 'delete'
REPLACE = 'replace'
UPDATE = 'update'
# Every action name that get_diff() accepts; anything else raises ValueError
ACTIONS = (CREATE, DELETE, REPLACE, UPDATE)
# Action used by get_diff() when the caller does not pass one
DEFAULT_ACTION = UPDATE

class DocumentDiff(ExtensibleAttrDict):
    """Record of a JSON diff computed for a single document.

    Stores the serialized delta together with the document UUID, its
    ``_admin`` details, the update action and a creation timestamp.
    """

    def __init__(self, delta, uuid, admin, action):
        """Store the diff payload and stamp it with the current time.

        Args:
            delta (str): JSON text describing the differences
            uuid: Identifier of the document the diff belongs to
            admin: Value emitted under the ``_admin`` key of the record
            action (str): Update action the diff represents
        """
        self.delta = delta
        self.uuid = uuid
        self.admin = admin
        self.action = action
        self.timestamp = msec_precision(current_time())

    def __delta_dict(self):
        """Re-serialize the delta as JSON text with sorted keys."""
        parsed = json.loads(self.delta)
        return json.dumps(parsed, sort_keys=True, indent=0,
                          separators=(',', ':'))

    def __doc(self, encoded=True):
        """Build the record dict, optionally base64-encoding the diff."""
        diff_value = self.__delta_dict()
        if encoded:
            # URL-safe base64 of the UTF-8 JSON text; the result is bytes
            diff_value = base64.urlsafe_b64encode(diff_value.encode('utf-8'))
        return {
            'uuid': self.uuid,
            'date': self.timestamp,
            'diff': diff_value,
            'action': self.action,
            '_admin': self.admin,
        }

    def document(self, encoded=True):
        """Render this diff as a dict-based record.

        Args:
            encoded (bool): Base64-encode the ``diff`` field when true

        Returns:
            dict: Record with ``uuid``, ``date``, ``diff``, ``action`` and
            ``_admin`` keys
        """
        return self.__doc(encoded)

    def json(self, encoded=True):
        """Serialize the record to compact JSON text with sorted keys.

        NOTE(review): with ``encoded=True`` the ``diff`` value is a bytes
        object; confirm that DateTimeEncoder is able to serialize it.
        """
        record = self.document(encoded)
        return json.dumps(record, sort_keys=True, separators=(',', ':'),
                          cls=DateTimeEncoder)

    def __repr__(self):
        # Show the readable (non-base64) form of the record
        return self.json(encoded=False)

    @property
    def updated(self):
        """bool: True when the parsed delta is anything but an empty dict."""
        return json.loads(self.delta) != {}
def diff_list(list1, list2):
    """Return the elements that differ between two lists, position by position.

    An element counts as unchanged only when the other list holds an equal
    representation at the same index. Reordering therefore shows up as a
    difference, and so does a change of type such as ``1`` versus ``"1"``.

    Args:
        list1 (list): Original sequence
        list2 (list): Updated sequence

    Returns:
        list: ``str()`` of every element of ``list1`` that has no match in
        ``list2``, followed by ``str()`` of every element of ``list2`` that
        has no match in ``list1``. Each group keeps its original order.
    """
    def _index(items):
        # Pair each element with an (index, repr) key. Keeping the index as
        # a separate tuple member avoids the ambiguity of concatenating
        # strings (index 1 + "23" vs. index 12 + "3"). repr() keeps the key
        # hashable for dict/list elements and tells 1 apart from "1".
        return [((position, repr(element)), element)
                for position, element in enumerate(items)]

    entries1 = _index(list1)
    entries2 = _index(list2)
    keys1 = {key for key, _ in entries1}
    keys2 = {key for key, _ in entries2}

    # Elements of list1 missing from list2 first, then the reverse
    changed = [str(element) for key, element in entries1 if key not in keys2]
    changed.extend(str(element) for key, element in entries2
                   if key not in keys1)
    return changed
# Workaround for https://github.com/xlwings/jsondiff/issues/18
# jsondiff evaluates lists recursively, which can lead to "maximum recursion
# depth exceeded in comparison" errors on long lists.
# A potential work-around: scan through the two documents, diffing "long"
# lists as we go, remove them from the JSON documents, perform the regular
# jsondiff comparison, and merge the diffs back into the result.
# "Long" lists here are lists with over 100 elements.
def _classify_container_keys(doc, long_length, long_list_keys, list_keys,
                             dict_keys):
    """Sort the keys of ``doc`` into long-list, short-list and dict buckets."""
    for key, value in doc.items():
        if isinstance(value, list):
            if len(value) > long_length:
                long_list_keys.add(key)
            else:
                list_keys.add(key)
        elif isinstance(value, dict):
            dict_keys.add(key)


def diff_remove_long_lists(doc1, doc2):
    """Diff and strip "long" lists from two documents before running jsondiff.

    Lists with more than 100 elements that are present under the same key in
    both documents are compared with ``diff_list`` and then deleted from both
    documents. Child dictionaries, and dictionaries held at matching positions
    in shorter lists, are processed recursively.

    Both documents are modified in place.

    Args:
        doc1 (dict): Source document
        doc2 (dict): Target document

    Returns:
        tuple: ``(doc1, doc2, diff_dict)`` where ``diff_dict`` maps keys to
        the long-list differences found. Nested results are stored under the
        child key for dictionaries, and under ``"<key>_<index>"`` for
        dictionaries found inside short lists.
    """
    CANDIDATE_LIST_LENGTH = 100

    # return results here
    diff_dict = {}

    # long lists to diff directly; short lists and dicts are recursed into
    long_list_keys = set()
    list_keys = set()
    dict_keys = set()
    for doc in (doc1, doc2):
        _classify_container_keys(doc, CANDIDATE_LIST_LENGTH, long_list_keys,
                                 list_keys, dict_keys)

    # diff the long lists
    for key in long_list_keys:
        if key not in doc1 or key not in doc2:
            continue
        list1 = doc1[key]
        list2 = doc2[key]
        if not (isinstance(list1, list) and isinstance(list2, list)):
            # The key is a long list in one document only. An element-wise
            # diff is meaningless here (and raises for None or numbers), so
            # leave both values in place for the regular jsondiff pass.
            continue
        list_diff = diff_list(list1, list2)
        del doc1[key]
        del doc2[key]
        if list_diff:
            diff_dict[key] = list_diff

    # recurse on child dictionaries present in both documents
    for key in dict_keys:
        if key not in doc1 or key not in doc2:
            continue
        child1 = doc1[key]
        child2 = doc2[key]
        if isinstance(child1, dict) and isinstance(child2, dict):
            child1, child2, child_diff = diff_remove_long_lists(child1, child2)
            # update children in case they are modified
            doc1[key] = child1
            doc2[key] = child2
            # merge into parent result
            if child_diff:
                diff_dict[key] = child_diff

    # recurse on dictionaries held inside short lists
    for key in list_keys:
        if key not in doc1 or key not in doc2:
            continue
        items1 = doc1[key]
        items2 = doc2[key]

        # type check - when one side is not a list, record that value
        if not isinstance(items1, list):
            diff_dict[key] = items1
            continue
        if not isinstance(items2, list):
            diff_dict[key] = items2
            continue

        # only positions that exist in both lists are compared
        for index, (child1, child2) in enumerate(zip(items1, items2)):
            # primitives are left for the regular jsondiff pass
            if isinstance(child1, dict) and isinstance(child2, dict):
                child1, child2, child_diff = diff_remove_long_lists(child1,
                                                                    child2)
                # update list children in case they are modified
                items1[index] = child1
                items2[index] = child2
                # merge into parent result, tracked using an index key
                if child_diff:
                    diff_dict[key + "_" + str(index)] = child_diff

        # update child lists in case they are modified
        doc1[key] = items1
        doc2[key] = items2

    return (doc1, doc2, diff_dict)
def _comparable_document(doc):
    """Return a JSON-round-tripped copy of ``doc`` without bookkeeping fields."""
    trimmed = copy(doc)
    # strip linkages
    for field in linkages.ALL:
        if field in trimmed:
            del trimmed[field]
    # strip identifiers
    for field in ('uuid', '_id'):
        if field in trimmed:
            del trimmed[field]
    # Filter _private keys save for the system-wide soft-delete field
    for key in list(trimmed.keys()):
        if key.startswith('_') and key != MONGO_DELETE_FIELD:
            del trimmed[key]
    # Round-trip through JSON so both documents hold plain JSON types
    return json.loads(json.dumps(trimmed, cls=DateTimeEncoder))


def get_diff(source=None, target=None, action=DEFAULT_ACTION):
    """Determine the differences between two documents

    Generates a document for the `updates` store that describes the diff
    between source and target documents. The resulting document includes the
    document UUID, a timestamp, the document's tenancy details, and the
    JSON-diff encoded in URL-safe base64. The encoding is necessary because
    JSON diff and patch formats include keys beginning with `$`, which are
    prohibited in MongoDB documents.

    Linkage fields, the ``uuid`` and ``_id`` identifiers, and keys starting
    with an underscore (except the soft-delete field) are excluded from the
    comparison. The caller's documents are not modified.

    Args:
        source (dict): Source document; treated as empty when omitted
        target (dict): Target document; treated as empty when omitted
        action (str): Type of update action to represent

    Returns:
        DocumentDiff: Record holding the JSON delta, document UUID,
        ``_admin`` details and action

    Raises:
        ValueError: ``action`` is not one of ``ACTIONS``
        KeyError: Neither document provides a ``uuid``
    """
    # None defaults avoid sharing one mutable dict between calls
    source = {} if source is None else source
    target = {} if target is None else target

    if action not in ACTIONS:
        raise ValueError('{} is not a valid update log action'.format(action))

    doc_uuid = source.get('uuid', target.get('uuid', None))
    if doc_uuid is None:
        raise KeyError('No "uuid" in source or target')
    doc_admin = source.get('_admin', target.get('_admin', {}))

    safe_source = _comparable_document(source)
    safe_target = _comparable_document(target)

    # doc1 and doc2 have had their long lists removed; diffs are in diff_dict
    (doc1, doc2, diff_dict) = diff_remove_long_lists(safe_source, safe_target)

    delta = diff(doc1, doc2, syntax='explicit', dump=True)

    # delta is a string - inline any long list diffs if they exist
    if diff_dict:
        delta_json = json.loads(delta)
        delta_json.update(diff_dict)
        delta = json.dumps(delta_json)

    return DocumentDiff(delta, doc_uuid, doc_admin, action)