From 2fda1556ef7714d562317049f46db6ff59db5c2f Mon Sep 17 00:00:00 2001 From: "Kostis Anagnostopoulos @ STUW025" Date: Wed, 24 Sep 2014 03:20:07 +0200 Subject: [PATCH 1/5] issue #158: TRY to speed-up scope & $ref url-handling by keeping fragments separated from URL (and avoid redunant frag/defrag). Conflicts: jsonschema/tests/test_benchmarks.py issue #158: Use try-finally to ensure resolver scopes_stack empty when iteration breaks (no detectable performance penalty). * Replace non-python-2.6 DefragResult with named-tuple. * Add test-case checking scopes_stack empty. Conflicts: jsonschema/tests/test_validators.py jsonschema/validators.py --- jsonschema/compat.py | 10 +++++-- jsonschema/validators.py | 58 +++++++++++++++++++++++++--------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/jsonschema/compat.py b/jsonschema/compat.py index 6ca49ab6b..0afd9eaaa 100644 --- a/jsonschema/compat.py +++ b/jsonschema/compat.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals -import sys + +from collections import namedtuple import operator +import sys + try: from collections import MutableMapping, Sequence # noqa @@ -40,6 +43,9 @@ def urlsplit(url): return SplitResult(scheme, netloc, path, query, fragment) +DefragResult = namedtuple('DefragResult', 'url fragment') + + def urldefrag(url): if "#" in url: s, n, p, q, frag = urlsplit(url) @@ -47,7 +53,7 @@ def urldefrag(url): else: defrag = url frag = '' - return defrag, frag + return DefragResult(defrag, frag) # flake8: noqa diff --git a/jsonschema/validators.py b/jsonschema/validators.py index c347bf145..8f480627a 100644 --- a/jsonschema/validators.py +++ b/jsonschema/validators.py @@ -11,7 +11,8 @@ from jsonschema import _utils, _validators from jsonschema.compat import ( - Sequence, urljoin, urlsplit, urldefrag, unquote, urlopen, + Sequence, urljoin, urlsplit, urldefrag, unquote, urlopen, DefragResult, + str_types, int_types, iteritems, ) from jsonschema.exceptions import ErrorTree # Backwards 
compatibility # noqa @@ -79,7 +80,10 @@ def iter_errors(self, instance, _schema=None): if _schema is None: _schema = self.schema - with self.resolver.in_scope(_schema.get(u"id", u"")): + scope = _schema.get(u"id") + if scope: + self.resolver.push_scope(scope) + try: ref = _schema.get(u"$ref") if ref is not None: validators = [(u"$ref", ref)] @@ -103,6 +107,9 @@ def iter_errors(self, instance, _schema=None): if k != u"$ref": error.schema_path.appendleft(k) yield error + finally: + if scope: + self.resolver.pop_scope() def descend(self, instance, schema, path=None, schema_path=None): for error in self.iter_errors(instance, schema): @@ -222,7 +229,7 @@ class RefResolver(object): :argument str base_uri: URI of the referring document :argument referrer: the actual referring document - :argument dict store: a mapping from URIs to documents to cache + :argument dict store: a mapping from URIs (without fragments!) to documents to cache :argument bool cache_remote: whether remote refs should be cached after first resolution :argument dict handlers: a mapping from URI schemes to functions that @@ -233,6 +240,7 @@ class RefResolver(object): def __init__( self, base_uri, referrer, store=(), cache_remote=True, handlers=(), ): + base_uri = urldefrag(base_uri) self.base_uri = base_uri self.resolution_scope = base_uri # This attribute is not used, it is for backwards compatibility @@ -240,12 +248,13 @@ def __init__( self.cache_remote = cache_remote self.handlers = dict(handlers) + self.scopes_stack = [] self.store = _utils.URIDict( (id, validator.META_SCHEMA) for id, validator in iteritems(meta_schemas) ) self.store.update(store) - self.store[base_uri] = referrer + self.store[base_uri.url] = referrer @classmethod def from_schema(cls, schema, *args, **kwargs): @@ -259,14 +268,19 @@ def from_schema(cls, schema, *args, **kwargs): return cls(schema.get(u"id", u""), schema, *args, **kwargs) - @contextlib.contextmanager - def in_scope(self, scope): + def push_scope(self, scope, 
is_defragged=False): old_scope = self.resolution_scope - self.resolution_scope = urljoin(old_scope, scope) - try: - yield - finally: - self.resolution_scope = old_scope + self.scopes_stack.append(old_scope) + if not is_defragged: + scope = urldefrag(scope) + self.resolution_scope = DefragResult( + urljoin(old_scope.url, scope.url, allow_fragments=False) + if scope.url else old_scope.url, + scope.fragment + ) + + def pop_scope(self): + self.resolution_scope = self.scopes_stack.pop() @contextlib.contextmanager def resolving(self, ref): @@ -278,24 +292,26 @@ def resolving(self, ref): """ - full_uri = urljoin(self.resolution_scope, ref) - uri, fragment = urldefrag(full_uri) - if not uri: - uri = self.base_uri + ref = urldefrag(ref) - if uri in self.store: - document = self.store[uri] - else: + url = urljoin(self.resolution_scope.url, ref.url, allow_fragments=False) \ + if ref.url else self.resolution_scope.url + + try: + document = self.store[url] + except KeyError: try: - document = self.resolve_remote(uri) + document = self.resolve_remote(url) except Exception as exc: raise RefResolutionError(exc) + uri = DefragResult(url, ref.fragment) old_base_uri, self.base_uri = self.base_uri, uri + self.push_scope(uri, is_defragged=True) try: - with self.in_scope(uri): - yield self.resolve_fragment(document, fragment) + yield self.resolve_fragment(document, ref.fragment) finally: + self.pop_scope() self.base_uri = old_base_uri def resolve_fragment(self, document, fragment): From 22701dc6526433201d2781b77566a6dba42768a8 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 27 Feb 2015 19:53:54 -0500 Subject: [PATCH 2/5] Fix test failures --- jsonschema/tests/test_validators.py | 4 ++-- jsonschema/validators.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/jsonschema/tests/test_validators.py b/jsonschema/tests/test_validators.py index 2b14372ab..b3512eddc 100644 --- a/jsonschema/tests/test_validators.py +++ 
b/jsonschema/tests/test_validators.py @@ -815,7 +815,7 @@ def test_it_retrieves_unstored_refs_via_urlopen(self): def test_it_can_construct_a_base_uri_from_a_schema(self): schema = {"id" : "foo"} resolver = RefResolver.from_schema(schema) - self.assertEqual(resolver.base_uri, "foo") + self.assertEqual(resolver.base_uri.url, "foo") with resolver.resolving("") as resolved: self.assertEqual(resolved, schema) with resolver.resolving("#") as resolved: @@ -828,7 +828,7 @@ def test_it_can_construct_a_base_uri_from_a_schema(self): def test_it_can_construct_a_base_uri_from_a_schema_without_id(self): schema = {} resolver = RefResolver.from_schema(schema) - self.assertEqual(resolver.base_uri, "") + self.assertEqual(resolver.base_uri.url, "") with resolver.resolving("") as resolved: self.assertEqual(resolved, schema) with resolver.resolving("#") as resolved: diff --git a/jsonschema/validators.py b/jsonschema/validators.py index 8f480627a..d0431f347 100644 --- a/jsonschema/validators.py +++ b/jsonschema/validators.py @@ -229,7 +229,7 @@ class RefResolver(object): :argument str base_uri: URI of the referring document :argument referrer: the actual referring document - :argument dict store: a mapping from URIs (without fragments!) 
to documents to cache + :argument dict store: a mapping from URIs to documents to cache :argument bool cache_remote: whether remote refs should be cached after first resolution :argument dict handlers: a mapping from URI schemes to functions that @@ -275,7 +275,7 @@ def push_scope(self, scope, is_defragged=False): scope = urldefrag(scope) self.resolution_scope = DefragResult( urljoin(old_scope.url, scope.url, allow_fragments=False) - if scope.url else old_scope.url, + if scope.url else old_scope.url, scope.fragment ) @@ -294,8 +294,13 @@ def resolving(self, ref): ref = urldefrag(ref) - url = urljoin(self.resolution_scope.url, ref.url, allow_fragments=False) \ - if ref.url else self.resolution_scope.url + if ref.url: + url = urljoin( + self.resolution_scope.url, + ref.url, + allow_fragments=False) + else: + url = self.resolution_scope.url try: document = self.store[url] From 551e9a783f0da29cf107a0be6fc8f4d67a500400 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 27 Feb 2015 21:15:08 -0500 Subject: [PATCH 3/5] Remove unused base_uri --- jsonschema/tests/test_validators.py | 4 ++-- jsonschema/validators.py | 35 ++++++++++------------------- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/jsonschema/tests/test_validators.py b/jsonschema/tests/test_validators.py index b3512eddc..d966ab821 100644 --- a/jsonschema/tests/test_validators.py +++ b/jsonschema/tests/test_validators.py @@ -815,7 +815,7 @@ def test_it_retrieves_unstored_refs_via_urlopen(self): def test_it_can_construct_a_base_uri_from_a_schema(self): schema = {"id" : "foo"} resolver = RefResolver.from_schema(schema) - self.assertEqual(resolver.base_uri.url, "foo") + self.assertEqual(resolver.resolution_scope.url, "foo") with resolver.resolving("") as resolved: self.assertEqual(resolved, schema) with resolver.resolving("#") as resolved: @@ -828,7 +828,7 @@ def test_it_can_construct_a_base_uri_from_a_schema(self): def test_it_can_construct_a_base_uri_from_a_schema_without_id(self): schema = 
{} resolver = RefResolver.from_schema(schema) - self.assertEqual(resolver.base_uri.url, "") + self.assertEqual(resolver.resolution_scope.url, "") with resolver.resolving("") as resolved: self.assertEqual(resolved, schema) with resolver.resolving("#") as resolved: diff --git a/jsonschema/validators.py b/jsonschema/validators.py index d0431f347..cbc32c610 100644 --- a/jsonschema/validators.py +++ b/jsonschema/validators.py @@ -12,7 +12,6 @@ from jsonschema import _utils, _validators from jsonschema.compat import ( Sequence, urljoin, urlsplit, urldefrag, unquote, urlopen, DefragResult, - str_types, int_types, iteritems, ) from jsonschema.exceptions import ErrorTree # Backwards compatibility # noqa @@ -82,7 +81,7 @@ def iter_errors(self, instance, _schema=None): scope = _schema.get(u"id") if scope: - self.resolver.push_scope(scope) + self.resolver.push_scope(urldefrag(scope)) try: ref = _schema.get(u"$ref") if ref is not None: @@ -109,7 +108,7 @@ def iter_errors(self, instance, _schema=None): yield error finally: if scope: - self.resolver.pop_scope() + self.resolver.scopes_stack.pop() def descend(self, instance, schema, path=None, schema_path=None): for error in self.iter_errors(instance, schema): @@ -241,14 +240,12 @@ def __init__( self, base_uri, referrer, store=(), cache_remote=True, handlers=(), ): base_uri = urldefrag(base_uri) - self.base_uri = base_uri - self.resolution_scope = base_uri # This attribute is not used, it is for backwards compatibility self.referrer = referrer self.cache_remote = cache_remote self.handlers = dict(handlers) - self.scopes_stack = [] + self.scopes_stack = [base_uri] self.store = _utils.URIDict( (id, validator.META_SCHEMA) for id, validator in iteritems(meta_schemas) @@ -268,19 +265,15 @@ def from_schema(cls, schema, *args, **kwargs): return cls(schema.get(u"id", u""), schema, *args, **kwargs) - def push_scope(self, scope, is_defragged=False): + def push_scope(self, scope): old_scope = self.resolution_scope - 
self.scopes_stack.append(old_scope) - if not is_defragged: - scope = urldefrag(scope) - self.resolution_scope = DefragResult( - urljoin(old_scope.url, scope.url, allow_fragments=False) - if scope.url else old_scope.url, - scope.fragment - ) + url = (urljoin(old_scope.url, scope.url, allow_fragments=False) + if scope.url else old_scope.url) + self.scopes_stack.append(scope._replace(url=url)) - def pop_scope(self): - self.resolution_scope = self.scopes_stack.pop() + @property + def resolution_scope(self): + return self.scopes_stack[-1] @contextlib.contextmanager def resolving(self, ref): @@ -291,7 +284,6 @@ def resolving(self, ref): :argument str ref: reference to resolve """ - ref = urldefrag(ref) if ref.url: @@ -310,14 +302,11 @@ def resolving(self, ref): except Exception as exc: raise RefResolutionError(exc) - uri = DefragResult(url, ref.fragment) - old_base_uri, self.base_uri = self.base_uri, uri - self.push_scope(uri, is_defragged=True) + self.push_scope(DefragResult(url, ref.fragment)) try: yield self.resolve_fragment(document, ref.fragment) finally: - self.pop_scope() - self.base_uri = old_base_uri + self.scopes_stack.pop() def resolve_fragment(self, document, fragment): """ From 84f0a38287acfa03272177f2079cc49f39a478ac Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Sun, 1 Mar 2015 17:44:03 -0500 Subject: [PATCH 4/5] Resolve in_scope --- jsonschema/validators.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/jsonschema/validators.py b/jsonschema/validators.py index cbc32c610..54e07622e 100644 --- a/jsonschema/validators.py +++ b/jsonschema/validators.py @@ -275,6 +275,14 @@ def push_scope(self, scope): def resolution_scope(self): return self.scopes_stack[-1] + @contextlib.contextmanager + def in_scope(self, scope): + self.push_scope(scope) + try: + yield + finally: + self.scopes_stack.pop() + @contextlib.contextmanager def resolving(self, ref): """ From 5a09ea3fd2cfbb23dffff9ad97232714d02c2dde Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: 
Sun, 1 Mar 2015 17:58:35 -0500
Subject: [PATCH 5/5] Add benchmark script.

---
 bench.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 bench.py

diff --git a/bench.py b/bench.py
new file mode 100644
index 000000000..e7318ed75
--- /dev/null
+++ b/bench.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""
+Benchmark the performance of jsonschema.
+
+Example benchmark:
+
+    wget http://swagger.io/v2/schema.json
+    wget http://petstore.swagger.io/v2/swagger.json
+    python bench.py -r 5 schema.json swagger.json
+
+"""
+from __future__ import print_function
+import argparse
+import cProfile
+import json
+import timeit
+
+import jsonschema
+
+
+def parse_args():
+    """Parse the command line: schema path, document path and options."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('schema', help="path to a schema used to benchmark")
+    parser.add_argument('document', help="document to validate with schema")
+    # Default to one iteration; without a default, omitting -r leaves
+    # repeat as None and range(None) fails in run_timeit().
+    parser.add_argument('-r', '--repeat', type=int, default=1,
+                        help="number of iterations")
+    parser.add_argument('--profile',
+                        help="Enable profiling, write profile to this filepath")
+    args = parser.parse_args()
+    if args.repeat < 1:
+        parser.error("--repeat must be at least 1")
+    return args
+
+
+def run(filename, schema, document):
+    """Validate document against schema once, using a fresh RefResolver."""
+    store = {}
+    if 'id' in schema:
+        # Only pre-seed the store when the schema declares an id.
+        store[schema['id']] = schema
+    resolver = jsonschema.RefResolver(
+        'file://{0}'.format(filename),
+        schema,
+        store=store)
+    jsonschema.validate(document, schema, resolver=resolver)
+
+
+def format_time(time_):
+    """Format a duration in seconds as a millisecond string."""
+    return "%.3fms" % (time_ * 1000)
+
+
+def run_timeit(schema_filename, document_filename, repeat, profile):
+    """
+    Time `repeat` validations of the document and print the results.
+
+    If `profile` is a filepath, the runs are also profiled with cProfile
+    and the stats are written there.
+
+    """
+    with open(schema_filename) as schema_file:
+        schema = json.load(schema_file)
+
+    with open(document_filename) as fh:
+        document = json.load(fh)
+
+    if profile:
+        profiler = cProfile.Profile()
+        profiler.enable()
+
+    times = []
+    for _ in range(repeat):
+        # default_timer is the clock timeit itself uses for benchmarks.
+        start_time = timeit.default_timer()
+        run(schema_filename, schema, document)
+        times.append(timeit.default_timer() - start_time)
+
+    if profile:
+        profiler.disable()
+        profiler.dump_stats(profile)
+
+    print(", ".join(map(format_time, sorted(times))))
+    print("Mean: {0}".format(format_time(sum(times) / repeat)))
+
+
+def main():
+    """Script entry point."""
+    args = parse_args()
+    run_timeit(args.schema, args.document, args.repeat, args.profile)
+
+
+if __name__ == "__main__":
+    main()