diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index acd64a7a3e..1c18049d28 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,7 +27,6 @@ jobs: # 6th block: These have some XFAILs - | --ignore=pymc3/tests/test_distribution_defaults.py - --ignore=pymc3/tests/test_distributions.py --ignore=pymc3/tests/test_distributions_random.py --ignore=pymc3/tests/test_distributions_timeseries.py --ignore=pymc3/tests/test_missing.py @@ -35,33 +34,25 @@ jobs: --ignore=pymc3/tests/test_model_graph.py --ignore=pymc3/tests/test_modelcontext.py --ignore=pymc3/tests/test_models_linear.py - --ignore=pymc3/tests/test_ndarray_backend.py --ignore=pymc3/tests/test_parallel_sampling.py - --ignore=pymc3/tests/test_posterior_predictive.py - --ignore=pymc3/tests/test_posteriors.py --ignore=pymc3/tests/test_profile.py --ignore=pymc3/tests/test_random.py - --ignore=pymc3/tests/test_sampling.py --ignore=pymc3/tests/test_shared.py --ignore=pymc3/tests/test_smc.py --ignore=pymc3/tests/test_starting.py --ignore=pymc3/tests/test_step.py --ignore=pymc3/tests/test_tracetab.py - --ignore=pymc3/tests/test_transforms.py --ignore=pymc3/tests/test_tuning.py --ignore=pymc3/tests/test_types.py --ignore=pymc3/tests/test_util.py --ignore=pymc3/tests/test_variational_inference.py - --ignore=pymc3/tests/test_sampling_jax.py - --ignore=pymc3/tests/test_dist_math.py --ignore=pymc3/tests/test_minibatches.py --ignore=pymc3/tests/test_pickling.py --ignore=pymc3/tests/test_plots.py --ignore=pymc3/tests/test_special_functions.py --ignore=pymc3/tests/test_updates.py - --ignore=pymc3/tests/test_dist_math.py --ignore=pymc3/tests/test_examples.py --ignore=pymc3/tests/test_glm.py diff --git a/pymc3/aesaraf.py b/pymc3/aesaraf.py index 0c425a90a3..61ec4a9da6 100644 --- a/pymc3/aesaraf.py +++ b/pymc3/aesaraf.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Dict, List import aesara import numpy as np @@ -222,7 +223,7 @@ def __hash__(self): return hash(type(self)) -def make_shared_replacements(vars, model): +def make_shared_replacements(point, vars, model): """ Makes shared replacements for all *other* variables than the ones passed. @@ -231,6 +232,7 @@ def make_shared_replacements(vars, model): Parameters ---------- + point: dictionary mapping variable names to sample values vars: list of variables not to make shared model: model @@ -239,15 +241,22 @@ def make_shared_replacements(vars, model): Dict of variable -> new shared variable """ othervars = set(model.vars) - set(vars) - return {var: aesara.shared(var.tag.test_value, var.name + "_shared") for var in othervars} + return {var: aesara.shared(point[var.name], var.name + "_shared") for var in othervars} -def join_nonshared_inputs(xs, vars, shared, make_shared=False): +def join_nonshared_inputs( + point: Dict[str, np.ndarray], + xs: List[TensorVariable], + vars: List[TensorVariable], + shared, + make_shared: bool = False, +): """ Takes a list of aesara Variables and joins their non shared inputs into a single input. 
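For orientation, a minimal sketch of the point-based API that this aesaraf.py hunk introduces: both helpers now read initial values from an explicit dict instead of from test values. The model and variable names below are illustrative only, and `m.test_point` is assumed to supply the point (per the reworked property later in this patch):

import pymc3 as pm

with pm.Model() as m:
    a = pm.Normal("a", 0.0, 1.0)
    b = pm.Normal("b", a, 1.0)

point = m.test_point                           # initial values keyed by value-variable name
free = [v for v in m.vars if v.name == "b"]    # value variables to keep as explicit inputs
shared = pm.make_shared_replacements(point, free, m)   # other vars become shareds seeded from `point`
(logp,), inarray = pm.join_nonshared_inputs(point, [m.logpt], free, shared)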
Parameters ---------- + point: a sample point xs: list of aesara tensors vars: list of variables to join @@ -266,17 +275,20 @@ def join_nonshared_inputs(xs, vars, shared, make_shared=False): tensor_type = joined.type inarray = tensor_type("inarray") else: - inarray = aesara.shared(joined.tag.test_value, "inarray") + if point is None: + raise ValueError("A point is required when `make_shared` is True") + joined_values = np.concatenate([point[var.name].ravel() for var in vars]) + inarray = aesara.shared(joined_values, "inarray") - inarray.tag.test_value = joined.tag.test_value + if aesara.config.compute_test_value != "off": + inarray.tag.test_value = joined.tag.test_value replace = {} last_idx = 0 for var in vars: - arr_len = aet.prod(var.shape) - replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], var.shape).astype( - var.dtype - ) + shape = point[var.name].shape + arr_len = np.prod(shape, dtype=int) + replace[var] = reshape_t(inarray[last_idx : last_idx + arr_len], shape).astype(var.dtype) last_idx += arr_len replace.update(shared) diff --git a/pymc3/distributions/__init__.py b/pymc3/distributions/__init__.py index dfc96175ac..aa58ea82b5 100644 --- a/pymc3/distributions/__init__.py +++ b/pymc3/distributions/__init__.py @@ -31,6 +31,8 @@ int, np.ndarray, Tuple[Union[int, Variable], ...], List[Union[int, Variable]], Variable ] +no_transform_object = object() + @singledispatch def logp_transform(op: Op): @@ -340,17 +342,17 @@ def logpt( else: logp_var = _logcdf(rv_node.op, rv_var, *dist_params, **kwargs) - transform = getattr(rv_value_var.tag, "transform", None) if rv_value_var else None - - if transform and transformed and not cdf: + if transformed and not cdf: (logp_var,), _ = apply_transforms((logp_var,)) - if jacobian: - transformed_jacobian = transform.jacobian_det(rv_var, rv_value) - if transformed_jacobian: - if logp_var.ndim > transformed_jacobian.ndim: - logp_var = logp_var.sum(axis=-1) - logp_var += transformed_jacobian + transform = getattr(rv_value_var.tag, "transform", None) if rv_value_var else None + + if transform and transformed and not cdf and jacobian: + transformed_jacobian = transform.jacobian_det(rv_var, rv_value) + if transformed_jacobian: + if logp_var.ndim > transformed_jacobian.ndim: + logp_var = logp_var.sum(axis=-1) + logp_var += transformed_jacobian # Replace random variables with their value variables (logp_var,), replaced = rvs_to_value_vars((logp_var,), {rv_var: rv_value}) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 5864eac3ab..43f0446525 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -727,35 +727,33 @@ def NegBinom(a, m, x): @classmethod def dist(cls, mu=None, alpha=None, p=None, n=None, *args, **kwargs): - mu, alpha = cls.get_mu_alpha(mu, alpha, p, n) - mu = aet.as_tensor_variable(floatX(mu)) - alpha = aet.as_tensor_variable(floatX(alpha)) - # mode = intX(aet.floor(mu)) - return super().dist([mu, alpha], *args, **kwargs) + n, p = cls.get_mu_alpha(mu, alpha, p, n) + n = aet.as_tensor_variable(floatX(n)) + p = aet.as_tensor_variable(floatX(p)) + return super().dist([n, p], *args, **kwargs) @classmethod def get_mu_alpha(cls, mu=None, alpha=None, p=None, n=None): - if alpha is None: - if n is not None: - n = aet.as_tensor_variable(intX(n)) - alpha = n + if n is None: + if alpha is not None: + n = aet.as_tensor_variable(floatX(alpha)) else: raise ValueError("Incompatible parametrization. 
Must specify either alpha or n.") - elif n is not None: + elif alpha is not None: raise ValueError("Incompatible parametrization. Can't specify both alpha and n.") - if mu is None: - if p is not None: - p = aet.as_tensor_variable(floatX(p)) - mu = alpha * (1 - p) / p + if p is None: + if mu is not None: + mu = aet.as_tensor_variable(floatX(mu)) + p = n / (mu + n) else: raise ValueError("Incompatible parametrization. Must specify either mu or p.") - elif p is not None: + elif mu is not None: raise ValueError("Incompatible parametrization. Can't specify both mu and p.") - return mu, alpha + return n, p - def logp(value, mu, alpha): + def logp(value, n, p): r""" Calculate log-probability of NegativeBinomial distribution at specified value. @@ -769,6 +767,8 @@ def logp(value, mu, alpha): ------- TensorVariable """ + alpha = n + mu = alpha * (1 - p) / p negbinom = bound( binomln(value + alpha - 1, value) + logpow(mu / (mu + alpha), value) @@ -779,9 +779,9 @@ def logp(value, mu, alpha): ) # Return Poisson when alpha gets very large. - return aet.switch(aet.gt(alpha, 1e10), Poisson.dist(mu).logp(value), negbinom) + return aet.switch(aet.gt(alpha, 1e10), Poisson.logp(value, mu), negbinom) - def logcdf(value, mu, alpha): + def logcdf(value, n, p): """ Compute the log of the cumulative distribution function for NegativeBinomial distribution at the specified value. @@ -801,20 +801,14 @@ def logcdf(value, mu, alpha): f"NegativeBinomial.logcdf expects a scalar value but received a {np.ndim(value)}-dimensional object." ) - # TODO: avoid `p` recomputation if distribution was defined in terms of `p` - p = alpha / (mu + alpha) - return bound( - aet.log(incomplete_beta(alpha, aet.floor(value) + 1, p)), + aet.log(incomplete_beta(n, aet.floor(value) + 1, p)), 0 <= value, - 0 < alpha, + 0 < n, 0 <= p, p <= 1, ) - def _distr_parameters_for_repr(self): - return self._param_type - class Geometric(Discrete): R""" diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 2b7d48a6c1..099f6f3cbc 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -26,7 +26,7 @@ from aesara.tensor.random.op import RandomVariable -from pymc3.distributions import _logcdf, _logp +from pymc3.distributions import _logcdf, _logp, no_transform_object if TYPE_CHECKING: from typing import Optional, Callable @@ -161,7 +161,7 @@ def __new__(cls, name, *args, **kwargs): if "shape" in kwargs: raise DeprecationWarning("The `shape` keyword is deprecated; use `size`.") - transform = kwargs.pop("transform", None) + transform = kwargs.pop("transform", no_transform_object) rv_out = cls.dist(*args, rng=rng, **kwargs) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index fc2833f9fd..f44d65b7a0 100755 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -221,7 +221,7 @@ class MvNormal(Continuous): @classmethod def dist(cls, mu, cov=None, tau=None, chol=None, lower=True, **kwargs): mu = aet.as_tensor_variable(mu) - cov = quaddist_matrix(cov, tau, chol, lower) + cov = quaddist_matrix(cov, chol, tau, lower) return super().dist([mu, cov], **kwargs) def logp(value, mu, cov): @@ -386,8 +386,12 @@ class Dirichlet(Continuous): rv_op = dirichlet + def __new__(cls, name, *args, **kwargs): + kwargs.setdefault("transform", transforms.stick_breaking) + return super().__new__(cls, name, *args, **kwargs) + @classmethod - def dist(cls, a, transform=transforms.stick_breaking, **kwargs): + def dist(cls, a, **kwargs): a = 
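The NegativeBinomial hunk above rewires `get_mu_alpha` so that a (mu, alpha) parametrization is converted to the (n, p) form expected by the random op, and `logp` converts back. A small numeric sketch of the two directions (plain NumPy, values illustrative):

import numpy as np

mu, alpha = 4.0, 2.0
n = alpha                   # the dispersion plays the role of the number of successes
p = n / (mu + n)            # 2 / 6 = 1/3

# inverse mapping, as used inside logp():
mu_back = n * (1 - p) / p   # 2 * (2/3) / (1/3) = 4
assert np.isclose(mu_back, mu)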
aet.as_tensor_variable(a) # mean = a / aet.sum(a) @@ -483,7 +487,7 @@ class Multinomial(Discrete): @classmethod def dist(cls, n, p, *args, **kwargs): - # p = p / aet.sum(p, axis=-1, keepdims=True) + p = p / aet.sum(p, axis=-1, keepdims=True) n = aet.as_tensor_variable(n) p = aet.as_tensor_variable(p) diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 7dbbc7afd0..b5ab265cb4 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -218,7 +218,7 @@ def jacobian_det(self, rv_var, rv_value): s = aet.nnet.softplus(-rv_value) return aet.log(b - a) - 2 * s - rv_value else: - return aet.ones_like(rv_value) + return rv_value interval = Interval @@ -286,7 +286,7 @@ class StickBreaking(Transform): name = "stickbreaking" def forward(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return rv_value @@ -299,7 +299,7 @@ def forward(self, rv_var, rv_value): return floatX(y.T) def backward(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return rv_value @@ -312,7 +312,7 @@ def backward(self, rv_var, rv_value): return floatX(x.T) def jacobian_det(self, rv_var, rv_value): - if rv_var.ndim == 1 or rv_var.broadcastable[-1]: + if rv_var.broadcastable[-1]: # If this variable is just a bunch of scalars/degenerate # Dirichlets, we can't transform it return aet.ones_like(rv_value) diff --git a/pymc3/model.py b/pymc3/model.py index d6f3e3f82b..d4367e827c 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -38,7 +38,13 @@ from pymc3.aesaraf import generator, gradient, hessian, inputvars from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.data import GenTensorVariable, Minibatch -from pymc3.distributions import change_rv_size, logp_transform, logpt, logpt_sum +from pymc3.distributions import ( + change_rv_size, + logp_transform, + logpt, + logpt_sum, + no_transform_object, +) from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list from pymc3.memoize import WithMemoization @@ -240,8 +246,6 @@ def build_named_node_tree(graphs): T = TypeVar("T", bound="ContextMeta") -no_transform_object = object() - class ContextMeta(type): """Functionality for objects that put themselves in a context using @@ -1003,19 +1007,37 @@ def independent_vars(self): @property def test_point(self): - """Test point used to check that the model doesn't generate errors""" + """Test point used to check that the model doesn't generate errors + + TODO: This should be replaced with proper initial value support. 
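Taken together, the `no_transform_object` sentinel and the new `Dirichlet.__new__` mean the stick-breaking transform is applied unless the user explicitly opts out. A hypothetical usage sketch of the behaviour these hunks imply:

import numpy as np
import pymc3 as pm

with pm.Model():
    d1 = pm.Dirichlet("d1", a=np.ones(3))                   # picks up stick_breaking by default
    d2 = pm.Dirichlet("d2", a=np.ones(3), transform=None)   # an explicit None still disables it

d1.tag.value_var.tag.transform                      # a StickBreaking instance
getattr(d2.tag.value_var.tag, "transform", None)    # None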
+ """ points = [] - for var in self.free_RVs: - var_value = getattr(var.tag, "test_value", None) + for rv_var in self.free_RVs: + value_var = rv_var.tag.value_var + var_value = getattr(value_var.tag, "test_value", None) if var_value is None: - try: - var_value = var.eval() - var.tag.test_value = var_value - except Exception: - raise Exception(f"Couldn't generate an initial value for {var}") - points.append((getattr(var.tag, "value_var", var), var_value)) + rv_var_value = getattr(rv_var.tag, "test_value", None) + + if rv_var_value is None: + try: + rv_var_value = rv_var.eval() + except Exception: + raise Exception(f"Couldn't generate an initial value for {rv_var}") + + transform = getattr(value_var.tag, "transform", None) + + if transform: + try: + rv_var_value = transform.forward(rv_var, rv_var_value).eval() + except Exception: + raise Exception(f"Couldn't generate an initial value for {rv_var}") + + var_value = rv_var_value + value_var.tag.test_value = var_value + + points.append((value_var, var_value)) return Point(points, model=self) @@ -1132,6 +1154,7 @@ def register_rv( value_var.name = f"{value_var.name}_{transform.name}__" if aesara.config.compute_test_value != "off": value_var.tag.test_value = transform.forward(rv_var, value_var).tag.test_value + self.named_vars[value_var.name] = value_var self.add_random_variable(rv_var, dims) @@ -1653,29 +1676,6 @@ def _walk_up_rv(rv, formatting="plain"): return all_rvs -class DeterministicWrapper(TensorVariable): - def _str_repr(self, formatting="plain"): - if "latex" in formatting: - if formatting == "latex_with_params": - return r"$\text{{{name}}} \sim \text{{Deterministic}}({args})$".format( - name=self.name, args=r",~".join(_walk_up_rv(self, formatting=formatting)) - ) - return fr"$\text{{{self.name}}} \sim \text{{Deterministic}}$" - else: - if formatting == "plain_with_params": - args = ", ".join(_walk_up_rv(self, formatting=formatting)) - return f"{self.name} ~ Deterministic({args})" - return f"{self.name} ~ Deterministic" - - def _repr_latex_(self, *, formatting="latex_with_params", **kwargs): - return self._str_repr(formatting=formatting) - - __latex__ = _repr_latex_ - - def __str__(self): - return self._str_repr(formatting="plain") - - def Deterministic(name, var, model=None, dims=None): """Create a named deterministic variable @@ -1692,7 +1692,6 @@ def Deterministic(name, var, model=None, dims=None): var = var.copy(model.name_for(name)) model.deterministics.append(var) model.add_random_variable(var, dims) - var.__class__ = DeterministicWrapper # adds str and latex functionality return var diff --git a/pymc3/sampling.py b/pymc3/sampling.py index d35d3d248d..b6cb03bd44 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -197,7 +197,7 @@ def assign_step_methods(model, step=None, methods=STEP_METHODS, step_kwargs=None # Use competence classmethods to select step methods for remaining # variables selected_steps = defaultdict(list) - for var in model.free_RVs: + for var in model.vars: if var not in assigned_vars: # determine if a gradient can be computed has_gradient = var.dtype not in discrete_types @@ -448,7 +448,8 @@ def sample( random_seed = [random_seed] if random_seed is None or isinstance(random_seed, int): if random_seed is not None: - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) random_seed = [np.random.randint(2 ** 30) for _ in range(chains)] if not isinstance(random_seed, abc.Iterable): raise TypeError("Invalid value for `random_seed`. 
Must be tuple, list or int") @@ -658,9 +659,9 @@ def _check_start_shape(model, start): if not isinstance(start, dict): raise TypeError("start argument must be a dict or an array-like of dicts") e = "" - for var in model.vars: + for var in model.basic_RVs: + var_shape = model.fastfn(var.shape)(start) if var.name in start.keys(): - var_shape = var.shape.tag.test_value start_var_shape = np.shape(start[var.name]) if start_var_shape: if not np.array_equal(var_shape, start_var_shape): @@ -971,7 +972,8 @@ def _iter_sample( model = modelcontext(model) draws = int(draws) if random_seed is not None: - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) if draws < 1: raise ValueError("Argument `draws` must be greater than 0.") @@ -1239,7 +1241,8 @@ def _prepare_iter_population( model = modelcontext(model) draws = int(draws) if random_seed is not None: - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) if draws < 1: raise ValueError("Argument `draws` should be above 0.") @@ -1710,7 +1713,8 @@ def sample_posterior_predictive( vars_ = model.observed_RVs if random_seed is not None: - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) indices = np.arange(samples) @@ -1724,12 +1728,18 @@ def sample_posterior_predictive( if not hasattr(_trace, "varnames"): inputs_and_names = [ - (rv, rv.name) for rv in rv_ancestors(vars_to_sample, walk_past_rvs=True) + (rv, rv.name) + for rv in rv_ancestors(vars_to_sample, walk_past_rvs=True) + if rv not in vars_to_sample and rv in model.named_vars.values() ] - inputs, input_names = zip(*inputs_and_names) + if inputs_and_names: + inputs, input_names = zip(*inputs_and_names) + else: + inputs, input_names = [], [] else: - input_names = _trace.varnames - inputs = [model[n] for n in _trace.varnames] + output_names = [v.name for v in vars_to_sample if v.name is not None] + input_names = [n for n in _trace.varnames if n not in output_names] + inputs = [model[n] for n in input_names] if size is not None: vars_to_sample = [change_rv_size(v, size, expand=True) for v in vars_to_sample] @@ -1813,7 +1823,7 @@ def sample_posterior_predictive_w( Dictionary with the variables as keys. The values corresponding to the posterior predictive samples from the weighted models. """ - np.random.seed(random_seed) + # np.random.seed(random_seed) if isinstance(traces[0], InferenceData): n_samples = [ @@ -1830,6 +1840,8 @@ def sample_posterior_predictive_w( models = [modelcontext(models)] * len(traces) for model in models: + if random_seed: + model.default_rng.get_value(borrow=True).seed(random_seed) if model.potentials: warnings.warn( "The effect of Potentials on other parameters is ignored during posterior predictive sampling. 
" @@ -1969,7 +1981,8 @@ def sample_prior_predictive( vars_ = set(var_names) if random_seed is not None: - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) names = get_default_varnames(vars_, include_transformed=False) @@ -2116,7 +2129,8 @@ def init_nuts( if random_seed is not None: random_seed = int(np.atleast_1d(random_seed)[0]) - np.random.seed(random_seed) + # np.random.seed(random_seed) + model.default_rng.get_value(borrow=True).seed(random_seed) cb = [ pm.callbacks.CheckParametersConvergence(tolerance=1e-2, diff="absolute"), diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py index 04d8c017c8..80321d31f6 100644 --- a/pymc3/smc/smc.py +++ b/pymc3/smc/smc.py @@ -108,12 +108,15 @@ def initialize_population(self): def setup_kernel(self): """Set up the likelihood logp function based on the chosen kernel.""" - shared = make_shared_replacements(self.variables, self.model) + initial_values = self.model.test_point + shared = make_shared_replacements(initial_values, self.variables, self.model) if self.kernel == "abc": factors = [var.logpt for var in self.model.free_RVs] factors += [aet.sum(factor) for factor in self.model.potentials] - self.prior_logp_func = logp_forw([aet.sum(factors)], self.variables, shared) + self.prior_logp_func = logp_forw( + initial_values, [aet.sum(factors)], self.variables, shared + ) simulator = self.model.observed_RVs[0] distance = simulator.distribution.distance sum_stat = simulator.distribution.sum_stat @@ -132,8 +135,12 @@ def setup_kernel(self): self.save_log_pseudolikelihood, ) elif self.kernel == "metropolis": - self.prior_logp_func = logp_forw([self.model.varlogpt], self.variables, shared) - self.likelihood_logp_func = logp_forw([self.model.datalogpt], self.variables, shared) + self.prior_logp_func = logp_forw( + initial_values, [self.model.varlogpt], self.variables, shared + ) + self.likelihood_logp_func = logp_forw( + initial_values, [self.model.datalogpt], self.variables, shared + ) def initialize_logp(self): """Initialize the prior and likelihood log probabilities.""" @@ -271,7 +278,7 @@ def posterior_to_trace(self): return strace -def logp_forw(out_vars, vars, shared): +def logp_forw(point, out_vars, vars, shared): """Compile Aesara function of the model and the input and output variables. 
Parameters @@ -283,7 +290,7 @@ def logp_forw(out_vars, vars, shared): shared: List containing :class:`aesara.tensor.Tensor` for depended shared data """ - out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) + out_list, inarray0 = join_nonshared_inputs(point, out_vars, vars, shared) f = aesara_function([inarray0], out_list[0]) f.trust_input = True return f diff --git a/pymc3/step_methods/arraystep.py b/pymc3/step_methods/arraystep.py index b0b30d0262..e454d64bc6 100644 --- a/pymc3/step_methods/arraystep.py +++ b/pymc3/step_methods/arraystep.py @@ -145,14 +145,26 @@ def step(self, point: Dict[str, np.ndarray]): if self.allvars: inputs.append(point) + apoint = DictToArrayBijection.map(point) + step_res = self.astep(apoint, *inputs) + if self.generates_stats: - apoint, stats = self.astep(DictToArrayBijection.map(point), *inputs) - return DictToArrayBijection.rmap(apoint), stats + apoint_new, stats = step_res else: - apoint = self.astep(DictToArrayBijection.map(point), *inputs) - return DictToArrayBijection.rmap(apoint) + apoint_new = step_res + + if not isinstance(apoint_new, RaveledVars): + # We assume that the mapping has stayed the same + apoint_new = RaveledVars(apoint_new, apoint.point_map_info) - def astep(self, apoint, point): + point_new = DictToArrayBijection.rmap(apoint_new) + + if self.generates_stats: + return point_new, stats + + return point_new + + def astep(self, apoint: RaveledVars, point: Dict[str, np.ndarray]): raise NotImplementedError() @@ -177,21 +189,43 @@ def __init__(self, vars, shared, blocked=True): self.blocked = blocked def step(self, point): - for var, share in self.shared.items(): - share.set_value(point[var]) + + # Remove shared variables from the sample point + point_no_shared = point.copy() + for name, shared_var in self.shared.items(): + shared_var.set_value(point[name]) + if name in point_no_shared: + del point_no_shared[name] + + q = DictToArrayBijection.map(point_no_shared) + + step_res = self.astep(q) if self.generates_stats: - apoint, stats = self.astep(DictToArrayBijection.map(point)) - return DictToArrayBijection.rmap(apoint), stats + apoint, stats = step_res else: - array = DictToArrayBijection.map(point) - apoint = self.astep(array) - if not isinstance(apoint, RaveledVars): - # We assume that the mapping has stayed the same - apoint = RaveledVars(apoint, array.point_map_info) - return DictToArrayBijection.rmap(apoint) + apoint = step_res - def astep(self, apoint): + if not isinstance(apoint, RaveledVars): + # We assume that the mapping has stayed the same + apoint = RaveledVars(apoint, q.point_map_info) + + # We need to re-add the shared variables to the new sample point + a_point = DictToArrayBijection.rmap(apoint) + new_point = {} + for name in point.keys(): + shared_value = self.shared.get(name, None) + if shared_value is not None: + new_point[name] = shared_value.get_value() + else: + new_point[name] = a_point[name] + + if self.generates_stats: + return new_point, stats + + return new_point + + def astep(self, apoint: RaveledVars): raise NotImplementedError() diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index ac74d89e65..8c523a31e3 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -23,7 +23,7 @@ import pymc3 as pm from pymc3.aesaraf import floatX -from pymc3.blocking import DictToArrayBijection +from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.step_methods.arraystep import ( ArrayStep, ArrayStepShared, @@ -149,14 +149,14 @@ def 
__init__( """ model = pm.modelcontext(model) + initial_values = model.test_point if vars is None: vars = model.vars vars = pm.inputvars(vars) if S is None: - # XXX: This needs to be refactored - S = None # np.ones(sum(v.dsize for v in vars)) + S = np.ones(sum(initial_values[v.name].size for v in vars)) if proposal_dist is not None: self.proposal_dist = proposal_dist(S) @@ -175,8 +175,7 @@ def __init__( # Determine type of variables self.discrete = np.concatenate( - # XXX: This needs to be refactored - None # [[v.dtype in pm.discrete_types] * (v.dsize or 1) for v in vars] + [[v.dtype in pm.discrete_types] * (initial_values[v.name].size or 1) for v in vars] ) self.any_discrete = self.discrete.any() self.all_discrete = self.discrete.all() @@ -188,8 +187,8 @@ def __init__( self.mode = mode - shared = pm.make_shared_replacements(vars, model) - self.delta_logp = delta_logp(model.logpt, vars, shared) + shared = pm.make_shared_replacements(initial_values, vars, model) + self.delta_logp = delta_logp(initial_values, model.logpt, vars, shared) super().__init__(vars, shared) def reset_tuning(self): @@ -198,7 +197,7 @@ def reset_tuning(self): setattr(self, attr, initial_value) return - def astep(self, q0): + def astep(self, q0: RaveledVars): if not self.steps_until_tune and self.tune: # Tune scaling parameter self.scaling = tune(self.scaling, self.accepted / float(self.tune_interval)) @@ -212,15 +211,18 @@ def astep(self, q0): if self.all_discrete: delta = np.round(delta, 0).astype("int64") q0 = q0.astype("int64") - q = (q0 + delta).astype("int64") + q = (q0.data + delta).astype("int64") else: delta[self.discrete] = np.round(delta[self.discrete], 0) - q = q0 + delta + q = q0.data + delta else: - q = floatX(q0 + delta) + q = floatX(q0.data + delta) + + accept = self.delta_logp(q, q0.data) + q_new, accepted = metrop_select(accept, q, q0.data) + + q_new = RaveledVars(q_new, q0.point_map_info) - accept = self.delta_logp(q, q0) - q_new, accepted = metrop_select(accept, q, q0) self.accepted += accepted self.steps_until_tune -= 1 @@ -630,6 +632,7 @@ def __init__( ): model = pm.modelcontext(model) + initial_values = model.test_value if vars is None: vars = model.cont_vars @@ -657,8 +660,8 @@ def __init__( self.mode = mode - shared = pm.make_shared_replacements(vars, model) - self.delta_logp = delta_logp(model.logpt, vars, shared) + shared = pm.make_shared_replacements(initial_values, vars, model) + self.delta_logp = delta_logp(initial_values, model.logpt, vars, shared) super().__init__(vars, shared) def astep(self, q0): @@ -771,6 +774,7 @@ def __init__( **kwargs ): model = pm.modelcontext(model) + initial_values = model.test_value if vars is None: vars = model.cont_vars @@ -810,8 +814,8 @@ def __init__( self.mode = mode - shared = pm.make_shared_replacements(vars, model) - self.delta_logp = delta_logp(model.logpt, vars, shared) + shared = pm.make_shared_replacements(initial_values, vars, model) + self.delta_logp = delta_logp(initial_values, model.logpt, vars, shared) super().__init__(vars, shared) def reset_tuning(self): @@ -898,8 +902,8 @@ def softmax(x): return e_x / np.sum(e_x, axis=0) -def delta_logp(logp, vars, shared): - [logp0], inarray0 = pm.join_nonshared_inputs([logp], vars, shared) +def delta_logp(point, logp, vars, shared): + [logp0], inarray0 = pm.join_nonshared_inputs(point, [logp], vars, shared) tensor_type = inarray0.type inarray1 = tensor_type("inarray1") diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index 926fb16314..23c4c94a8c 100644 --- a/pymc3/step_methods/mlda.py 
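On the step-method side the same pattern applies: `delta_logp` is compiled from the initial point, and `astep` now works on `RaveledVars`, whose `.data` attribute holds the flat array. A condensed, illustrative sketch of the Metropolis proposal/accept step (`proposal` and `scaling` are placeholders, not names from the patch):

shared = pm.make_shared_replacements(initial_values, vars, model)
delta_logp_fn = delta_logp(initial_values, model.logpt, vars, shared)

def astep(q0):                        # q0: RaveledVars
    q = floatX(q0.data + proposal() * scaling)
    accept = delta_logp_fn(q, q0.data)
    q_new, accepted = metrop_select(accept, q, q0.data)
    return RaveledVars(q_new, q0.point_map_info)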
+++ b/pymc3/step_methods/mlda.py @@ -57,6 +57,8 @@ def __init__(self, *args, **kwargs): Initialise MetropolisMLDA. This is a mix of the parent's class' initialisation and some extra code specific for MLDA. """ + model = pm.modelcontext(kwargs.get("model", None)) + initial_values = model.test_value # flag to that variance reduction is activated - forces MetropolisMLDA # to store quantities of interest in a register if True @@ -69,19 +71,18 @@ def __init__(self, *args, **kwargs): self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above # extract some necessary variables - model = pm.modelcontext(kwargs.get("model", None)) vars = kwargs.get("vars", None) if vars is None: vars = model.vars vars = pm.inputvars(vars) - shared = pm.make_shared_replacements(vars, model) + shared = pm.make_shared_replacements(initial_values, vars, model) # call parent class __init__ super().__init__(*args, **kwargs) # modify the delta function and point to model if VR is used if self.mlda_variance_reduction: - self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) self.model = model def reset_tuning(self): @@ -124,6 +125,9 @@ def __init__(self, *args, **kwargs): # flag used for signaling the end of tuning self.tuning_end_trigger = False + model = pm.modelcontext(kwargs.get("model", None)) + initial_values = model.test_value + # flag to that variance reduction is activated - forces DEMetropolisZMLDA # to store quantities of interest in a register if True self.mlda_variance_reduction = kwargs.pop("mlda_variance_reduction", False) @@ -135,19 +139,18 @@ def __init__(self, *args, **kwargs): self.Q_reg = [np.nan] * self.mlda_subsampling_rate_above # extract some necessary variables - model = pm.modelcontext(kwargs.get("model", None)) vars = kwargs.get("vars", None) if vars is None: vars = model.vars vars = pm.inputvars(vars) - shared = pm.make_shared_replacements(vars, model) + shared = pm.make_shared_replacements(initial_values, vars, model) # call parent class __init__ super().__init__(*args, **kwargs) # modify the delta function and point to model if VR is used if self.mlda_variance_reduction: - self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) self.model = model def reset_tuning(self): @@ -400,6 +403,7 @@ def __init__( # assign internal state model = pm.modelcontext(model) + initial_values = model.test_value self.model = model self.coarse_models = coarse_models self.model_below = self.coarse_models[-1] @@ -553,16 +557,18 @@ def __init__( # Construct aesara function for current-level model likelihood # (for use in acceptance) - shared = pm.make_shared_replacements(vars, model) - self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) + shared = pm.make_shared_replacements(initial_values, vars, model) + self.delta_logp = delta_logp_inverse(initial_values, model.logpt, vars, shared) # Construct aesara function for below-level model likelihood # (for use in acceptance) model_below = pm.modelcontext(self.model_below) vars_below = [var for var in model_below.vars if var.name in self.var_names] vars_below = pm.inputvars(vars_below) - shared_below = pm.make_shared_replacements(vars_below, model_below) - self.delta_logp_below = delta_logp(model_below.logpt, vars_below, shared_below) + shared_below = pm.make_shared_replacements(initial_values, vars_below, model_below) + self.delta_logp_below = delta_logp( + initial_values, 
model_below.logpt, vars_below, shared_below + ) super().__init__(vars, shared) @@ -958,8 +964,8 @@ def update(self, x): self.t += 1 -def delta_logp_inverse(logp, vars, shared): - [logp0], inarray0 = pm.join_nonshared_inputs([logp], vars, shared) +def delta_logp_inverse(point, logp, vars, shared): + [logp0], inarray0 = pm.join_nonshared_inputs(point, [logp], vars, shared) tensor_type = inarray0.type inarray1 = tensor_type("inarray1") diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index cb73f67902..0aa008389d 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -59,6 +59,7 @@ class PGBART(ArrayStepShared): def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): _log.warning("The BART model is experimental. Use with caution.") model = modelcontext(model) + initial_values = model.test_value vars = inputvars(vars) self.bart = vars[0].distribution @@ -80,8 +81,8 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m p = ParticleTree(self.bart.trees[i], self.bart.prior_prob_leaf_node) self.old_trees_particles_list.append(p) - shared = make_shared_replacements(vars, model) - self.likelihood_logp = logp([model.datalogpt], vars, shared) + shared = make_shared_replacements(initial_values, vars, model) + self.likelihood_logp = logp(initial_values, [model.datalogpt], vars, shared) super().__init__(vars, shared) def astep(self, _): @@ -274,7 +275,7 @@ def set_particle_to_step(self, t): self.expansion_nodes = self.expansion_nodes_history[t] -def logp(out_vars, vars, shared): +def logp(point, out_vars, vars, shared): """Compile Aesara function of the model and the input and output variables. Parameters @@ -286,7 +287,7 @@ def logp(out_vars, vars, shared): shared: List containing :class:`aesara.tensor.Tensor` for depended shared data """ - out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) + out_list, inarray0 = join_nonshared_inputs(point, out_vars, vars, shared) f = aesara_function([inarray0], out_list[0]) f.trust_input = True return f diff --git a/pymc3/step_methods/slicer.py b/pymc3/step_methods/slicer.py index b0320a9eff..2eb401858e 100644 --- a/pymc3/step_methods/slicer.py +++ b/pymc3/step_methods/slicer.py @@ -18,6 +18,7 @@ import numpy.random as nr from pymc3.aesaraf import inputvars +from pymc3.blocking import RaveledVars from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStep, Competence from pymc3.vartypes import continuous_types @@ -61,24 +62,28 @@ def __init__(self, vars=None, w=1.0, tune=True, model=None, iter_limit=np.inf, * super().__init__(vars, [self.model.fastlogp], **kwargs) def astep(self, q0, logp): - self.w = np.resize(self.w, len(q0)) # this is a repmat - q = np.copy(q0) # TODO: find out if we need this - ql = np.copy(q0) # l for left boundary - qr = np.copy(q0) # r for right boudary - for i in range(len(q0)): + q0_val = q0.data + self.w = np.resize(self.w, len(q0_val)) # this is a repmat + q = np.copy(q0_val) # TODO: find out if we need this + ql = np.copy(q0_val) # l for left boundary + qr = np.copy(q0_val) # r for right boudary + for i in range(len(q0_val)): # uniformly sample from 0 to p(q), but in log space - y = logp(q) - nr.standard_exponential() + q_ra = RaveledVars(q, q0.point_map_info) + y = logp(q_ra) - nr.standard_exponential() ql[i] = q[i] - nr.uniform(0, self.w[i]) qr[i] = q[i] + self.w[i] # Stepping out procedure cnt = 0 - while y <= logp(ql): # changed lt to leq for locally uniform posteriors + while y 
<= logp( + RaveledVars(ql, q0.point_map_info) + ): # changed lt to leq for locally uniform posteriors ql[i] -= self.w[i] cnt += 1 if cnt > self.iter_limit: raise RuntimeError(LOOP_ERR_MSG % self.iter_limit) cnt = 0 - while y <= logp(qr): + while y <= logp(RaveledVars(qr, q0.point_map_info)): qr[i] += self.w[i] cnt += 1 if cnt > self.iter_limit: @@ -86,11 +91,11 @@ def astep(self, q0, logp): cnt = 0 q[i] = nr.uniform(ql[i], qr[i]) - while logp(q) < y: # Changed leq to lt, to accomodate for locally flat posteriors + while logp(q_ra) < y: # Changed leq to lt, to accomodate for locally flat posteriors # Sample uniformly from slice - if q[i] > q0[i]: + if q[i] > q0_val[i]: qr[i] = q[i] - elif q[i] < q0[i]: + elif q[i] < q0_val[i]: ql[i] = q[i] q[i] = nr.uniform(ql[i], qr[i]) cnt += 1 diff --git a/pymc3/tests/sampler_fixtures.py b/pymc3/tests/sampler_fixtures.py index d0a22745b3..03e79a35f8 100644 --- a/pymc3/tests/sampler_fixtures.py +++ b/pymc3/tests/sampler_fixtures.py @@ -16,6 +16,7 @@ import arviz as az import numpy as np import numpy.testing as npt +import pytest from scipy import stats @@ -81,7 +82,7 @@ class NormalFixture(KnownMean, KnownVariance, KnownCDF): @classmethod def make_model(cls): with pm.Model() as model: - a = pm.Normal("a", mu=2, sigma=np.sqrt(3), shape=10) + a = pm.Normal("a", mu=2, sigma=np.sqrt(3), size=10) return model @@ -91,7 +92,7 @@ class BetaBinomialFixture(KnownCDF): @classmethod def make_model(cls): with pm.Model() as model: - p = pm.Beta("p", [0.5, 0.5, 1.0], [0.5, 0.5, 1.0], shape=3) + p = pm.Beta("p", [0.5, 0.5, 1.0], [0.5, 0.5, 1.0], size=3) pm.Binomial("y", p=p, n=[4, 12, 9], observed=[1, 2, 9]) return model @@ -121,7 +122,7 @@ class LKJCholeskyCovFixture(KnownCDF): def make_model(cls): with pm.Model() as model: sd_mu = np.array([1, 2, 3, 4, 5]) - sd_dist = pm.Lognormal.dist(mu=sd_mu, sigma=sd_mu / 10.0, shape=5) + sd_dist = pm.Lognormal.dist(mu=sd_mu, sigma=sd_mu / 10.0, size=5) chol_packed = pm.LKJCholeskyCov("chol_packed", eta=3, n=5, sd_dist=sd_dist) chol = pm.expand_packed_triangular(5, chol_packed, lower=True) cov = aet.dot(chol, chol.T) @@ -140,17 +141,26 @@ def setup_class(cls): cls.model = cls.make_model() with cls.model: cls.step = cls.make_step() - cls.trace = pm.sample(cls.n_samples, tune=cls.tune, step=cls.step, cores=cls.chains) + cls.trace = pm.sample( + cls.n_samples, + tune=cls.tune, + step=cls.step, + cores=cls.chains, + return_inferencedata=False, + compute_convergence_checks=False, + ) cls.samples = {} for var in cls.model.unobserved_RVs: - cls.samples[get_var_name(var)] = cls.trace.get_values(var.tag.value_var, burn=cls.burn) + cls.samples[get_var_name(var)] = cls.trace.get_values(var, burn=cls.burn) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_neff(self): if hasattr(self, "min_n_eff"): n_eff = az.ess(self.trace[self.burn :]) for var in n_eff: npt.assert_array_less(self.min_n_eff, n_eff[var]) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_Rhat(self): rhat = az.rhat(self.trace[self.burn :]) for var in rhat: diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 15fd860981..27f5a0b827 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -230,7 +230,11 @@ def build_model(distfam, valuedomain, vardomains, extra_args=None): v_at.name = v param_vars[v] = v_at param_vars.update(extra_args) - distfam("value", **param_vars, transform=None) + distfam( + "value", + **param_vars, + transform=None, + ) return m, param_vars @@ -643,28 
+647,8 @@ def logp_reference(args): domains = paramdomains.copy() domains["value"] = domain for pt in product(domains, n_samples=n_samples): - pt = dict(pt) - pt_d = {} - for k, v in pt.items(): - rv_var = model.named_vars.get(k) - nv = param_vars.get(k, rv_var) - nv = getattr(nv.tag, "value_var", nv) - - transform = getattr(nv.tag, "transform", None) - if transform: - # TODO: The compiled graph behind this should be cached and - # reused (if it isn't already). - v = transform.forward(rv_var, v).eval() - - if nv.name in param_vars: - # Update the shared parameter variables in `param_vars` - param_vars[nv.name].set_value(v) - else: - # Create an argument entry for the (potentially - # transformed) "value" variable - pt_d[nv.name] = v - + pt_d = self._model_input_dict(model, param_vars, pt) pt_logp = Point(pt_d, model=model) pt_ref = Point(pt, filter_model_vars=False, model=model) assert_almost_equal( @@ -674,6 +658,30 @@ def logp_reference(args): err_msg=str(pt), ) + def _model_input_dict(self, model, param_vars, pt): + """Create a dict with only the necessary, transformed logp inputs.""" + pt_d = {} + for k, v in pt.items(): + rv_var = model.named_vars.get(k) + nv = param_vars.get(k, rv_var) + nv = getattr(nv.tag, "value_var", nv) + + transform = getattr(nv.tag, "transform", None) + if transform: + # todo: the compiled graph behind this should be cached and + # reused (if it isn't already). + v = transform.forward(rv_var, v).eval() + + if nv.name in param_vars: + # update the shared parameter variables in `param_vars` + param_vars[nv.name].set_value(v) + else: + # create an argument entry for the (potentially + # transformed) "value" variable + pt_d[nv.name] = v + + return pt_d + def check_logcdf( self, pymc3_dist, @@ -970,6 +978,7 @@ def test_normal(self): R, {"mu": R, "sigma": Rplus}, lambda value, mu, sigma: sp.norm.logcdf(value, mu, sigma), + decimal=select_by_precision(float64=6, float32=1), ) @pytest.mark.xfail(reason="Distribution not refactored yet") @@ -1012,11 +1021,7 @@ def test_chi_squared(self): ) @pytest.mark.xfail(reason="Distribution not refactored yet") - @pytest.mark.xfail( - condition=(aesara.config.floatX == "float32"), - reason="Poor CDF in SciPy. See scipy/scipy#869 for details.", - ) - def test_wald_scipy(self): + def test_wald_logp(self): self.check_logp( Wald, Rplus, @@ -1024,6 +1029,13 @@ def test_wald_scipy(self): lambda value, mu, alpha: sp.invgauss.logpdf(value, mu=mu, loc=alpha), decimal=select_by_precision(float64=6, float32=1), ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Poor CDF in SciPy. 
See scipy/scipy#869 for details.", + ) + def test_wald_logcdf(self): self.check_logcdf( Wald, Rplus, @@ -1075,6 +1087,7 @@ def test_beta(self): {"alpha": Rplus, "beta": Rplus}, lambda value, alpha, beta: sp.beta.logcdf(value, alpha, beta), n_samples=10, + decimal=select_by_precision(float64=5, float32=3), ) @pytest.mark.xfail(reason="Distribution not refactored yet") @@ -1158,7 +1171,6 @@ def modified_scipy_hypergeom_logcdf(value, N, k, n): {"N": NatSmall, "k": NatSmall, "n": NatSmall}, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_negative_binomial(self): def scipy_mu_alpha_logpmf(value, mu, alpha): return sp.nbinom.logpmf(value, alpha, 1 - mu / (mu + alpha)) @@ -1337,7 +1349,7 @@ def test_gamma_logcdf(self): skip_paramdomain_outside_edge_test=True, ) - def test_inverse_gamma(self): + def test_inverse_gamma_logp(self): self.check_logp( InverseGamma, Rplus, @@ -1346,6 +1358,14 @@ def test_inverse_gamma(self): ) # pymc-devs/aesara#224: skip_paramdomain_outside_edge_test has to be set # True to avoid triggering a C-level assertion in the Aesara GammaQ function + + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to numerical issues", + ) + def test_inverse_gamma_logcdf(self): + # pymc-devs/aesara#224: skip_paramdomain_outside_edge_test has to be set + # True to avoid triggering a C-level assertion in the Aesara GammaQ function # in gamma.c file. Can be set back to False (default) once that issue is solved self.check_logcdf( InverseGamma, @@ -1387,18 +1407,21 @@ def test_pareto(self): lambda value, alpha, m: sp.pareto.logcdf(value, alpha, scale=m), ) - @pytest.mark.xfail( - condition=(aesara.config.floatX == "float32"), - reason="Fails on float32 due to inf issues", - ) @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_weibull(self): + def test_weibull_logp(self): self.check_logp( Weibull, Rplus, {"alpha": Rplusbig, "beta": Rplusbig}, lambda value, alpha, beta: sp.exponweib.logpdf(value, 1, alpha, scale=beta), ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to inf issues", + ) + def test_weibull_logcdf(self): self.check_logcdf( Weibull, Rplus, @@ -1448,29 +1471,42 @@ def test_binomial(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") - @pytest.mark.xfail( - condition=(SCIPY_VERSION < parse("1.4.0")), reason="betabinom is new in Scipy 1.4.0" - ) @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_beta_binomial(self): + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") + def test_beta_binomial_distribution(self): self.checkd( BetaBinomial, Nat, {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + @pytest.mark.skipif( + condition=(SCIPY_VERSION < parse("1.4.0")), reason="betabinom is new in Scipy 1.4.0" + ) + def test_beta_binomial_logp(self): self.check_logp( BetaBinomial, Nat, {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, lambda value, alpha, beta, n: sp.betabinom.logpmf(value, a=alpha, b=beta, n=n), ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.skipif( + condition=(SCIPY_VERSION < 
parse("1.4.0")), reason="betabinom is new in Scipy 1.4.0" + ) + def test_beta_binomial_logcdf(self): self.check_logcdf( BetaBinomial, Nat, {"alpha": Rplus, "beta": Rplus, "n": NatSmall}, lambda value, alpha, beta, n: sp.betabinom.logcdf(value, a=alpha, b=beta, n=n), ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_beta_binomial_selfconsistency(self): self.check_selfconsistency_discrete_logcdf( BetaBinomial, Nat, @@ -1563,14 +1599,20 @@ def test_constantdist(self): self.check_logp(Constant, I, {"c": I}, lambda value, c: np.log(c == value)) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_zeroinflatedpoisson(self): + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to inf issues", + ) + def test_zeroinflatedpoisson_distribution(self): self.checkd( ZeroInflatedPoisson, Nat, {"theta": Rplus, "psi": Unit}, ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_zeroinflatedpoisson_logcdf(self): self.check_selfconsistency_discrete_logcdf( ZeroInflatedPoisson, Nat, @@ -1578,14 +1620,20 @@ def test_zeroinflatedpoisson(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_zeroinflatednegativebinomial(self): + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Fails on float32 due to inf issues", + ) + def test_zeroinflatednegativebinomial_distribution(self): self.checkd( ZeroInflatedNegativeBinomial, Nat, {"mu": Rplusbig, "alpha": Rplusbig, "psi": Unit}, ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_zeroinflatednegativebinomial_logcdf(self): self.check_selfconsistency_discrete_logcdf( ZeroInflatedNegativeBinomial, Nat, @@ -1594,7 +1642,6 @@ def test_zeroinflatednegativebinomial(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") def test_zeroinflatedbinomial(self): self.checkd( @@ -1602,6 +1649,9 @@ def test_zeroinflatedbinomial(self): Nat, {"n": NatSmall, "p": Unit, "psi": Unit}, ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_zeroinflatedbinomial_logcdf(self): self.check_selfconsistency_discrete_logcdf( ZeroInflatedBinomial, Nat, @@ -1609,7 +1659,6 @@ def test_zeroinflatedbinomial(self): n_samples=10, ) - @pytest.mark.xfail(reason="Distribution not refactored yet") @pytest.mark.parametrize("n", [1, 2, 3]) def test_mvnormal(self, n): self.check_logp( @@ -1617,6 +1666,7 @@ def test_mvnormal(self, n): RealMatrix(5, n), {"mu": Vector(R, n), "tau": PdMatrix(n)}, normal_logpdf_tau, + extra_args={"size": 5}, ) self.check_logp( MvNormal, @@ -1629,6 +1679,7 @@ def test_mvnormal(self, n): RealMatrix(5, n), {"mu": Vector(R, n), "cov": PdMatrix(n)}, normal_logpdf_cov, + extra_args={"size": 5}, ) self.check_logp( MvNormal, @@ -1642,6 +1693,7 @@ def test_mvnormal(self, n): {"mu": Vector(R, n), "chol": PdMatrixChol(n)}, normal_logpdf_chol, decimal=select_by_precision(float64=6, float32=-1), + extra_args={"size": 5}, ) self.check_logp( MvNormal, @@ -1650,23 
+1702,19 @@ def test_mvnormal(self, n): normal_logpdf_chol, decimal=select_by_precision(float64=6, float32=0), ) - - def MvNormalUpper(*args, **kwargs): - return MvNormal(lower=False, *args, **kwargs) - self.check_logp( - MvNormalUpper, + MvNormal, Vector(R, n), {"mu": Vector(R, n), "chol": PdMatrixCholUpper(n)}, normal_logpdf_chol_upper, decimal=select_by_precision(float64=6, float32=0), + extra_args={"lower": False}, ) @pytest.mark.xfail( condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to inf issues", ) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_indef(self): cov_val = np.array([[1, 0.5], [0.5, -2]]) cov = aet.matrix("cov") @@ -1681,14 +1729,13 @@ def test_mvnormal_indef(self): f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) - logp = logp(MvNormal.dist(mu=mu, tau=cov), x) + logp = logpt(MvNormal.dist(mu=mu, tau=cov), x) f_logp = aesara.function([cov, x], logp) assert f_logp(cov_val, np.ones(2)) == -np.inf dlogp = aet.grad(logp, cov) f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) - @pytest.mark.xfail(reason="Distribution not refactored yet") def test_mvnormal_init_fail(self): with Model(): with pytest.raises(ValueError): @@ -1873,10 +1920,13 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): with pm.Model() as model: d = pm.Dirichlet("d", a=a) + # Generate sample points to test d_value = d.tag.value_var - d_point = d.eval() + d_point = d.eval().astype("float64") + d_point /= d_point.sum(axis=-1)[..., None] + if hasattr(d_value.tag, "transform"): - d_point_trans = d_value.tag.transform.forward(d, d_point).eval() + d_point_trans = d_value.tag.transform.forward(d, aet.as_tensor(d_point)).eval() else: d_point_trans = d_point @@ -2414,15 +2464,23 @@ def test_rice(self): lambda value, b, sigma: sp.rice.logpdf(value, b=b, loc=0, scale=sigma), ) - @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail(reason="Distribution not refactored yet") - def test_moyal(self): + def test_moyal_logp(self): + # Using a custom domain, because the standard `R` domain undeflows with scipy in float64 + value_domain = Domain([-inf, -1.5, -1, -0.01, 0.0, 0.01, 1, 1.5, inf]) self.check_logp( Moyal, - R, + value_domain, {"mu": R, "sigma": Rplusbig}, lambda value, mu, sigma: floatX(sp.moyal.logpdf(value, mu, sigma)), ) + + @pytest.mark.xfail(reason="Distribution not refactored yet") + @pytest.mark.xfail( + condition=(aesara.config.floatX == "float32"), + reason="Pymc3 underflows earlier than scipy on float32", + ) + def test_moyal_logcdf(self): self.check_logcdf( Moyal, R, @@ -2726,6 +2784,18 @@ def test_issue_3051(self, dims, dist_cls, kwargs): assert isinstance(actual_a, np.ndarray) assert actual_a.shape == (X.shape[0],) + @pytest.mark.xfail(reason="Distribution not refactored yet") + def test_issue_4499(self): + # Test for bug in Uniform and DiscreteUniform logp when setting check_bounds = False + # https://github.com/pymc-devs/pymc3/issues/4499 + with pm.Model(check_bounds=False) as m: + x = pm.Uniform("x", 0, 2, shape=10, transform=None) + assert_almost_equal(m.logp_array(np.ones(10)), -np.log(2) * 10) + + with pm.Model(check_bounds=False) as m: + x = pm.DiscreteUniform("x", 0, 1, shape=10) + assert_almost_equal(m.logp_array(np.ones(10)), -np.log(2) * 10) + @pytest.mark.xfail(reason="DensityDist no longer supported") def test_serialize_density_dist(): diff 
--git a/pymc3/tests/test_ndarray_backend.py b/pymc3/tests/test_ndarray_backend.py index b4744373ed..c46ed73a2e 100644 --- a/pymc3/tests/test_ndarray_backend.py +++ b/pymc3/tests/test_ndarray_backend.py @@ -272,8 +272,8 @@ def test_sample_posterior_predictive(self, tmpdir_factory): ppc = pm.sample_posterior_predictive(self.trace) with TestSaveLoad.model() as model: - model.default_rng.get_value(borrow=True).seed(10) trace2 = pm.load_trace(directory) + model.default_rng.get_value(borrow=True).seed(10) ppc2 = pm.sample_posterior_predictive(trace2) ppc2f = pm.sample_posterior_predictive(trace2) diff --git a/pymc3/tests/test_posteriors.py b/pymc3/tests/test_posteriors.py index 8ac068bd75..91ecd05f1f 100644 --- a/pymc3/tests/test_posteriors.py +++ b/pymc3/tests/test_posteriors.py @@ -76,6 +76,7 @@ class TestNUTSBetaBinomial(sf.NutsFixture, sf.BetaBinomialFixture): min_n_eff = 400 +@pytest.mark.xfail(reason="StudentT not refactored for v4") class TestNUTSStudentT(sf.NutsFixture, sf.StudentTFixture): n_samples = 10000 tune = 1000 @@ -97,6 +98,7 @@ class TestNUTSNormalLong(sf.NutsFixture, sf.NormalFixture): atol = 0.001 +@pytest.mark.xfail(reason="StudentT not refactored for v4") class TestNUTSLKJCholeskyCov(sf.NutsFixture, sf.LKJCholeskyCovFixture): n_samples = 2000 tune = 1000 diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index ff1d1473f0..150675b43c 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -159,13 +159,19 @@ def test_trace_report(self, step_cls, discard): # add more variables, because stats are 2D with CompoundStep! pm.Uniform("uni") trace = pm.sample( - draws=100, tune=50, cores=1, discard_tuned_samples=discard, step=step_cls() + draws=100, + tune=50, + cores=1, + discard_tuned_samples=discard, + step=step_cls(), + compute_convergence_checks=False, + return_inferencedata=False, ) assert trace.report.n_tune == 50 assert trace.report.n_draws == 100 assert isinstance(trace.report.t_sampling, float) - pass + @pytest.mark.xfail(reason="BART not refactored for v4") def test_trace_report_bart(self): X = np.random.normal(0, 1, size=(3, 250)).T Y = np.random.normal(0, 1, size=250) @@ -218,7 +224,6 @@ def test_return_inferencedata(self, monkeypatch): monkeypatch.setattr("pymc3.__version__", "3.10") with pytest.warns(FutureWarning, match="pass return_inferencedata"): result = pm.sample(**kwargs) - pass @pytest.mark.parametrize("cores", [1, 2]) def test_sampler_stat_tune(self, cores): @@ -228,15 +233,14 @@ def test_sampler_stat_tune(self, cores): ).get_sampler_stats("tune", chains=1) assert list(tune_stat).count(True) == 5 assert list(tune_stat).count(False) == 7 - pass @pytest.mark.parametrize( "start, error", [ ([1, 2], TypeError), - ({"x": 1}, ValueError), + ({"x": 1}, TypeError), ({"x": [1, 2, 3]}, ValueError), - ({"x": np.array([[1, 1], [1, 1]])}, ValueError), + ({"x": np.array([[1, 1], [1, 1]])}, TypeError), ], ) def test_sample_start_bad_shape(self, start, error): @@ -285,6 +289,7 @@ def callback(trace, draw): assert len(trace) == trace_cancel_length +@pytest.mark.xfail(reason="Lognormal not refactored for v4") def test_sample_find_MAP_does_not_modify_start(): # see https://github.com/pymc-devs/pymc3/pull/4458 with pm.Model(): @@ -319,6 +324,7 @@ def test_partial_trace_sample(): a = pm.Normal("a", mu=0, sigma=1) b = pm.Normal("b", mu=0, sigma=1) trace = pm.sample(trace=[a]) + # TODO: Assert something to make this a real test @pytest.mark.parametrize( @@ -358,13 +364,13 @@ def test_shared_named(self): "theta0", 
mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), - shape=(1, 1), + size=(1, 1), testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) - res = theta.random() + res = theta.eval() assert np.isclose(res, 0.0) def test_shared_unnamed(self): @@ -374,13 +380,13 @@ def test_shared_unnamed(self): "theta0", mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), - shape=(1, 1), + size=(1, 1), testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) - res = theta.random() + res = theta.eval() assert np.isclose(res, 0.0) def test_constant_named(self): @@ -390,14 +396,14 @@ def test_constant_named(self): "theta0", mu=np.atleast_2d(0), tau=np.atleast_2d(1e20), - shape=(1, 1), + size=(1, 1), testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), size=(1, 1) ) - res = theta.random() + res = theta.eval() assert np.isclose(res, 0.0) @@ -420,12 +426,16 @@ def test_normal_scalar(self): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=0.0) - trace = pm.sample(draws=ndraws, chains=nchains) + trace = pm.sample( + draws=ndraws, + chains=nchains, + return_inferencedata=False, + ) with model: # test list input ppc0 = pm.sample_posterior_predictive([model.test_point], samples=10) - # deprecated argument is not introduced to fast version [2019/08/20:rpg] + # # deprecated argument is not introduced to fast version [2019/08/20:rpg] ppc = pm.sample_posterior_predictive(trace, var_names=["a"]) # test empty ppc ppc = pm.sample_posterior_predictive(trace, var_names=[]) @@ -435,11 +445,6 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, keep_size=True) assert ppc["a"].shape == (nchains, ndraws) - # test keep_size parameter and idata input - idata = az.from_pymc3(trace) - ppc = pm.sample_posterior_predictive(idata, keep_size=True) - assert ppc["a"].shape == (nchains, ndraws) - # test default case ppc = pm.sample_posterior_predictive(trace, var_names=["a"]) assert "a" in ppc @@ -453,11 +458,28 @@ def test_normal_scalar(self): ppc = pm.sample_posterior_predictive(trace, size=5, var_names=["a"]) assert ppc["a"].shape == (nchains * ndraws, 5) + @pytest.mark.xfail(reason="Arviz not refactored for v4") + def test_normal_scalar_idata(self): + nchains = 2 + ndraws = 500 + with pm.Model() as model: + mu = pm.Normal("mu", 0.0, 1.0) + a = pm.Normal("a", mu=mu, sigma=1, observed=0.0) + trace = pm.sample( + draws=ndraws, chains=nchains, return_inferencedata=True, discard_tuned_samples=False + ) + + with model: + # test keep_size parameter and idata input + idata = az.from_pymc3(trace) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) + assert ppc["a"].shape == (nchains, ndraws) + def test_normal_vector(self, caplog): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.5, 0.2])) - trace = pm.sample() + trace = pm.sample(return_inferencedata=False) with model: # test list input @@ -473,10 +495,6 @@ def test_normal_vector(self, caplog): assert "a" in ppc assert ppc["a"].shape == (12, 2) - # test keep_size parameter with inference data as input... 
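These test updates showcase two idioms used throughout the patch: distributions take `size` rather than `shape`, and a draw is obtained by evaluating the variable instead of calling `.random()`. Minimal sketch:

with pm.Model():
    theta = pm.Normal("theta", mu=0.0, tau=1e20, size=(1, 1))
    draw = theta.eval()   # replaces the old theta.random()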
- idata = az.from_pymc3(trace) - ppc = pm.sample_posterior_predictive(idata, keep_size=True) - assert ppc["a"].shape == (trace.nchains, len(trace), 2) with pytest.warns(UserWarning): ppc = pm.sample_posterior_predictive(trace, samples=12, var_names=["a"]) assert "a" in ppc @@ -487,6 +505,19 @@ def test_normal_vector(self, caplog): assert "a" in ppc assert ppc["a"].shape == (10, 4, 2) + @pytest.mark.xfail(reason="Arviz not refactored for v4") + def test_normal_vector_idata(self, caplog): + with pm.Model() as model: + mu = pm.Normal("mu", 0.0, 1.0) + a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.5, 0.2])) + trace = pm.sample(return_inferencedata=False) + + with model: + # test keep_size parameter with inference data as input... + idata = az.from_pymc3(trace) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) + assert ppc["a"].shape == (trace.nchains, len(trace), 2) + def test_exceptions(self, caplog): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) @@ -542,6 +573,7 @@ def test_sum_normal(self): _, pval = stats.kstest(ppc["b"], stats.norm(scale=scale).cdf) assert pval > 0.001 + @pytest.mark.xfail(reason="HalfFlat not refactored for v4") def test_model_not_drawable_prior(self): data = np.random.poisson(lam=10, size=200) model = pm.Model() @@ -567,7 +599,7 @@ def test_model_shared_variable(self): logistic = pm.Deterministic("p", pm.math.sigmoid(coeff * x_shared)) obs = pm.Bernoulli("obs", p=logistic, observed=y_shared) - trace = pm.sample(100) + trace = pm.sample(100, return_inferencedata=False, compute_convergence_checks=False) x_shared.set_value([-1, 0, 1.0]) y_shared.set_value([0, 0, 0]) @@ -599,15 +631,22 @@ def test_deterministic_of_observed(self): out_diff = in_1 + in_2 pm.Deterministic("out", out_diff) - trace = pm.sample(100, chains=nchains) - np.random.seed(0) + model.default_rng.get_value(borrow=True).seed(0) + trace = pm.sample( + 100, + chains=nchains, + return_inferencedata=False, + compute_convergence_checks=False, + ) + rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 - np.random.seed(0) + model.default_rng.get_value(borrow=True).seed(0) ppc = pm.sample_posterior_predictive( model=model, trace=trace, samples=len(trace) * nchains, + random_seed=0, var_names=[var.name for var in (model.deterministics + model.basic_RVs)], ) @@ -627,7 +666,11 @@ def test_deterministic_of_observed_modified_interface(self): out_diff = in_1 + in_2 pm.Deterministic("out", out_diff) - trace = pm.sample(100) + trace = pm.sample( + 100, + return_inferencedata=False, + compute_convergence_checks=False, + ) ppc_trace = pm.trace_to_dataframe( trace, varnames=[n for n in trace.varnames if n != "out"] ).to_dict("records") @@ -646,7 +689,7 @@ def test_variable_type(self): mu = pm.HalfNormal("mu", 1) a = pm.Normal("a", mu=mu, sigma=2, observed=np.array([1, 2])) b = pm.Poisson("b", mu, observed=np.array([1, 2])) - trace = pm.sample() + trace = pm.sample(compute_convergence_checks=False, return_inferencedata=False) with model: ppc = pm.sample_posterior_predictive(trace, samples=1) @@ -667,6 +710,7 @@ def test_potentials_warning(self): class TestSamplePPCW(SeededTest): + @pytest.mark.xfail(reason="sample_posterior_predictive_w not refactored for v4") def test_sample_posterior_predictive_w(self): data0 = np.random.normal(0, 1, size=50) warning_msg = "The number of samples is too small to check convergence reliably" @@ -676,14 +720,14 @@ def test_sample_posterior_predictive_w(self): y = pm.Normal("y", mu=mu, sigma=1, observed=data0) with pytest.warns(UserWarning, 
match=warning_msg): trace_0 = pm.sample(10, tune=0, chains=2, return_inferencedata=False) - idata_0 = az.from_pymc3(trace_0) + idata_0 = az.from_pymc3(trace_0, log_likelihood=False) with pm.Model() as model_1: - mu = pm.Normal("mu", mu=0, sigma=1, shape=len(data0)) + mu = pm.Normal("mu", mu=0, sigma=1, size=len(data0)) y = pm.Normal("y", mu=mu, sigma=1, observed=data0) with pytest.warns(UserWarning, match=warning_msg): trace_1 = pm.sample(10, tune=0, chains=2, return_inferencedata=False) - idata_1 = az.from_pymc3(trace_1) + idata_1 = az.from_pymc3(trace_1, log_likelihood=False) with pm.Model() as model_2: # Model with no observed RVs. @@ -716,6 +760,7 @@ def test_sample_posterior_predictive_w(self): ): pm.sample_posterior_predictive_w([trace_0, trace_2], 100, [model_0, model_2]) + @pytest.mark.xfail(reason="sample_posterior_predictive_w not refactored for v4") def test_potentials_warning(self): warning_msg = "The effect of Potentials on other parameters is ignored during" with pm.Model() as m: @@ -731,32 +776,33 @@ def test_potentials_warning(self): @pytest.mark.parametrize( "method", [ - "jitter+adapt_diag", - "adapt_diag", "advi", "ADVI+adapt_diag", "advi+adapt_diag_grad", - "map", "advi_map", + "jitter+adapt_diag", + "adapt_diag", + "map", "adapt_full", "jitter+adapt_full", ], ) +@pytest.mark.xfail(reason="ADVI not refactored for v4", exception=NotImplementedError) def test_exec_nuts_init(method): with pm.Model() as model: - pm.Normal("a", mu=0, sigma=1, shape=2) + pm.Normal("a", mu=0, sigma=1, size=2) pm.HalfNormal("b", sigma=1) with model: start, _ = pm.init_nuts(init=method, n_init=10) assert isinstance(start, list) assert len(start) == 1 assert isinstance(start[0], dict) - assert "a" in start[0] and "b_log__" in start[0] + assert "a" in start[0] and "b" in start[0] start, _ = pm.init_nuts(init=method, n_init=10, chains=2) assert isinstance(start, list) assert len(start) == 2 assert isinstance(start[0], dict) - assert "a" in start[0] and "b_log__" in start[0] + assert "a" in start[0] and "b" in start[0] @pytest.mark.parametrize( @@ -843,8 +889,8 @@ def test_ignores_observed(self): def test_respects_shape(self): for shape in (2, (2,), (10, 2), (10, 10)): with pm.Model(): - mu = pm.Gamma("mu", 3, 1, shape=1) - goals = pm.Poisson("goals", mu, shape=shape) + mu = pm.Gamma("mu", 3, 1, size=1) + goals = pm.Poisson("goals", mu, size=shape) trace1 = pm.sample_prior_predictive(10, var_names=["mu", "mu", "goals"]) trace2 = pm.sample_prior_predictive(10, var_names=["mu", "goals"]) if shape == 2: # want to test shape as an int @@ -854,32 +900,34 @@ def test_respects_shape(self): def test_multivariate(self): with pm.Model(): - m = pm.Multinomial("m", n=5, p=np.array([0.25, 0.25, 0.25, 0.25]), shape=4) + m = pm.Multinomial("m", n=5, p=np.array([0.25, 0.25, 0.25, 0.25])) trace = pm.sample_prior_predictive(10) - assert m.random(size=10).shape == (10, 4) assert trace["m"].shape == (10, 4) def test_multivariate2(self): # Added test for issue #3271 mn_data = np.random.multinomial(n=100, pvals=[1 / 6.0] * 6, size=10) with pm.Model() as dm_model: - probs = pm.Dirichlet("probs", a=np.ones(6), shape=6) + probs = pm.Dirichlet("probs", a=np.ones(6)) obs = pm.Multinomial("obs", n=100, p=probs, observed=mn_data) - burned_trace = pm.sample(20, tune=10, cores=1) + burned_trace = pm.sample( + 20, tune=10, cores=1, return_inferencedata=False, compute_convergence_checks=False + ) sim_priors = pm.sample_prior_predictive(samples=20, model=dm_model) sim_ppc = pm.sample_posterior_predictive(burned_trace, samples=20, 
model=dm_model) assert sim_priors["probs"].shape == (20, 6) - assert sim_priors["obs"].shape == (20,) + obs.distribution.shape - assert sim_ppc["obs"].shape == (20,) + obs.distribution.shape + assert sim_priors["obs"].shape == (20,) + mn_data.shape + assert sim_ppc["obs"].shape == (20,) + mn_data.shape def test_layers(self): with pm.Model() as model: - a = pm.Uniform("a", lower=0, upper=1, shape=10) - b = pm.Binomial("b", n=1, p=a, shape=10) + a = pm.Uniform("a", lower=0, upper=1, size=10) + b = pm.Binomial("b", n=1, p=a, size=10) - avg = b.random(size=10000).mean(axis=0) - npt.assert_array_almost_equal(avg, 0.5 * np.ones_like(b), decimal=2) + b_sampler = aesara.function([], b) + avg = np.stack([b_sampler() for i in range(10000)]).mean(0) + npt.assert_array_almost_equal(avg, 0.5 * np.ones((10,)), decimal=2) def test_transformed(self): n = 18 @@ -893,14 +941,14 @@ def test_transformed(self): kappa_log = pm.Exponential("logkappa", lam=5.0) kappa = pm.Deterministic("kappa", aet.exp(kappa_log)) - thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, shape=n) + thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, size=n) y = pm.Binomial("y", n=at_bats, p=thetas, observed=hits) gen = pm.sample_prior_predictive(draws) assert gen["phi"].shape == (draws,) assert gen["y"].shape == (draws, n) - assert "thetas_logodds__" in gen + assert "thetas" in gen def test_shared(self): n1 = 10 @@ -921,6 +969,7 @@ def test_shared(self): assert gen2["y"].shape == (draws, n2) + @pytest.mark.xfail(reason="DensityDist not refactored for v4") def test_density_dist(self): obs = np.random.normal(-1, 0.1, size=10) with pm.Model(): @@ -938,26 +987,28 @@ def test_density_dist(self): def test_shape_edgecase(self): with pm.Model(): - mu = pm.Normal("mu", shape=5) + mu = pm.Normal("mu", size=5) sd = pm.Uniform("sd", lower=2, upper=3) - x = pm.Normal("x", mu=mu, sigma=sd, shape=5) + x = pm.Normal("x", mu=mu, sigma=sd, size=5) prior = pm.sample_prior_predictive(10) assert prior["mu"].shape == (10, 5) + @pytest.mark.xfail(reason="ZeroInflatedPoisson not refactored for v4") def test_zeroinflatedpoisson(self): with pm.Model(): theta = pm.Beta("theta", alpha=1, beta=1) psi = pm.HalfNormal("psi", sd=1) - pm.ZeroInflatedPoisson("suppliers", psi=psi, theta=theta, shape=20) + pm.ZeroInflatedPoisson("suppliers", psi=psi, theta=theta, size=20) gen_data = pm.sample_prior_predictive(samples=5000) assert gen_data["theta"].shape == (5000,) assert gen_data["psi"].shape == (5000,) assert gen_data["suppliers"].shape == (5000, 20) + @pytest.mark.xfail(reason="Bound not refactored for v4") def test_bounded_dist(self): with pm.Model() as model: BoundedNormal = pm.Bound(pm.Normal, lower=0.0) - x = BoundedNormal("x", mu=aet.zeros((3, 1)), sd=1 * aet.ones((3, 1)), shape=(3, 1)) + x = BoundedNormal("x", mu=aet.zeros((3, 1)), sd=1 * aet.ones((3, 1)), size=(3, 1)) with model: prior_trace = pm.sample_prior_predictive(5) @@ -980,6 +1031,7 @@ def test_point_list_arg_bug_spp(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive([trace[15]], var_names=["d"]) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture @@ -989,6 +1041,7 @@ def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): with pmodel: pp = pm.sample_posterior_predictive(idat.prior, var_names=["d"]) + @pytest.mark.xfail(reason="Arviz not refactored for v4") def test_sample_from_xarray_posterior(self, 
point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture idat = az.from_pymc3(trace) diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index cedf33c37f..ad4407b8d9 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -47,6 +47,8 @@ def check_transform(transform, domain, constructor=aet.dscalar, test=0, rv_var=None): x = constructor("x") x.tag.test_value = test + if rv_var is None: + rv_var = x # test forward and forward_val # FIXME: What's being tested here? That the transformed graph can compile? forward_f = aesara.function([x], transform.forward(rv_var, x)) @@ -63,6 +65,8 @@ def check_vector_transform(transform, domain, rv_var=None): def get_values(transform, domain=R, constructor=aet.dscalar, test=0, rv_var=None): x = constructor("x") x.tag.test_value = test + if rv_var is None: + rv_var = x f = aesara.function([x], transform.backward(rv_var, x)) return np.array([f(val) for val in domain.vals]) @@ -79,6 +83,9 @@ def check_jacobian_det( y = constructor("y") y.tag.test_value = test + if rv_var is None: + rv_var = y + x = transform.backward(rv_var, y) if make_comparable: x = make_comparable(x) @@ -125,7 +132,7 @@ def test_stickbreaking_accuracy(): x = aet.dvector("x") x.tag.test_value = val identity_f = aesara.function( - [x], tr.stick_breaking.forward(None, tr.stick_breaking.backward(None, x)) + [x], tr.stick_breaking.forward(x, tr.stick_breaking.backward(x, x)) ) close_to(val, identity_f(val), tol) @@ -300,7 +307,7 @@ def check_vectortransform_elementwise_logp(self, model, vect_opt=0): assert logpt(x).ndim == jacob_det.ndim # Hack to get relative tolerance - a = logpt(x, array, jacobian=False).eval() + a = logpt(x, array.astype(aesara.config.floatX), jacobian=False).eval() b = logp_nojac.eval() close_to(a, b, np.abs(0.5 * (a + b) * tol)) diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py index ad2ae9ed3d..3475f0cbac 100644 --- a/pymc3/tuning/starting.py +++ b/pymc3/tuning/starting.py @@ -29,7 +29,7 @@ import pymc3 as pm from pymc3.aesaraf import inputvars -from pymc3.blocking import DictToArrayBijection +from pymc3.blocking import DictToArrayBijection, RaveledVars from pymc3.model import Point, modelcontext from pymc3.util import ( check_start_vals, @@ -107,14 +107,22 @@ def find_MAP( start = Point(start, model=model) - logp_func = DictToArrayBijection.mapf(model.fastlogp_nojac) x0 = DictToArrayBijection.map(start) + # TODO: If the mapping is fixed, we can simply create graphs for the + # mapping and avoid all this bijection overhead + def logp_func(x): + return DictToArrayBijection.mapf(model.fastlogp_nojac)(RaveledVars(x, x0.point_map_info)) + try: # This might be needed for calls to `dlogp_func` # start_map_info = tuple((v.name, v.shape, v.dtype) for v in vars) - dlogp_func = DictToArrayBijection.mapf(model.fastdlogp_nojac(vars)) + def dlogp_func(x): + return DictToArrayBijection.mapf(model.fastdlogp_nojac(vars))( + RaveledVars(x, x0.point_map_info) + ) + compute_gradient = True except (AttributeError, NotImplementedError, tg.NullTypeGradError): compute_gradient = False @@ -135,7 +143,9 @@ def find_MAP( cost_func = CostFuncWrapper(maxeval, progressbar, logp_func) try: - opt_result = minimize(cost_func, x0, method=method, jac=compute_gradient, *args, **kwargs) + opt_result = minimize( + cost_func, x0.data, method=method, jac=compute_gradient, *args, **kwargs + ) mx0 = opt_result["x"] # r -> opt_result except (KeyboardInterrupt, StopIteration) as e: mx0, opt_result = cost_func.previous_x, None @@ 
-149,6 +159,8 @@ def find_MAP( cost_func.progress.update(last_v) print() + mx0 = RaveledVars(mx0, x0.point_map_info) + vars = get_default_varnames( [v.tag.value_var for v in model.unobserved_RVs], include_transformed ) diff --git a/pymc3/util.py b/pymc3/util.py index 459968aa28..a4deff209a 100644 --- a/pymc3/util.py +++ b/pymc3/util.py @@ -199,6 +199,10 @@ def check_start_vals(start, model): """ start_points = [start] if isinstance(start, dict) else start for elem in start_points: + + for k, v in elem.items(): + elem[k] = np.asarray(v, dtype=model[k].dtype) + if not set(elem.keys()).issubset(model.named_vars.keys()): extra_keys = ", ".join(set(elem.keys()) - set(model.named_vars.keys())) valid_keys = ", ".join(model.named_vars.keys()) diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index d7f62167e2..bf75a63719 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -63,6 +63,7 @@ from pymc3.model import modelcontext from pymc3.util import get_default_varnames, get_transformed from pymc3.variational.updates import adagrad_window +from pymc3.vartypes import discrete_types __all__ = ["ObjectiveFunction", "Operator", "TestFunction", "Group", "Approximation"] @@ -831,6 +832,9 @@ def __init__( options=None, **kwargs, ): + # XXX: Needs to be refactored for v4 + raise NotImplementedError("This class needs to be refactored for v4") + if local and not self.supports_batched: raise LocalGroupError("%s does not support local groups" % self.__class__) if local and rowwise: @@ -957,7 +961,7 @@ def __init_group__(self, group): # self.ordering = ArrayOrdering([]) self.replacements = dict() for var in self.group: - if isinstance(var.distribution, pm.Discrete): + if var.type.numpy_dtype.name in discrete_types: raise ParametrizationError(f"Discrete variables are not supported by VI: {var}") begin = self.ddim if self.batched:
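
Note: as a quick illustration of the start-value coercion added to check_start_vals in pymc3/util.py above (each user-supplied start value is cast to the dtype of the corresponding model variable before validation), here is a minimal, self-contained sketch. It uses plain dicts and NumPy only; the helper name coerce_start_point and the dtypes mapping are invented for this example and are not part of the PyMC3 API.

import numpy as np

def coerce_start_point(start, dtypes):
    """Cast each start value to its variable's dtype, mirroring the
    loop added to pymc3.util.check_start_vals in this diff."""
    coerced = {}
    for name, value in start.items():
        # Python scalars and lists become ndarrays with the model dtype
        coerced[name] = np.asarray(value, dtype=dtypes[name])
    return coerced

# e.g. a float64 variable "x" and an int64 variable "n":
point = coerce_start_point({"x": 1, "n": [2, 3]}, {"x": "float64", "n": "int64"})
assert point["x"].dtype == np.float64
assert point["n"].dtype == np.int64

The point of the coercion is that downstream code (e.g. the shared-variable replacements and RaveledVars handling introduced elsewhere in this changeset) can then rely on start points already matching the variables' dtypes and shapes, rather than normalizing them at every call site.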