diff --git a/autoPyTorch/pipeline/components/setup/base_setup.py b/autoPyTorch/pipeline/components/setup/base_setup.py
index 43bb41b56..eff6b6e69 100644
--- a/autoPyTorch/pipeline/components/setup/base_setup.py
+++ b/autoPyTorch/pipeline/components/setup/base_setup.py
@@ -1,4 +1,6 @@
-from typing import Any, Dict
+from typing import Any, Dict, Optional
+
+import numpy as np
 
 from autoPyTorch.pipeline.components.base_component import autoPyTorchComponent
 
@@ -7,8 +9,8 @@ class autoPyTorchSetupComponent(autoPyTorchComponent):
     """Provide an abstract interface for schedulers
     in Auto-Pytorch"""
 
-    def __init__(self) -> None:
-        super(autoPyTorchSetupComponent, self).__init__()
+    def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None:
+        super(autoPyTorchSetupComponent, self).__init__(random_state=random_state)
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
index 648c63e0c..00e0d2c2f 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
@@ -89,7 +89,7 @@ class LearnedEntityEmbedding(NetworkEmbeddingComponent):
     Class to learn an embedding for categorical hyperparameters.
     """
 
-    def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None, **kwargs: Any):
+    def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwargs: Any):
         super().__init__(random_state=random_state)
         self.config = kwargs
 
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py
index a8b81af2f..5f7ba28b4 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py
@@ -20,7 +20,7 @@ class NoEmbedding(NetworkEmbeddingComponent):
     Class to learn an embedding for categorical hyperparameters.
     """
 
-    def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
         super().__init__(random_state=random_state)
 
     def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
index 8652c347c..5ae2880ed 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
 
@@ -11,10 +11,9 @@
 
 
 class NetworkEmbeddingComponent(autoPyTorchSetupComponent):
-    def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
-        super().__init__()
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__(random_state=random_state)
         self.embedding: Optional[nn.Module] = None
-        self.random_state = random_state
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
 
diff --git a/autoPyTorch/pipeline/components/training/trainer/GridCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/GridCutMixTrainer.py
index fb22e7cb8..71a170c61 100644
--- a/autoPyTorch/pipeline/components/training/trainer/GridCutMixTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/GridCutMixTrainer.py
@@ -27,11 +27,11 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
         beta = 1.0
-        lam = np.random.beta(beta, beta)
+        lam = self.random_state.beta(beta, beta)
         batch_size, channel, W, H = X.size()
         index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
 
-        r = np.random.rand(1)
+        r = self.random_state.rand(1)
         if beta <= 0 or r > self.alpha:
             return X, {'y_a': y, 'y_b': y[index], 'lam': 1}
 
@@ -40,8 +40,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         cut_rat = np.sqrt(1. - lam)
         cut_w = np.int(W * cut_rat)
         cut_h = np.int(H * cut_rat)
-        cx = np.random.randint(W)
-        cy = np.random.randint(H)
+        cx = self.random_state.randint(W)
+        cy = self.random_state.randint(H)
         bbx1 = np.clip(cx - cut_w // 2, 0, W)
         bby1 = np.clip(cy - cut_h // 2, 0, H)
         bbx2 = np.clip(cx + cut_w // 2, 0, W)
diff --git a/autoPyTorch/pipeline/components/training/trainer/GridCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/GridCutOutTrainer.py
index 37c71d53b..b2fd6151a 100644
--- a/autoPyTorch/pipeline/components/training/trainer/GridCutOutTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/GridCutOutTrainer.py
@@ -24,7 +24,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             np.ndarray: that processes data
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
-        r = np.random.rand(1)
+        r = self.random_state.rand(1)
         batch_size, channel, W, H = X.size()
         if r > self.cutout_prob:
             return X, {'y_a': y, 'y_b': y, 'lam': 1}
@@ -34,8 +34,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         cut_rat = np.sqrt(1. - self.patch_ratio)
         cut_w = np.int(W * cut_rat)
         cut_h = np.int(H * cut_rat)
-        cx = np.random.randint(W)
-        cy = np.random.randint(H)
+        cx = self.random_state.randint(W)
+        cy = self.random_state.randint(H)
         bbx1 = np.clip(cx - cut_w // 2, 0, W)
         bby1 = np.clip(cy - cut_h // 2, 0, H)
         bbx2 = np.clip(cx + cut_w // 2, 0, W)
diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
index b639156bb..f0d8536f9 100644
--- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py
@@ -1,4 +1,3 @@
-import random
 import typing
 
 import numpy as np
@@ -28,11 +27,11 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
         beta = 1.0
-        lam = np.random.beta(beta, beta)
+        lam = self.random_state.beta(beta, beta)
         batch_size = X.size()[0]
         index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
 
-        r = np.random.rand(1)
+        r = self.random_state.rand(1)
         if beta <= 0 or r > self.alpha:
             return X, {'y_a': y, 'y_b': y[index], 'lam': 1}
 
@@ -40,7 +39,9 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         # It is unlikely that the batch size is lower than the number of features, but
         # be safe
         size = min(X.shape[0], X.shape[1])
-        indices = torch.tensor(random.sample(range(1, size), max(1, np.int(size * lam))))
+        # Draw distinct indices, as random.sample did; choice() samples with replacement by default
+        num_indices = max(1, int(size * lam))
+        indices = torch.tensor(self.random_state.choice(range(1, size), num_indices, replace=False))
 
         X[:, indices] = X[index, :][:, indices]
 
diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
index 660f6202f..a7936c4f8 100644
--- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
@@ -1,4 +1,3 @@
-import random
 import typing
 
 import numpy as np
@@ -28,7 +27,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
 
-        r = np.random.rand(1)
+        r = self.random_state.rand(1)
         if r > self.cutout_prob:
             y_a = y
             y_b = y
@@ -39,7 +38,9 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         # It is unlikely that the batch size is lower than the number of features, but
         # be safe
         size = min(X.shape[0], X.shape[1])
-        indices = torch.tensor(random.sample(range(1, size), max(1, np.int(size * self.patch_ratio))))
+        # Draw distinct indices, as random.sample did; choice() samples with replacement by default
+        num_indices = max(1, int(size * self.patch_ratio))
+        indices = torch.tensor(self.random_state.choice(range(1, size), num_indices, replace=False))
 
         # We use an ordinal encoder on the tabular data
         # -1 is the conceptual equivalent to 0 in a image, that does not