Changes from all commits (32 commits):
589e76a  add logger (#41) - dasimagin, Jan 24, 2023
c54bd4b  fix logging - dasimagin, Feb 4, 2023
f8f717b  add new simulator - dasimagin, Feb 4, 2023
805eba1  add layouts - dasimagin, Feb 4, 2023
7d2743d  clean up main readme - dasimagin, Feb 4, 2023
a3d2673  init cartpole guide - dasimagin, Feb 5, 2023
674527e  fix error problem - dasimagin, Feb 5, 2023
9096800  fix log - dasimagin, Feb 5, 2023
7d4cb07  disable log tests - dasimagin, Feb 5, 2023
8e96e2d  add simulator tests - dasimagin, Feb 5, 2023
5370a5a  remove old docs - dasimagin, Feb 5, 2023
11b2b6c  update readme.md - dasimagin, Apr 29, 2023
f8c0045  fix log tests - dasimagin, May 6, 2023
71de62a  rm docs/lib_fixes.md - dasimagin, Jun 17, 2023
fe6ffd8  update dynamics (#44) - dasimagin, Jun 21, 2023
b6da5ed  render svg - dasimagin, Jun 21, 2023
3551992  fix readme - dasimagin, Jun 21, 2023
b378a1a  Documentation Site (#45) - AndBondStyle, Jun 28, 2023
6d0371b  update readme - dasimagin, Jun 28, 2023
468f267  move image at the end of readme - dasimagin, Jun 28, 2023
05d50f6  update docs - dasimagin, Jun 30, 2023
596d7e3  add docker - dasimagin, Sep 24, 2023
82615d5  parameter estimation - dasimagin, Oct 7, 2023
02271e4  container 0.2.0 - dasimagin, Oct 8, 2023
340c4ac  small dynamics refactoring - dasimagin, Oct 12, 2023
9edd31f  add lqr - dasimagin, Oct 14, 2023
02915ff  container 0.3.0 - dasimagin, Oct 14, 2023
3e920cd  Docstrings (#48) - dasimagin, Feb 18, 2024
6acd423  add missed package - dasimagin, Feb 18, 2024
305fa63  fix workflow - dasimagin, Feb 18, 2024
0f5329a  toc depth - dasimagin, Feb 18, 2024
b423192  Changed default env configs and fixed model - Aug 22, 2024
47 changes: 47 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,47 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
{
"name": "cart-pole",

// Update the 'dockerComposeFile' list if you have more compose files or use different names.
// The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
"dockerComposeFile": ["../docker-compose.yaml"],

// The 'service' property is the name of the service for the container that VS Code should
// use. Update this value and .devcontainer/docker-compose.yml to the real service name.
"service": "cartpole",

// The optional 'workspaceFolder' property is the path VS Code should open by default when
// connected. This is typically a file mount in .devcontainer/docker-compose.yml
"workspaceFolder": "/cartpole"

// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line if you want start specific services in your Docker Compose config.
// "runServices": [],

// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
// "shutdownAction": "none",

// Uncomment the next line to run commands after the container is created.
// "postCreateCommand": "cat /etc/os-release",

// Configure tool-specific properties.
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"charliermarsh.ruff",
"yzhang.markdown-all-in-one"
]
}
}

// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "cartpole"
}
1 change: 1 addition & 0 deletions .devcontainer/docker-compose.yml
@@ -0,0 +1 @@
version: '3.9'
33 changes: 0 additions & 33 deletions .github/workflows/alt_ci.yml

This file was deleted.

30 changes: 0 additions & 30 deletions .github/workflows/ci.yml

This file was deleted.

28 changes: 28 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,28 @@
name: docs

on:
  push:
    branches:
      - master
      - cart-pole-3 # TODO: Remove after merging to master

permissions:
  contents: write

jobs:
  deploy:
    runs-on: self-hosted
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
      # TODO: Add proper caching
      # TODO: Maybe use poetry-based action
      - run: pip install mkdocs-material mkdocs-material-extensions mkdocstrings[python] mike poetry
      - run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
      - run: mike deploy --push --force --update-aliases $(poetry version -s) latest
31 changes: 31 additions & 0 deletions Dockerfile
@@ -0,0 +1,31 @@
FROM ubuntu:22.04

ENV DEBIAN_FRONTEND=noninteractive
ENV SHELL /bin/bash
SHELL ["/bin/bash", "-c"]

WORKDIR /tmp

### COMMON

RUN apt update -q \
    && apt install -yq --no-install-recommends \
        build-essential \
        coinor-libipopt-dev \
        git \
        python3 \
        python3-dev \
        python3-pip \
    && pip3 install --no-cache-dir -U pip \
    && rm -rf /var/lib/apt/lists/* && apt-get clean

### POETRY

RUN pip3 install --no-cache-dir -U poetry \
    && poetry completions bash >> ~/.bash_completion

### PYTHON DEPENDENCIES

COPY pyproject.toml /tmp/pyproject.toml

RUN poetry config virtualenvs.create false \
    && poetry install --no-interaction --no-ansi --no-root
34 changes: 34 additions & 0 deletions SAC/config.yaml
@@ -0,0 +1,34 @@
environment:
  state_size: 5
  action_size: 1
  max_action: 1
  delta: 0.1

trainer:
  device_name: cpu
  gamma: 0.99
  max_buffer_size: 100000  # 10**5; YAML does not evaluate arithmetic, so spell out the number
  start_timesteps: 5000
  timesteps_per_epoch: 1
  batch_size: 128
  max_grad_norm: 10
  tau: 0.005
  policy_update_freq: 1
  alpha: 0.1

actor:
  lr: 3e-4
  tau: 1
  layer_sizes:
    - 256
    - 256
    - 256

critic:
  lr_1: 3e-4
  lr_2: 3e-4
  tau: 1
  layers:
    - 256
    - 256
    - 256
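
For reference, a minimal sketch of how this config could be read with PyYAML; the pyyaml dependency, the SAC/config.yaml path, and the variable names below are assumptions for illustration, not code from this PR.

# Hypothetical loader sketch (assumes pyyaml is installed and the working directory is the repo root).
import yaml

with open("SAC/config.yaml") as f:
    config = yaml.safe_load(f)

trainer_cfg = config["trainer"]
buffer_size = int(trainer_cfg["max_buffer_size"])  # 100000
actor_lr = float(config["actor"]["lr"])            # PyYAML loads "3e-4" as a string, so cast explicitly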
87 changes: 87 additions & 0 deletions SAC/models.py
@@ -0,0 +1,87 @@
import torch
from torch import nn
from torch.nn import functional as F

import numpy as np

from torch.distributions import Normal

from replay_buffer import state_to_tensor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# LOG_STD_MIN, LOG_STD_MAX = -20, 2

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()

        self.h = 2056  # hidden layer width; TODO: change
        self.action_dim = action_dim

        self.actor_model = nn.Sequential(
            nn.Linear(in_features=state_dim, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=action_dim * 2)
        )

    def apply(self, states):  # note: this shadows nn.Module.apply
        log_std_min, log_std_max = -20, 2
        states = states.to(DEVICE)
        output = self.actor_model(states)
        means = output[..., :self.action_dim]
        # squash the raw network output into [log_std_min, log_std_max], then exponentiate
        log_std = torch.tanh(output[..., self.action_dim:]) + 1
        log_std = 0.5 * (log_std_max - log_std_min) * log_std + log_std_min
        std = torch.exp(log_std)
        normal_distr = Normal(means, std)

        actions_raw = normal_distr.rsample()
        actions = torch.tanh(actions_raw)
        # Change of variables for the tanh squashing: with a = tanh(u),
        #   log pi(a|s) = log N(u; mean, std) - log(1 - tanh(u)^2 + eps)
        # per action dimension; this is a more numerically stable version of
        # appendix C eq. 21 in https://arxiv.org/pdf/1801.01290.pdf
        log_prob = normal_distr.log_prob(actions_raw) - torch.log(1 - actions ** 2 + 1e-6)

        return actions, log_prob

    def get_action(self, states):
        with torch.no_grad():
            states = state_to_tensor(states)
            actions, _ = self.apply(states)
            actions = actions.cpu().detach().numpy()

            assert isinstance(actions, (list, np.ndarray))
            assert actions.max() <= 1. and actions.min() >= -1, "actions must be in the range [-1, 1]"
            return actions.item()


class Critic(nn.Module):

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.h = 2056
        input_dim = state_dim + action_dim
        self.critic_model = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=self.h),
            nn.ReLU(),
            nn.Linear(in_features=self.h, out_features=1)
        )

    def get_qvalues(self, states, actions):
        batch = torch.cat([states, actions], dim=1)
        qvalues = self.critic_model(batch)

        # assert len(qvalues.shape) == 1 and qvalues.shape[0] == states.shape[0]
        return qvalues
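
For reference, a hypothetical smoke test of these modules; the 6-dimensional state matches state_to_tensor in SAC/replay_buffer.py, while the import path, batch size, and device handling are assumptions for illustration and not part of this PR (they also require the cartpole package to be installed).

# Hypothetical smoke test, assuming SAC/ is on the import path.
import torch
from models import Actor, Critic, DEVICE

actor = Actor(state_dim=6, action_dim=1).to(DEVICE)   # apply() moves inputs to DEVICE internally
critic = Critic(state_dim=6, action_dim=1)            # kept on the CPU for this sketch

states = torch.zeros((4, 6))                          # dummy batch of 4 normalized states
actions, log_prob = actor.apply(states)               # actions squashed to [-1, 1]
qvalues = critic.get_qvalues(states, actions.cpu())   # critic weights live on the CPU here
print(actions.shape, log_prob.shape, qvalues.shape)   # (4, 1) each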
62 changes: 62 additions & 0 deletions SAC/replay_buffer.py
@@ -0,0 +1,62 @@
import numpy as np
import torch
import random

from math import cos

from cartpole import State, Error



def state_to_tensor(state: State):
    return torch.tensor([
        (state.cart_position / 0.25),
        (state.cart_velocity / 5.0),
        (state.cart_acceleration / 7.5),
        cos(state.pole_angle),
        state.pole_angle / (6 * np.pi),  # important
        state.pole_angular_velocity / (6 * np.pi)  # change and normalize
    ])


def make_tensor(from_state, to_state, action, reward):
    return torch.concat([
        state_to_tensor(from_state),
        state_to_tensor(to_state),
        torch.tensor([action]),
        torch.tensor([reward]),
        torch.tensor([to_state.error != Error.NO_ERROR])
    ])


class ReplayBuffer:

    def __init__(self, state_dim: int, maxlen: int):
        self.state_dim = state_dim
        self.states = torch.zeros((maxlen, self.state_dim * 2 + 3))
        self.maxlen = maxlen
        self.ptr = 0
        self.length = 0

    def __len__(self):
        # self.states is preallocated with maxlen rows, so report the number of
        # transitions actually stored rather than the storage size
        return self.length

    def add(self, from_state: State, to_state: State, action: float, reward: float):
        self.states[self.ptr] = make_tensor(from_state, to_state, action, reward)
        self.ptr = (self.ptr + 1) % self.maxlen
        self.length = min(self.length + 1, self.maxlen)

    def sample(self, sample_size: int, device):
        sample_size = min(self.length, sample_size)
        sample = self.states[random.sample(range(self.length), sample_size)]
        return (
            sample[:, :self.state_dim].to(device),
            sample[:, self.state_dim:self.state_dim * 2].to(device),
            sample[:, self.state_dim * 2].reshape(-1, 1).to(device),
            sample[:, self.state_dim * 2 + 1].reshape(-1, 1).to(device),
            sample[:, self.state_dim * 2 + 2].reshape(-1, 1).to(device)
        )

    def get_last(self):
        if self.length == 0:
            return None
        return self.states[(self.ptr - 1) % self.maxlen]
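
For reference, a minimal sketch of the buffer mechanics; the DummyState stand-in only mimics the fields state_to_tensor reads, and the import paths assume SAC/ is importable and the cartpole package is installed. None of this is code from the PR.

# Hypothetical usage sketch; DummyState is a stand-in for cartpole.State.
from dataclasses import dataclass

from cartpole import Error
from replay_buffer import ReplayBuffer

@dataclass
class DummyState:
    cart_position: float = 0.0
    cart_velocity: float = 0.0
    cart_acceleration: float = 0.0
    pole_angle: float = 0.0
    pole_angular_velocity: float = 0.0
    error: Error = Error.NO_ERROR

buffer = ReplayBuffer(state_dim=6, maxlen=100000)
buffer.add(DummyState(), DummyState(pole_angle=0.1), action=0.5, reward=1.0)
states, next_states, actions, rewards, dones = buffer.sample(sample_size=128, device="cpu")
print(len(buffer), states.shape)  # 1 torch.Size([1, 6]); sample_size is clamped to the buffer length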