polaris-hub · zhu0619 · Aug 19, 2024 · Jul 26, 2024 · Jul 29, 2024 · Aug 6, 2024
@@ -16,3 +16,9 @@
         filters: ["!^_"]
 
 ---
+
+::: polaris.dataset.converters.PDBConverter
+    options:
+        filters: ["!^_"]
+
+---
@@ -0,0 +1,345 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "217690be-9836-4e06-930e-ba7efbb37d91",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": [
+     "remove_cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Note: Cell is tagged to not show up in the mkdocs build\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39b58e71",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "<div class=\"admonition abstract highlight\">\n",
+    "    <p class=\"admonition-title\">In short</p>\n",
+    "    <p>This tutorial shows how to create datasets with PDBs through the .zarr format.</p>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e154bb54",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "### Dummy PDB example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "5e201379",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import zarr\n",
+    "import platformdirs\n",
+    "\n",
+    "import numpy as np\n",
+    "import datamol as dm\n",
+    "import pandas as pd\n",
+    "\n",
+    "from polaris.dataset import DatasetFactory\n",
+    "from polaris.dataset.converters import SDFConverter, PDBConverter\n",
+    "\n",
+    "SAVE_DIR = dm.fs.join(platformdirs.user_cache_dir(appname=\"polaris-tutorials\"), \"002\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "14b6c3a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PDB file '/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial.pdb' created successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pdb_content = \"\"\"\\\n",
+    "ATOM      1  N   ASN A   1      38.267  13.340  12.748  1.00 18.15           N  \n",
+    "ATOM      2  CA  ASN A   1      37.251  14.218  12.226  1.00 16.56           C  \n",
+    "ATOM      3  C   ASN A   1      36.022  13.500  11.637  1.00 16.50           C  \n",
+    "ATOM      4  O   ASN A   1      35.023  14.079  11.216  1.00 16.60           O  \n",
+    "ATOM      5  CB  ASN A   1      37.767  15.426  11.473  1.00 16.60           C  \n",
+    "TER\n",
+    "END\n",
+    "\"\"\"\n",
+    "\n",
+    "# Specify the file name\n",
+    "pdb_filename = dm.fs.join(SAVE_DIR, \"tutorial.pdb\")\n",
+    "\n",
+    "# SAVE_DIR is only a path string at this point; create the directory first,\n",
+    "# otherwise `open(..., \"w\")` fails with FileNotFoundError on a fresh machine.\n",
+    "dm.fs.mkdir(SAVE_DIR, exist_ok=True)\n",
+    "\n",
+    "# Write the string to a PDB file\n",
+    "with open(pdb_filename, \"w\") as pdb_file:\n",
+    "    pdb_file.write(pdb_content)\n",
+    "\n",
+    "print(f\"PDB file '{pdb_filename}' created successfully.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a47ae20",
+   "metadata": {},
+   "source": [
+    "### Create dataset from PDB file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "07442028",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Location of the Zarr archive that will back the dataset\n",
+    "save_dst = dm.fs.join(SAVE_DIR, \"tutorial_pdb.zarr\")\n",
+    "\n",
+    "# Create the factory and reset it on the same archive path\n",
+    "factory = DatasetFactory(zarr_root_path=save_dst)\n",
+    "factory.reset(save_dst)\n",
+    "\n",
+    "# Register the PDB converter under the \"pdb\" key; it fills the `pdb` column\n",
+    "pdb_converter = PDBConverter(pdb_column=\"pdb\")\n",
+    "factory.register_converter(\"pdb\", pdb_converter)\n",
+    "\n",
+    "# Add the PDB file written in the previous step\n",
+    "pdb_files = [pdb_filename]\n",
+    "factory.add_from_file(pdb_files)\n",
+    "\n",
+    "# Build the dataset\n",
+    "dataset = factory.build()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35bb183e",
+   "metadata": {},
+   "source": [
+    "### Check the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "05712cbd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\"><tr><th>name</th><td>None</td></tr><tr><th>description</th><td></td></tr><tr><th>tags</th><td></td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>owner</th><td>None</td></tr><tr><th>polaris_version</th><td>0.7.10.dev7+gb61dfdd.d20240809</td></tr><tr><th>default_adapters</th><td><table border=\"1\"><tr><th>pdb</th><td>PDB_TO_ARRAY</td></tr></table></td></tr><tr><th>zarr_root_path</th><td>/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial_pdb.zarr</td></tr><tr><th>readme</th><td></td></tr><tr><th>annotations</th><td><table border=\"1\"><tr><th>pdb</th><td><table border=\"1\"><tr><th>is_pointer</th><td>True</td></tr><tr><th>modality</th><td>PROTEIN_3D</td></tr><tr><th>description</th><td>None</td></tr><tr><th>user_attributes</th><td></td></tr><tr><th>dtype</th><td>object</td></tr></table></td></tr></table></td></tr><tr><th>source</th><td>None</td></tr><tr><th>license</th><td>None</td></tr><tr><th>curation_reference</th><td>None</td></tr><tr><th>cache_dir</th><td>/Users/lu.zhu/Library/Caches/polaris/datasets/46c15ea7-d397-478e-a3e7-bb81752133f6</td></tr><tr><th>md5sum</th><td>9851ac3224382ee99ca8998d813d7421</td></tr><tr><th>artifact_id</th><td>None</td></tr><tr><th>n_rows</th><td>1</td></tr><tr><th>n_columns</th><td>1</td></tr></table>"
+      ],
+      "text/plain": [
+       "{\n",
+       "  \"name\": null,\n",
+       "  \"description\": \"\",\n",
+       "  \"tags\": [],\n",
+       "  \"user_attributes\": {},\n",
+       "  \"owner\": null,\n",
+       "  \"polaris_version\": \"0.7.10.dev7+gb61dfdd.d20240809\",\n",
+       "  \"default_adapters\": {\n",
+       "    \"pdb\": \"PDB_TO_ARRAY\"\n",
+       "  },\n",
+       "  \"zarr_root_path\": \"/Users/lu.zhu/Library/Caches/polaris-tutorials/002/tutorial_pdb.zarr\",\n",
+       "  \"readme\": \"\",\n",
+       "  \"annotations\": {\n",
+       "    \"pdb\": {\n",
+       "      \"is_pointer\": true,\n",
+       "      \"modality\": \"PROTEIN_3D\",\n",
+       "      \"description\": null,\n",
+       "      \"user_attributes\": {},\n",
+       "      \"dtype\": \"object\"\n",
+       "    }\n",
+       "  },\n",
+       "  \"source\": null,\n",
+       "  \"license\": null,\n",
+       "  \"curation_reference\": null,\n",
+       "  \"cache_dir\": \"/Users/lu.zhu/Library/Caches/polaris/datasets/46c15ea7-d397-478e-a3e7-bb81752133f6\",\n",
+       "  \"md5sum\": \"9851ac3224382ee99ca8998d813d7421\",\n",
+       "  \"artifact_id\": null,\n",
+       "  \"n_rows\": 1,\n",
+       "  \"n_columns\": 1\n",
+       "}"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5f904bc",
+   "metadata": {},
+   "source": [
+    "### Check data table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "6b7017ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>pdb</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>pdb#tutorial</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            pdb\n",
+       "0  pdb#tutorial"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset.table"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a89953b8",
+   "metadata": {},
+   "source": [
+    "### Get PDB data from specific row\n",
+    "An array of `biotite.Atom` objects will be returned.\n",
+    "See more details at [fastpdb](https://github.com/biotite-dev/fastpdb) and [Atom](https://github.com/biotite-dev/biotite/blob/main/src/biotite/structure/atoms.py)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "f2583c8d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([\n",
+       "\tAtom(np.array([38.267, 13.34 , 12.748], dtype=float32), chain_id=\"A\", res_id=1, ins_code=\"\", res_name=\"ASN\", hetero=False, atom_name=\"N\", element=\"N\", b_factor=18.15, charge=0, occupancy=1.0),\n",
+       "\tAtom(np.array([37.251, 14.218, 12.226], dtype=float32), chain_id=\"A\", res_id=1, ins_code=\"\", res_name=\"ASN\", hetero=False, atom_name=\"CA\", element=\"C\", b_factor=16.56, charge=0, occupancy=1.0),\n",
+       "\tAtom(np.array([36.022, 13.5  , 11.637], dtype=float32), chain_id=\"A\", res_id=1, ins_code=\"\", res_name=\"ASN\", hetero=False, atom_name=\"C\", element=\"C\", b_factor=16.5, charge=0, occupancy=1.0),\n",
+       "\tAtom(np.array([35.023, 14.079, 11.216], dtype=float32), chain_id=\"A\", res_id=1, ins_code=\"\", res_name=\"ASN\", hetero=False, atom_name=\"O\", element=\"O\", b_factor=16.6, charge=0, occupancy=1.0),\n",
+       "\tAtom(np.array([37.767, 15.426, 11.473], dtype=float32), chain_id=\"A\", res_id=1, ins_code=\"\", res_name=\"ASN\", hetero=False, atom_name=\"CB\", element=\"C\", b_factor=16.6, charge=0, occupancy=1.0)\n",
+       "])"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset.get_data(0, \"pdb\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72767ef2",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "The process of completing the dataset's metadata and uploading it to the hub follows the same steps as outlined in the tutorial [dataset_zarr.ipynb](docs/tutorials/dataset_zarr.ipynb).\n",
+    "\n",
+    "The End. "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -56,3 +56,6 @@ dependencies:
   - mdx_truly_sane_lists
   - nbconvert
   - mike >=1.0.0
+
+  - pip:
+    - fastpdb