Add parameter calculation notebook

boydgreenfield · boydgreenfield · commit 1604535c635f · 2023-02-11T21:15:04.000-05:00
diff --git a/docs/notebook/calculate-parameters.ipynb b/docs/notebook/calculate-parameters.ipynb
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import numpy as np\n",
+    "from scipy import stats\n",
+    "from scipy.special import comb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Getting Started\n",
+    "This notebook provides helpful formulas for computing optimal parameters for the construction of a B-field (described further [here](https://github.com/onecodex/rust-bfield)). It includes a few sections:\n",
+    "* **Quick Calculator**: Change a few input variables to determine optimal B-field construction parameters\n",
+    "* **Space Efficiency vs. Error Rate**: Visualize B-field space efficiency vs. error rate for B-fields supporting several different maximum numbers of values ($\\theta$)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_nu_and_kappa(max_value, max_nu=64):\n",
+    "    \"\"\"Find ν and κ with (ν choose κ) >= `max_value`, minimizing κ first and then ν.\n",
+    "\n",
+    "    Args:\n",
+    "        max_value: Threshold that ν choose κ must reach (the maximum value to store).\n",
+    "        max_nu: Largest ν to consider.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `(nu, kappa)` tuple.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If no ν <= `max_nu` reaches `max_value` for any κ.\n",
+    "    \"\"\"\n",
+    "    # The smallest κ wins; for a given κ the smallest ν that reaches the threshold wins\n",
+    "    for kappa in range(1, max_nu + 1):\n",
+    "        # comb(nu, kappa) is 0 whenever nu < kappa, so the scan can start at nu == kappa\n",
+    "        for nu in range(kappa, max_nu + 1):\n",
+    "            if comb(nu, kappa) >= max_value:\n",
+    "                return nu, kappa\n",
+    "    raise ValueError(f\"No value of ν choose κ has a value over {max_value}. Consider raising the `max_nu` parameter.\")\n",
+    "    \n",
+    "    \n",
+    "def calculate_fp_rate(m_over_n, n_hashes):\n",
+    "    \"\"\"Return (1 - e^(-k / (m/n)))^k, where k is `n_hashes` and m/n is `m_over_n` (bits per element).\n",
+    "    \"\"\"\n",
+    "    exponent = -n_hashes * 1 / m_over_n\n",
+    "    fraction_set = 1 - np.power(np.e, exponent)\n",
+    "    return np.power(fraction_set, n_hashes)\n",
+    "    \n",
+    "    \n",
+    "def calculate_m_over_n_and_hashes_from_per_bit_fp(max_per_bit_fp, max_hashes=12):\n",
+    "    \"\"\"Find an optimal number of hashes, k, and m/n (bits per element), minimizing m/n\n",
+    "\n",
+    "    Integer m/n values are tried in increasing order starting at 2. For each one, the\n",
+    "    smallest k in 1..`max_hashes` whose `calculate_fp_rate(m/n, k)` is below\n",
+    "    `max_per_bit_fp` is returned.\n",
+    "\n",
+    "    See https://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for helpful detail.\n",
+    "\n",
+    "    Args:\n",
+    "        max_per_bit_fp: Exclusive upper bound on the per-bit false positive rate; must be > 0.\n",
+    "        max_hashes: Largest number of hashes to consider; must be >= 1.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `(m_over_n, n_hashes)` tuple.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `max_per_bit_fp` is not positive or `max_hashes` is below 1,\n",
+    "            because no candidate could ever satisfy the bound.\n",
+    "    \"\"\"\n",
+    "    # Reject inputs for which the search below would loop forever\n",
+    "    if not max_per_bit_fp > 0:\n",
+    "        raise ValueError(f\"`max_per_bit_fp` must be greater than 0, got {max_per_bit_fp}.\")\n",
+    "    if max_hashes < 1:\n",
+    "        raise ValueError(f\"No m/n found for max false positive rate of {max_per_bit_fp}. Consider increasing `max_hashes` parameter.\")\n",
+    "\n",
+    "    m_over_n = 2\n",
+    "    while True:\n",
+    "        for n_hashes in range(1, max_hashes + 1):\n",
+    "            if calculate_fp_rate(m_over_n, n_hashes) < max_per_bit_fp:\n",
+    "                return m_over_n, n_hashes\n",
+    "        m_over_n += 1\n",
+    "\n",
+    "    \n",
+    "def calculate_m_over_n_and_hashes_from_alpha(max_alpha, max_hashes=12, nu=None, kappa=None):\n",
+    "    \"\"\"Find an optimal number of hashes, k, and m/n (bits per element), minimizing m/n\n",
+    "\n",
+    "    For each candidate (m/n, k) the per-bit rate p comes from `calculate_fp_rate`, and\n",
+    "    alpha is the binomial probability of exactly `kappa` successes in `nu` trials with\n",
+    "    success probability p. Integer m/n values are tried in increasing order starting at 2.\n",
+    "\n",
+    "    Args:\n",
+    "        max_alpha: Exclusive upper bound on alpha; must be > 0.\n",
+    "        max_hashes: Largest number of hashes to consider; must be >= 1.\n",
+    "        nu: Marker width ν. When omitted, the notebook-level `nu` is used.\n",
+    "        kappa: Number of marker bits κ. When omitted, the notebook-level `kappa` is used.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `(m_over_n, n_hashes, alpha)` tuple.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `nu`/`kappa` are neither passed nor defined in the notebook, if\n",
+    "            `max_alpha` is not positive, or if `max_hashes` is below 1.\n",
+    "    \"\"\"\n",
+    "    # Fall back to the notebook globals so calls that pass only `max_alpha` keep working\n",
+    "    if nu is None:\n",
+    "        nu = globals().get(\"nu\")\n",
+    "    if kappa is None:\n",
+    "        kappa = globals().get(\"kappa\")\n",
+    "    if nu is None or kappa is None:\n",
+    "        raise ValueError(\"`nu` and `kappa` must be passed as arguments or defined in the notebook before calling this function.\")\n",
+    "\n",
+    "    # Reject inputs for which the search below would loop forever\n",
+    "    if not max_alpha > 0:\n",
+    "        raise ValueError(f\"`max_alpha` must be greater than 0, got {max_alpha}.\")\n",
+    "    if max_hashes < 1:\n",
+    "        raise ValueError(f\"No m/n found for max false positive rate of {max_alpha}. Consider increasing `max_hashes` parameter.\")\n",
+    "\n",
+    "    m_over_n = 2\n",
+    "    while True:\n",
+    "        for n_hashes in range(1, max_hashes + 1):\n",
+    "            fp_rate = calculate_fp_rate(m_over_n, n_hashes)\n",
+    "\n",
+    "            # We skip anything where we're in the lefthand side of the CDF\n",
+    "            if stats.binom.cdf(kappa, nu, fp_rate) < 0.5:\n",
+    "                continue\n",
+    "\n",
+    "            # Equal to cdf(kappa) - cdf(kappa - 1), but without the cancellation error for tiny values\n",
+    "            alpha = stats.binom.pmf(kappa, nu, fp_rate)\n",
+    "            if alpha < max_alpha:\n",
+    "                return m_over_n, n_hashes, alpha\n",
+    "        m_over_n += 1\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Quick Calculator\n",
+    "Set the following configuration options and then run the cell to compute the required B-field creation parameters:\n",
+    "* `MAX_VALUE`: The maximum value $y$ you'd like to store (alternatively $\\theta$). Note the `rust-bfield` implementation only supports `u32` integers for values and you should strongly consider remapping values to a complete range of natural numbers $1...\\theta$.\n",
+    "* `MAX_FALSE_POSITIVE_RATE`: The maximum false positive rate $(\\alpha)$ you'd like to allow in your B-field. Recommended values for many applications are 0.01 or below.\n",
+    "* `MAX_INDETERMINACY_RATE`: The maximum indeterminacy rate $(\\beta)$ you'd like to allow in your B-field. A value of 0 is recommended.\n",
+    "* `N_ELEMENTS`: The number of elements $(n)$ the B-field is sized for."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MAX_VALUE = 1e6\n",
+    "MAX_FALSE_POSITIVE_RATE = 0.001\n",
+    "MAX_INDETERMINACY_RATE = 0\n",
+    "N_ELEMENTS = 1e9\n",
+    "\n",
+    "# Recommended standard values\n",
+    "MAX_SCALEDOWN = 0.001\n",
+    "\n",
+    "# First we find suitable values of nu and kappa\n",
+    "nu, kappa = calculate_nu_and_kappa(MAX_VALUE)\n",
+    "\n",
+    "# Then we compute the bits per element and number of hashes required for the desired false positive rate (alpha)\n",
+    "m_over_n, n_hashes, alpha = calculate_m_over_n_and_hashes_from_alpha(MAX_FALSE_POSITIVE_RATE)\n",
+    "\n",
+    "# p is the per-bit error rate in the primary array implied by the chosen m/n and number of hashes\n",
+    "p = calculate_fp_rate(m_over_n, n_hashes)\n",
+    "bits_per_element = m_over_n * kappa\n",
+    "\n",
+    "# Next, we compute the implied indeterminacy error rate and the required number and size of secondary arrays\n",
+    "uncorrected_beta = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p)  # this is also the scaledown factor\n",
+    "n_secondaries = 0\n",
+    "\n",
+    "secondary_array_size = N_ELEMENTS\n",
+    "expected_indeterminate_results = int(N_ELEMENTS * uncorrected_beta)\n",
+    "calculated_indeterminacy_rate = expected_indeterminate_results / N_ELEMENTS\n",
+    "\n",
+    "# Sizes in bits: the primary array first, then one entry per secondary array\n",
+    "array_sizes = [secondary_array_size * bits_per_element]\n",
+    "debug = False\n",
+    "\n",
+    "# Add secondary arrays until the expected number of indeterminate results is < 0.5\n",
+    "# or the expected indeterminacy rate no longer exceeds MAX_INDETERMINACY_RATE\n",
+    "while expected_indeterminate_results >= 0.5 and calculated_indeterminacy_rate > MAX_INDETERMINACY_RATE:\n",
+    "    # Scale the secondary array down by the uncorrected 𝛽\n",
+    "    n_secondaries += 1\n",
+    "    secondary_array_size = int(secondary_array_size * uncorrected_beta)\n",
+    "\n",
+    "    # But never make an array smaller than N_ELEMENTS * MAX_SCALEDOWN\n",
+    "    if secondary_array_size < N_ELEMENTS * MAX_SCALEDOWN:\n",
+    "        secondary_array_size = int(N_ELEMENTS * MAX_SCALEDOWN)\n",
+    "\n",
+    "    if debug:\n",
+    "        print(f\"The #{n_secondaries} secondary array will be {secondary_array_size:,} elements ({int(expected_indeterminate_results):,} expected elements)\")\n",
+    "\n",
+    "    # Now calculate the expected number of indeterminate results flowing *out* of the nth secondary array\n",
+    "    corrected_m_over_n = (secondary_array_size / expected_indeterminate_results) * m_over_n\n",
+    "    corrected_p = calculate_fp_rate(corrected_m_over_n, n_hashes)\n",
+    "\n",
+    "    # Heuristic: But don't allow p to be set to 0, always use at least 1e-6 (1 in 1M)\n",
+    "    corrected_p = max(1e-6, corrected_p)\n",
+    "    corrected_beta = stats.binom.cdf(1, nu - kappa, corrected_p) - stats.binom.cdf(0, nu - kappa, corrected_p)\n",
+    "    expected_indeterminate_results = expected_indeterminate_results * corrected_beta\n",
+    "    calculated_indeterminacy_rate = expected_indeterminate_results / N_ELEMENTS\n",
+    "\n",
+    "    if debug:\n",
+    "        print(f\"Expect {int(expected_indeterminate_results):,} indeterminate results in next array ({corrected_m_over_n}, corrected p {corrected_p:.10f}), corrected beta {corrected_beta:.4f}\")\n",
+    "\n",
+    "    array_sizes.append(secondary_array_size * bits_per_element)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Report the inputs, the recommended B-field parameters and summary statistics\n",
+    "total_bits = np.sum(array_sizes)\n",
+    "\n",
+    "print(f\"\"\"\n",
+    "Input configuration requirements are:\n",
+    "\n",
+    "`MAX_VALUE` (𝜃) = {int(MAX_VALUE):,}\n",
+    "`MAX_FALSE_POSITIVE_RATE` (𝛼) = {MAX_FALSE_POSITIVE_RATE}\n",
+    "`MAX_INDETERMINACY_RATE` (corrected 𝛽) = {MAX_INDETERMINACY_RATE}\n",
+    "`N_ELEMENTS` (n) = {int(N_ELEMENTS):,}\n",
+    "`MAX_SCALEDOWN` = {MAX_SCALEDOWN} (recommended standard value)\n",
+    "\n",
+    "Recommended parameters are: \n",
+    "\n",
+    "`size` (mκ) = {int(N_ELEMENTS * m_over_n * kappa):,}\n",
+    "`n_hashes` (k) = {n_hashes}\n",
+    "`marker_width` (ν) = {nu}\n",
+    "`n_marker_bits` (κ) = {kappa}\n",
+    "`secondary_scaledown` (uncorrected Array_0 β) = {np.ceil(uncorrected_beta * 1000)/1000:.3f}\n",
+    "`max_scaledown` (-) = {MAX_SCALEDOWN} (recommended standard value)\n",
+    "`n_secondaries` (number of Array_x's) = {n_secondaries}\n",
+    "\n",
+    "Summary statistics:\n",
+    "\n",
+    "* {np.sum(array_sizes, dtype=int):,} total bits ({total_bits / (8 * 1024**2):.2f} MiB, {total_bits / (8 * 1024**3):.2f} GiB)\n",
+    "* {total_bits / N_ELEMENTS:.2f} bits per element\n",
+    "* {total_bits / (N_ELEMENTS * 8):.2f} bytes per element\n",
+    "* Expected false positive rate (𝛼): {alpha:.4f}\n",
+    "* Expected error rate per bit in the primary array (p): {p:.4f}\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.7"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/notebook/requirements.txt b/docs/notebook/requirements.txt
@@ -0,0 +1,2 @@
+jupyter==1.0.0
+scipy==1.9.3