Commit 1604535

Add parameter calculation notebook

1 parent f512566 commit 1604535

2 files changed: +231 −0 lines changed

Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "import numpy as np\n",
    "from scipy import stats\n",
    "from scipy.special import comb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Getting Started\n",
    "This notebook provides helpful formulas for computing optimal parameters for the construction of a B-field (described further [here](https://github.com/onecodex/rust-bfield)). It includes two sections:\n",
    "* **Quick Calculator**: Change a few input variables to determine optimal B-field construction parameters\n",
    "* **Space Efficiency vs. Error Rate**: Visualize B-field space efficiency vs. error rate for B-fields supporting several different maximum numbers of values ($\\theta$)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_nu_and_kappa(max_value, max_nu=64):\n",
    "    \"\"\"Find ν and κ such that C(ν, κ) >= max_value, minimizing κ,\n",
    "    subject to the constraint ν <= max_nu.\n",
    "    \"\"\"\n",
    "    for kappa in range(1, max_nu + 1):\n",
    "        # For each κ, take the smallest ν (if any) satisfying the constraint\n",
    "        for nu in range(kappa, max_nu + 1):\n",
    "            if comb(nu, kappa) >= max_value:\n",
    "                return nu, kappa\n",
    "    raise ValueError(f\"No value of ν choose κ has a value over {max_value}. Consider raising the `max_nu` parameter.\")\n",
42+
" \n",
43+
" \n",
44+
"def calculate_fp_rate(m_over_n, n_hashes):\n",
45+
" return np.power(1 - np.power(np.e, -n_hashes * 1 / m_over_n), n_hashes)\n",
46+
" \n",
47+
" \n",
48+
"def calculate_m_over_n_and_hashes_from_per_bit_fp(max_per_bit_fp, max_hashes=12):\n",
49+
" \"\"\"Find an optimal number of hashes, k, and m/n (bits per element), minimizing m/n\n",
50+
" \n",
51+
" See https://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for helpful detail.\n",
52+
" \"\"\"\n",
53+
" m_over_n = 2\n",
54+
" fp_rate = np.inf\n",
55+
" while fp_rate >= max_per_bit_fp:\n",
56+
" for n_hashes in range(1, max_hashes + 1):\n",
57+
" fp_rate = calculate_fp_rate(m_over_n, n_hashes)\n",
58+
" if fp_rate < max_fp_rate:\n",
59+
" return m_over_n, n_hashes\n",
60+
" m_over_n += 1\n",
61+
" raise Exception(f\"No m/n found for max false positive rate of {max_fp_rate}. Consider increasing `max_hashes` parameter.\")\n",
    "\n",
    " \n",
    "def calculate_m_over_n_and_hashes_from_alpha(max_alpha, nu, kappa, max_hashes=12):\n",
    "    \"\"\"Find an optimal number of hashes, k, and m/n (bits per element), minimizing m/n.\n",
    "    \"\"\"\n",
    "    m_over_n = 2\n",
    "    alpha = np.inf\n",
    "    while alpha >= max_alpha:\n",
    "        for n_hashes in range(1, max_hashes + 1):\n",
    "            fp_rate = calculate_fp_rate(m_over_n, n_hashes)\n",
    "\n",
    "            # Skip anything where we're on the lefthand side of the CDF\n",
    "            if stats.binom.cdf(kappa, nu, fp_rate) < 0.5:\n",
    "                continue\n",
    "            \n",
    "            alpha = stats.binom.cdf(kappa, nu, fp_rate) - stats.binom.cdf(kappa - 1, nu, fp_rate)\n",
    "            if alpha < max_alpha:\n",
    "                return m_over_n, n_hashes, alpha\n",
    "        m_over_n += 1\n",
    "    raise ValueError(f\"No m/n found for max false positive rate of {max_alpha}. Consider increasing the `max_hashes` parameter.\")\n",
    " "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Quick Calculator\n",
    "Set the following configuration options and then run the cell to compute the required B-field creation parameters:\n",
    "* `MAX_VALUE`: The maximum value $y$ you'd like to store (alternatively $\\theta$). Note that the `rust-bfield` implementation only supports `u32` integers for values, and you should strongly consider remapping values to a complete range of natural numbers $1...\\theta$.\n",
    "* `MAX_FALSE_POSITIVE_RATE`: The maximum false positive rate $(\\alpha)$ you'd like to allow in your B-field. Recommended values for many applications are 0.01 or below.\n",
    "* `MAX_INDETERMINACY_RATE`: The maximum indeterminacy rate $(\\beta)$ you'd like to allow in your B-field. We recommend a value of 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_VALUE = 1e6\n",
    "MAX_FALSE_POSITIVE_RATE = 0.001\n",
    "MAX_INDETERMINACY_RATE = 0\n",
    "N_ELEMENTS = 1e9\n",
    "\n",
    "# Recommended standard values\n",
    "MAX_SCALEDOWN = 0.001\n",
    "\n",
    "# First we find suitable values of nu and kappa\n",
    "nu, kappa = calculate_nu_and_kappa(MAX_VALUE)\n",
    "\n",
    "# Then we compute the bits per element and number of hashes required for the desired false positive rate (α)\n",
    "m_over_n, n_hashes, alpha = calculate_m_over_n_and_hashes_from_alpha(MAX_FALSE_POSITIVE_RATE, nu, kappa)\n",
    "\n",
    "p = calculate_fp_rate(m_over_n, n_hashes)\n",
    "bits_per_element = m_over_n * kappa\n",
    "\n",
    "# Next, we compute the implied indeterminacy error rate and the required number and size of secondary arrays\n",
    "uncorrected_beta = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p)  # this is also the scaledown factor\n",
    "n_secondaries = 0\n",
    "calculated_indeterminacy_rate = np.inf\n",
    "\n",
    "# Size the cascade of secondary arrays\n",
    "secondary_array_size = N_ELEMENTS\n",
    "expected_indeterminate_results = int(N_ELEMENTS * uncorrected_beta)\n",
    "array_sizes = []\n",
    "debug = False\n",
    "while calculated_indeterminacy_rate > MAX_INDETERMINACY_RATE:\n",
    "    # Stop if the expected number of indeterminate results is < 0.5\n",
    "    array_sizes.append(secondary_array_size * bits_per_element)\n",
    "    if expected_indeterminate_results < 0.5:\n",
    "        break\n",
    "\n",
    "    # Scale the secondary array down by the uncorrected 𝛽\n",
    "    n_secondaries += 1\n",
    "    secondary_array_size = int(secondary_array_size * uncorrected_beta)\n",
    "    \n",
    "    # But never make an array smaller than N_ELEMENTS * MAX_SCALEDOWN\n",
    "    if secondary_array_size < N_ELEMENTS * MAX_SCALEDOWN:\n",
    "        secondary_array_size = int(N_ELEMENTS * MAX_SCALEDOWN)\n",
    "\n",
    "    if debug:\n",
    "        print(f\"The #{n_secondaries} secondary array will be {secondary_array_size:,} elements ({int(expected_indeterminate_results):,} expected elements)\")\n",
    "    \n",
    "    # Now calculate the expected number of indeterminate results flowing *out* of the nth secondary array\n",
    "    corrected_m_over_n = (secondary_array_size / expected_indeterminate_results) * m_over_n\n",
    "    corrected_p = calculate_fp_rate(corrected_m_over_n, n_hashes)\n",
    "    \n",
    "    # Heuristic: but don't allow p to be set to 0; always use at least 1e-6 (1 in 1M)\n",
    "    corrected_p = max(1e-6, corrected_p)\n",
    "    corrected_beta = stats.binom.cdf(1, nu - kappa, corrected_p) - stats.binom.cdf(0, nu - kappa, corrected_p)\n",
    "    expected_indeterminate_results = expected_indeterminate_results * corrected_beta\n",
    "    calculated_indeterminacy_rate = expected_indeterminate_results / N_ELEMENTS\n",
    "    \n",
    "    if debug:\n",
    "        print(f\"Expect {int(expected_indeterminate_results):,} indeterminate results in next array ({corrected_m_over_n}, corrected p {corrected_p:.10f}), corrected beta {corrected_beta:.4f}\")"
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": null,
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"print(f\"\"\"\n",
167+
"Input configuration requirements are:\n",
168+
"\n",
169+
"`MAX_VALUE` (𝜃) = {int(MAX_VALUE):,}\n",
170+
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = {MAX_FALSE_POSITIVE_RATE}\n",
171+
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = {MAX_INDETERMINACY_RATE}\n",
172+
"`N_ELEMENTS` (n) = {int(N_ELEMENTS):,}\n",
173+
"`MAX_SCALEDOWN` = {MAX_SCALEDOWN} (recommended standard value)\n",
174+
"\n",
175+
"Recommended parameters are: \n",
176+
"\n",
177+
"`size` (mκ) = {int(N_ELEMENTS * m_over_n * kappa):,}\n",
178+
"`n_hashes` (k) = {n_hashes}\n",
179+
"`marker_width` (ν) = {nu}\n",
180+
"`n_marker_bits` (κ) = {kappa}\n",
181+
"`secondary_scaledown` (uncorrected Array_0 β) = {np.ceil(uncorrected_beta * 1000)/1000:.3f}\n",
182+
"`max_scaledown` (-) = {MAX_SCALEDOWN} (recommended standard value)\n",
183+
"`n_secondaries` (number of Array_x's) = {n_secondaries}\n",
184+
"\n",
185+
"Summary statistics:\n",
186+
"\n",
187+
"* {np.sum(array_sizes, dtype=int):,} total bits ({np.sum(array_sizes) / (8 * 1024**2):.2f} Mb, {np.sum(array_sizes) / (8 * 1024**3):.2f} Gb)\n",
188+
"* {np.sum(array_sizes) / N_ELEMENTS:.2f} bits per element\n",
189+
"* {np.sum(array_sizes) / (N_ELEMENTS * 8):.2f} bytes per element\n",
190+
"* Expected false positive rate (𝛼): {alpha:.4f}\n",
191+
"* Expected error rate per bit in the primary array (p): {p:.4f}\n",
192+
"\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

docs/notebook/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
jupyter==1.0.0
scipy==1.9.3
