Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions cratedb_toolkit/docs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def help_settings():
"--format",
"-f",
"format_",
type=click.Choice(["json", "markdown", "sql"]),
type=click.Choice(["json", "yaml", "markdown", "sql"]),
default="json",
help="Output format (json, markdown or sql)",
help="Output format (json, yaml, markdown or sql)",
)
@click.option("--output", "-o", default=None, help="Output file name")
def settings(format_: str, output: str):
Expand All @@ -57,6 +57,7 @@ def settings(format_: str, output: str):

Output in JSON, YAML, Markdown, or SQL format.
"""
from .settings import extract
from .settings import SettingsExtractor

extract(format_, output)
extractor = SettingsExtractor()
extractor.acquire().render(format_).write(output)
278 changes: 133 additions & 145 deletions cratedb_toolkit/docs/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,52 +3,37 @@
CrateDB Settings Extractor

This tool extracts settings from CrateDB's documentation and outputs them
in either JSON or Markdown format, or the SQL statements to set the default value.
in JSON, YAML, or Markdown format, or as SQL statements that set the default values.
It parses the HTML structure of the documentation to identify settings, their
descriptions, default values, and whether they're runtime configurable.
descriptions, default values, and whether they are runtime configurable or not.

Author: wolta
Date: April 2025
Source: https://gist.github.com/WalBeh/c863eb5cc35ee987d577851f38b64261
"""

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "beautifulsoup4",
# "requests",
# "click",
# "rich",
# "tqdm",
# ]
# ///

import dataclasses
import io
import json
import logging
import re
import sys
from typing import Any, Dict, List, Optional
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import click
import requests
import yaml
from bs4 import BeautifulSoup
from rich.console import Console
from rich.logging import RichHandler
from rich.progress import Progress, SpinnerColumn, TextColumn

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(message)s",
handlers=[RichHandler(rich_tracebacks=True, markup=True)],
)
logger = logging.getLogger("cratedb_settings")
console = Console()
# Configure logging.
logger = logging.getLogger(__name__)
console = Console(stderr=True)

# Constants
DOCS_URL = "https://cratedb.com/docs/crate/reference/en/latest/config/cluster.html"
DEFAULT_JSON_OUTPUT = "cratedb_settings.json"
DEFAULT_MD_OUTPUT = "cratedb_settings.md"
SET_CLUSTER = "SET GLOBAL PERSISTENT"


Expand All @@ -61,20 +46,21 @@
"""
settings = {}

with console.status("[bold green]Fetching documentation...", spinner="dots"):
response = requests.get(DOCS_URL, timeout=5)

if response.status_code != 200:
logger.error(f"Failed to fetch documentation: HTTP {response.status_code}")
return {}

logger.info("Successfully retrieved documentation page")

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")
logger.info(f"Extracting CrateDB settings from {DOCS_URL}")

# Process content
with Progress(SpinnerColumn(), TextColumn("[bold blue]{task.description}"), console=console) as progress:
# Download resource.
task0 = progress.add_task("[yellow]Fetching documentation", total=None)
response = requests.get(DOCS_URL, timeout=5)
if response.status_code != 200:
logger.error(f"Failed to fetch documentation: HTTP {response.status_code}")
return {}

Check warning on line 58 in cratedb_toolkit/docs/settings.py

View check run for this annotation

Codecov / codecov/patch

cratedb_toolkit/docs/settings.py#L57-L58

Added lines #L57 - L58 were not covered by tests
progress.update(task0, completed=True)

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all section divs that contain settings
sections = soup.find_all(["div", "section"], class_=["section", "doc-content"])
logger.debug(f"Found {len(sections)} potential sections")
Expand Down Expand Up @@ -439,54 +425,52 @@
return setting_info


def write_markdown_table(settings: Dict[str, Dict[str, Any]], output_file: str) -> None:
def write_markdown_table(settings: Dict[str, Dict[str, Any]]) -> str:
"""
Write settings to a markdown table file.
Render settings as a Markdown document with two tables and return it as a string.

Args:
settings: Dictionary of settings
output_file: Path to output file
"""
with console.status(f"[bold green]Writing Markdown to {output_file}..."):
with open(output_file, "w", encoding="utf-8") as f:
# Write header with metadata
f.write("# CrateDB Settings Reference\n\n")
f.write(f"*Generated on {click.format_filename(output_file)}*\n\n")
f.write("This document contains all CrateDB settings, their default values, and descriptions.\n\n")

# Write runtime configurable settings table
runtime_settings = {k: v for k, v in settings.items() if v["runtime_configurable"]}
f.write(f"## Runtime Configurable Settings ({len(runtime_settings)})\n\n")
f.write("These settings can be changed while the cluster is running.\n\n")
f.write("| Setting | Default Value | Description | SQL Statement |\n")
f.write("|---------|---------------|-------------|--------------|\n")

# Sort settings for better readability
for key, info in sorted(runtime_settings.items()):
# Escape pipe symbols in all fields
setting = key.replace("|", "\\|")
default = info["default_value"].replace("|", "\\|") if info["default_value"] else "-"
desc = info["purpose"].replace("\n", " ").replace("|", "\\|")
stmt = info.get("stmt", "").replace("|", "\\|") if info.get("stmt") else "-"

f.write(f"| {setting} | {default} | {desc} | {stmt} |\n")

# Write non-runtime configurable settings table
non_runtime_settings = {k: v for k, v in settings.items() if not v["runtime_configurable"]}
f.write(f"\n\n## Non-Runtime Configurable Settings ({len(non_runtime_settings)})\n\n")
f.write("These settings can only be changed by restarting the cluster.\n\n")
f.write("| Setting | Default Value | Description |\n")
f.write("|---------|---------------|-------------|\n")

for key, info in sorted(non_runtime_settings.items()):
# Escape pipe symbols in all fields
setting = key.replace("|", "\\|")
default = info["default_value"].replace("|", "\\|") if info["default_value"] else "-"
desc = info["purpose"].replace("\n", " ").replace("|", "\\|")

f.write(f"| {setting} | {default} | {desc} |\n")

logger.info(f"Successfully wrote Markdown table to {output_file}")
f = io.StringIO()
with console.status("[bold green]Generating Markdown"):
# Write header with metadata
f.write("# CrateDB Settings Reference\n\n")
f.write("This document contains all CrateDB settings, their default values, and descriptions.\n\n")

# Write runtime configurable settings table
runtime_settings = {k: v for k, v in settings.items() if v["runtime_configurable"]}
f.write(f"## Runtime Configurable Settings ({len(runtime_settings)})\n\n")
f.write("These settings can be changed while the cluster is running.\n\n")
f.write("| Setting | Default Value | Description | SQL Statement |\n")
f.write("|---------|---------------|-------------|--------------|\n")

# Sort settings for better readability
for key, info in sorted(runtime_settings.items()):
# Escape pipe symbols in all fields
setting = key.replace("|", "\\|")
default = info["default_value"].replace("|", "\\|") if info["default_value"] else "-"
desc = info["purpose"].replace("\n", " ").replace("|", "\\|")
stmt = info.get("stmt", "").replace("|", "\\|") if info.get("stmt") else "-"

f.write(f"| {setting} | {default} | {desc} | {stmt} |\n")

# Write non-runtime configurable settings table
non_runtime_settings = {k: v for k, v in settings.items() if not v["runtime_configurable"]}
f.write(f"\n\n## Non-Runtime Configurable Settings ({len(non_runtime_settings)})\n\n")
f.write("These settings can only be changed by restarting the cluster.\n\n")
f.write("| Setting | Default Value | Description |\n")
f.write("|---------|---------------|-------------|\n")

for key, info in sorted(non_runtime_settings.items()):
# Escape pipe symbols in all fields
setting = key.replace("|", "\\|")
default = info["default_value"].replace("|", "\\|") if info["default_value"] else "-"
desc = info["purpose"].replace("\n", " ").replace("|", "\\|")

f.write(f"| {setting} | {default} | {desc} |\n")

return f.getvalue()


def generate_sql_statements(settings: Dict[str, Dict[str, Any]]) -> None:
Expand Down Expand Up @@ -605,85 +589,89 @@
print(f"\nTotal statements: {statement_count}") # noqa: T201


def extract(format_: str, output: str):
"""
Extract CrateDB settings from documentation.
def write_sql_statements(settings) -> str:
f = io.StringIO()
f.write("-- CrateDB Runtime Configurable Settings\n")
f.write("-- Generated by settings extractor\n\n")

Output in JSON, Markdown, or SQL format.
count = 0
for _, setting_info in sorted(settings.items()):
if setting_info["runtime_configurable"] and "stmt" in setting_info:
stmt = setting_info["stmt"]

This tool scrapes the CrateDB documentation to extract configuration settings,
their default values, descriptions, and runtime configurability status.
# Ensure statement ends with semicolon
if not stmt.endswith(";"):
stmt += ";"

f.write(f"{stmt}\n")
count += 1

f.write(f"\n-- Total statements: {count}\n")
return f.getvalue()

Examples:

# Extract settings to JSON (default)
python soup2.py
class OutputFormat(str, Enum):
"""Output formats supported by the SettingsExtractor."""

# Extract settings to Markdown
python soup2.py --format markdown
JSON = "json"
YAML = "yaml"
MARKDOWN = "markdown"
SQL = "sql"

# Extract SQL statements for runtime configurable settings
python soup2.py --format sql

# Specify custom output file
python soup2.py --format markdown --output cratedb_reference.md
@dataclasses.dataclass
class SettingsExtractor:
"""
Extract CrateDB settings from documentation.
Output in JSON, YAML, Markdown, or SQL format.
"""

settings: Dict[str, Dict[str, Any]] = dataclasses.field(default_factory=dict)
payload: Optional[str] = None

try:
def acquire(self):
# Extract settings
settings = extract_cratedb_settings()
self.settings = extract_cratedb_settings()

if not settings:
if not self.settings:
logger.error("No settings were extracted. Please check the script or documentation structure.")
sys.exit(1)

# Generate SQL statements for runtime configurable settings
generate_sql_statements(settings)

# Determine output file name
if output is None:
if format_ == "markdown":
output = DEFAULT_MD_OUTPUT
elif format_ == "sql":
output = "cratedb_settings.sql"
else:
output = DEFAULT_JSON_OUTPUT

# Save to file in selected format
# Generate SQL statements for runtime configurable settings.
generate_sql_statements(self.settings)
return self

def render(self, format_: Union[str, OutputFormat]):
# Convert the string format to enum if needed.
if isinstance(format_, str):
try:
format_ = OutputFormat(format_.lower())
except ValueError as e:
raise ValueError(

Check warning on line 650 in cratedb_toolkit/docs/settings.py

View check run for this annotation

Codecov / codecov/patch

cratedb_toolkit/docs/settings.py#L649-L650

Added lines #L649 - L650 were not covered by tests
f"Unsupported format: {format_}. Choose from: {', '.join(f.value for f in OutputFormat)}"
) from e

# Render settings to selected format.
if format_ == "json":
with open(output, "w", encoding="utf-8") as f:
json.dump(settings, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(settings)} settings to {output}")
self.payload = json.dumps(self.settings, indent=2, ensure_ascii=False)
elif format_ == "yaml":
self.payload = yaml.dump(self.settings)
elif format_ == "markdown":
write_markdown_table(settings, output)
self.payload = write_markdown_table(self.settings)
elif format_ == "sql":
with console.status(f"[bold green]Writing SQL statements to {output}..."):
with open(output, "w", encoding="utf-8") as f:
f.write("-- CrateDB Runtime Configurable Settings\n")
f.write("-- Generated by settings extractor\n\n")

count = 0
for _, setting_info in sorted(settings.items()):
if setting_info["runtime_configurable"] and "stmt" in setting_info:
stmt = setting_info["stmt"]

# Ensure statement ends with semicolon
if not stmt.endswith(";"):
stmt += ";"

f.write(f"{stmt}\n")
count += 1

f.write(f"\n-- Total statements: {count}\n")
logger.info(f"Saved {count} SQL statements to {output}")

# Count runtime configurable settings
runtime_count = sum(1 for info in settings.values() if info["runtime_configurable"])
logger.info(f"Found {runtime_count} runtime configurable settings out of {len(settings)} total")

except KeyboardInterrupt:
logger.warning("Operation cancelled by user")
sys.exit(130)
except Exception as e:
logger.exception(f"An error occurred: {e}")
sys.exit(1)
self.payload = write_sql_statements(self.settings)

# Count runtime configurable settings.
runtime_count = sum(1 for info in self.settings.values() if info["runtime_configurable"])
logger.info(f"Found {runtime_count} runtime configurable settings out of {len(self.settings)} total")
return self

def write(self, path: Optional[Path] = None):
if self.payload is None:
raise ValueError("No content to write. Please `render()` first.")

Check warning on line 671 in cratedb_toolkit/docs/settings.py

View check run for this annotation

Codecov / codecov/patch

cratedb_toolkit/docs/settings.py#L671

Added line #L671 was not covered by tests
if path is None:
print(self.payload) # noqa: T201

Check warning on line 673 in cratedb_toolkit/docs/settings.py

View check run for this annotation

Codecov / codecov/patch

cratedb_toolkit/docs/settings.py#L673

Added line #L673 was not covered by tests
else:
with open(path, "w", encoding="utf-8") as f:
f.write(self.payload)
return self
16 changes: 15 additions & 1 deletion doc/docs-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,21 @@

CrateDB Toolkit's Docs API provides programmatic access to CrateDB's documentation.

## CrateDB settings
## Install
```shell
uv pip install 'cratedb-toolkit[docs-api]'
```

## Usage

### CrateDB settings

This tool extracts settings from CrateDB's documentation and outputs them
in JSON, YAML, or Markdown format, or as SQL statements that set the default values.

It parses the HTML structure of the documentation to identify settings, their
descriptions, default values, and whether they are runtime configurable or not.

```shell
ctk docs settings --help
```
Expand Down
Loading