Skip to content

Commit 4acd877

Browse files
committed
Docs API: Add CrateDB functions extractor
1 parent 42d64dd commit 4acd877

File tree

6 files changed

+264
-5
lines changed

6 files changed

+264
-5
lines changed

CHANGES.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
## Unreleased
44
- MCP: Add subsystem providing a few server and client utilities through
55
the `ctk query mcp {list,inquire,launch}` subcommands.
6-
- Docs API: Added CrateDB settings extractor
6+
- Docs API: Added extractors for CrateDB functions and settings
77

88
## 2025/01/31 v0.0.31
99
- Fixed connectivity for `jobstats collect`

cratedb_toolkit/docs/cli.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,27 @@ def cli(ctx: click.Context, verbose: bool, debug: bool):
1919
return boot_click(ctx, verbose, debug)
2020

2121

22+
# NOTE(review): this function has no body besides its docstring. It is passed to
# `make_command(cli, "functions", help_functions)`, presumably so that the docstring
# is rendered as the help text of `ctk docs functions` -- confirm before rewording it.
def help_functions() -> None:
    """
    Extract CrateDB SQL function definitions by scraping relevant documentation pages.

    Examples
    ========

    # Extract functions to JSON (default)
    ctk docs functions

    # Extract functions to Markdown
    ctk docs functions --format markdown

    # Specify custom output file
    ctk docs functions --format markdown --output cratedb-functions.md
    """  # noqa: E501
38+
39+
2240
def help_settings():
2341
"""
24-
This tool scrapes the CrateDB documentation to extract configuration settings,
25-
their default values, descriptions, and runtime configurability status.
42+
Extract CrateDB configuration settings by scraping relevant documentation pages.
2643
2744
Examples
2845
========
@@ -37,10 +54,37 @@ def help_settings():
3754
ctk docs settings --format sql
3855
3956
# Specify custom output file
40-
ctk docs settings --format markdown --output cratedb_reference.md
57+
ctk docs settings --format markdown --output cratedb-settings.md
4158
""" # noqa: E501
4259

4360

61+
@make_command(cli, "functions", help_functions)
@click.option(
    "--format",
    "-f",
    "format_",
    type=click.Choice(["json", "yaml", "markdown", "sql"]),
    default="json",
    help="Output format (json, yaml, markdown or sql)",
)
@click.option("--output", "-o", default=None, help="Output file name")
def functions(format_: str, output: str):
    """
    Extract CrateDB functions from documentation.

    Output in JSON, YAML, Markdown, or SQL format.
    """
    # `output` is `None` when `--output` is not given (see `default=None` above);
    # it is handed to `write()` unchanged.

    # Imported inside the command rather than at module level.
    # NOTE(review): presumably to keep the optional `docs-api` dependencies out of
    # the import path of unrelated commands -- confirm.
    from .functions import FunctionsExtractor

    try:
        extractor = FunctionsExtractor()
        extractor.acquire().render(format_).write(output)
    except Exception as e:
        # Top-level CLI boundary: log the failure and convert it into a
        # `ClickException`, keeping the original exception as its cause.
        msg = f"Failed to extract functions: {e}"
        logger.error(msg)
        raise click.ClickException(msg) from e
86+
87+
4488
@make_command(cli, "settings", help_settings)
4589
@click.option(
4690
"--format",

cratedb_toolkit/docs/functions.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import dataclasses
2+
import datetime as dt
3+
import logging
4+
from typing import Any, Dict, Optional
5+
6+
import docutils.nodes
7+
import requests
8+
from docutils import nodes
9+
from docutils.examples import internals
10+
from docutils.parsers.rst.directives import register_directive
11+
from docutils.parsers.rst.directives.admonitions import Note
12+
from docutils.parsers.rst.roles import normalized_role_options, register_canonical_role # type: ignore[attr-defined]
13+
14+
from cratedb_toolkit.docs.util import GenericProcessor
15+
16+
logger = logging.getLogger(__name__)


# Raw reStructuredText source of the scalar functions reference in the
# `crate/crate` repository, pinned to the `5.10` branch. This is the only
# page fetched by `FunctionsExtractor.acquire`.
DOCS_URL = "https://github.com/crate/crate/raw/refs/heads/5.10/docs/general/builtins/scalar-functions.rst"
20+
21+
22+
@dataclasses.dataclass
class Function:
    """
    Record describing a single function extracted from the documentation.
    """

    # Mandatory attributes; the positional order is part of the constructor interface.
    name: str
    signature: str
    category: str
    description: str
    # TODO: Parse `returns` and `example` from `description`.
    returns: Optional[str] = None
    example: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this record into a plain dictionary.

        Returns:
            Dict[str, Any]: Field names mapped to their current values.
        """
        as_mapping: Dict[str, Any] = dataclasses.asdict(self)
        return as_mapping
40+
41+
42+
@dataclasses.dataclass
class FunctionRegistry:
    """
    Container for extracted functions plus free-form metadata about the extraction.
    """

    # Metadata entries, keyed by name.
    meta: Dict[str, str] = dataclasses.field(default_factory=dict)
    # Registered functions, keyed by their signature (see `register`).
    functions: Dict[str, Function] = dataclasses.field(default_factory=dict)

    def register(self, function: Function):
        """
        Add a function to the registry, keyed by its signature.

        Args:
            function: The Function instance to store.

        Raises:
            ValueError: If a function with the same signature was registered before.
        """
        key = function.signature
        if key not in self.functions:
            self.functions[key] = function
            return
        raise ValueError(f"Function already registered: {function.signature}")

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the registry, including all registered functions, into plain dictionaries.

        Returns:
            dict: The registry's fields and their values.
        """
        as_mapping: Dict[str, Any] = dataclasses.asdict(self)
        return as_mapping
69+
70+
71+
def sphinx_ref_role(role, rawtext, text=None, lineno=None, inliner=None, options=None, content=None):
    """
    Role handler that turns the content of a `ref` role into a `raw` node.

    Only the part of the role text before the first space is kept as the node's text.
    NOTE(review): presumably this reduces ``label <target>`` style references to the
    label -- confirm against the documentation being parsed.

    Returns:
        A tuple of (list containing the created node, empty list of messages).
    """
    role_options = normalized_role_options(options)
    unescaped = nodes.unescape(text, True)  # type: ignore[attr-defined]
    label, _, _ = unescaped.partition(" ")
    ref_node = nodes.raw(rawtext, label, **role_options)
    # Attach source file and line information obtained from the inliner's reporter.
    source_and_line = inliner.reporter.get_source_and_line(lineno)
    ref_node.source, ref_node.line = source_and_line
    return [ref_node], []
78+
79+
80+
@dataclasses.dataclass
81+
class FunctionsExtractor(GenericProcessor):
    """
    Extract CrateDB functions from documentation.
    Output in JSON, YAML, Markdown, or SQL format.
    """

    # Collects the extracted functions and extraction metadata.
    registry: FunctionRegistry = dataclasses.field(default_factory=FunctionRegistry)
    # Dictionary form of the registry; filled by `acquire` when functions were found.
    thing: Dict[str, Dict[str, Any]] = dataclasses.field(default_factory=dict)
    # NOTE(review): not touched in this class; presumably populated by the inherited
    # rendering step -- confirm against `GenericProcessor`.
    payload: Optional[str] = None

    def acquire(self):
        """
        Extract and register CrateDB functions from online documentation.

        Fetch the reStructuredText document at `DOCS_URL`, parse it with docutils, and walk
        its top-level sections, which are treated as categories. Each nested section is
        treated as one function: its title becomes the signature, the part of the title
        before the first `(` becomes the name, and the node following the title becomes
        the description (empty when the section has no content besides its title).

        The registry metadata is updated with the creation time and the generator name.
        If at least one function was found, the registry is converted to a dictionary
        and stored in the instance attribute `thing`. Otherwise, an error is logged and
        `thing` is left unchanged; the method still returns normally.

        Returns:
            FunctionsExtractor: The instance with an updated function registry.

        Raises:
            requests.HTTPError: If the documentation server responds with an error status.
            ValueError: If two function sections share the same signature.
        """
        # Teach plain docutils about the Sphinx-only `ref` role and `seealso`
        # directive, both of which are registered here before parsing.
        register_canonical_role("ref", sphinx_ref_role)
        register_directive("seealso", Note)

        response = requests.get(DOCS_URL, timeout=10)
        # Fail on HTTP errors instead of parsing an error page as reStructuredText,
        # which would only surface as a misleading "no functions" message below.
        response.raise_for_status()
        document, _pub = internals(response.text)

        self.registry.meta["created"] = dt.datetime.now().isoformat()
        self.registry.meta["generator"] = "CrateDB Toolkit"

        item: docutils.nodes.Element
        function: docutils.nodes.Element
        for item in document:
            if item.tagname != "section":
                continue
            category_title = item.children[0].astext()
            for function in item.children:  # type: ignore[assignment]
                if function.tagname != "section":
                    continue
                function_title = function.children[0].astext()
                # A section may consist of its title only; use an empty description
                # in that case rather than failing with an IndexError.
                if len(function.children) > 1:
                    function_body = function.children[1].astext()
                else:
                    function_body = ""
                fun = Function(
                    name=function_title.split("(")[0],
                    signature=function_title,
                    category=category_title,
                    description=function_body,
                )
                self.registry.register(fun)

        if self.registry.functions:
            self.thing = self.registry.to_dict()
        else:
            logger.error("No functions were extracted. Please check the script or documentation structure.")
        return self

cratedb_toolkit/docs/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
@dataclasses.dataclass
99
class GenericProcessor:
1010
"""
11-
Extract CrateDB settings from documentation.
11+
Extract CrateDB knowledge bites (e.g., settings, functions) from documentation.
1212
Output in JSON, YAML, Markdown, or SQL format.
1313
"""
1414

doc/docs-api.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,47 @@ uv pip install 'cratedb-toolkit[docs-api]'
99

1010
## Usage
1111

12+
### CrateDB functions
13+
14+
This tool extracts functions from CrateDB's documentation and outputs them
15+
in JSON, YAML, or Markdown format.
16+
17+
```shell
18+
ctk docs functions --help
19+
```
20+
21+
:::{rubric} Example
22+
:::
23+
```shell
24+
ctk docs functions --format=json
25+
```
26+
```json
27+
{
28+
"meta": {
29+
"created": "2025-04-13T22:57:02.258806",
30+
"generator": "CrateDB Toolkit"
31+
},
32+
"functions": {
33+
"concat('first_arg', second_arg, [ parameter , ... ])": {
34+
"name": "concat",
35+
"signature": "concat('first_arg', second_arg, [ parameter , ... ])",
36+
"category": "String functions",
37+
"description": "Concatenates a variable number of arguments into a single string. It ignores\nNULL values.",
38+
"returns": null,
39+
"example": null
40+
},
41+
"concat_ws('separator', second_arg, [ parameter , ... ])": {
42+
"name": "concat_ws",
43+
"signature": "concat_ws('separator', second_arg, [ parameter , ... ])",
44+
"category": "String functions",
45+
"description": "Concatenates a variable number of arguments into a single string using a\nseparator defined by the first argument. If first argument is NULL the\nreturn value is NULL. Remaining NULL arguments are ignored.",
46+
"returns": null,
47+
"example": null
48+
}
49+
}
50+
}
51+
```
52+
1253
### CrateDB settings
1354

1455
This tool extracts settings from CrateDB's documentation and outputs them

tests/docs/test_cli.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,45 @@ def test_settings_yaml(tmp_path: Path):
9191
# Verify the outcome.
9292
data = yaml.safe_load(output_path.read_text())
9393
assert "whether or not to collect statistical information" in data["stats.enabled"]["purpose"]
94+
95+
96+
def test_functions_json(tmp_path: Path):
    """
    Verify `ctk docs functions --format=json` writes a JSON file listing a known function signature.
    """

    output_path = tmp_path / "cratedb-functions.json"

    # Run the subcommand, directing its output to the temporary file.
    cli_args = f"functions --format=json --output={output_path}"
    outcome = CliRunner().invoke(cli, args=cli_args, catch_exceptions=False)
    assert outcome.exit_code == 0

    # The extracted functions must include the expected signature as a key.
    extracted = json.loads(output_path.read_text())
    assert "substr('string' FROM 'pattern')" in extracted["functions"]
115+
116+
117+
def test_functions_markdown(tmp_path: Path):
    """
    Verify `ctk docs functions --format=markdown` writes a Markdown file mentioning a known function signature.
    """

    output_path = tmp_path / "cratedb-functions.md"

    # Run the subcommand, directing its output to the temporary file.
    cli_args = f"functions --format=markdown --output={output_path}"
    outcome = CliRunner().invoke(cli, args=cli_args, catch_exceptions=False)
    assert outcome.exit_code == 0

    # The rendered document must contain the expected signature verbatim.
    rendered = output_path.read_text()
    assert "substr('string' FROM 'pattern')" in rendered

0 commit comments

Comments
 (0)