|
| 1 | +"""Candidate approval workflow (v1.1.0 · #51). |
| 2 | +
|
| 3 | +New entity/concept pages created by `/wiki-ingest` land in |
| 4 | +``wiki/candidates/`` first with ``status: candidate`` frontmatter. |
| 5 | +A human then runs `/wiki-review` to promote, merge, or discard each |
| 6 | +one. Promoted pages move into ``wiki/entities/`` or ``wiki/concepts/``. |
| 7 | +Discarded candidates are archived under ``wiki/archive/candidates/`` |
| 8 | +for audit. |
| 9 | +
|
| 10 | +Rationale: hallucinated entities ("CompanyX" that doesn't exist) should |
| 11 | +not land in the trusted wiki layer without human review. |
| 12 | +
|
| 13 | +Public API: |
| 14 | + - ``list_candidates(wiki_dir)`` → list of Candidate dicts |
| 15 | + - ``promote(slug, wiki_dir, kind=None)`` → move candidate into trusted area |
| 16 | + - ``merge(slug, wiki_dir, into_slug)`` → fold candidate into an existing page |
| 17 | + - ``discard(slug, wiki_dir, reason)`` → move to archive/ |
| 18 | + - ``stale_candidates(wiki_dir, threshold_days=30)`` → list pages flagged stale |
| 19 | + - ``is_candidate(page_path)`` → bool |
| 20 | +
|
| 21 | +Design choices: |
| 22 | + - Separate ``candidates/`` mirror tree (vs status field only) so the |
| 23 | + build step can cleanly exclude them from the public site by default. |
| 24 | + - ``## Connections`` links from candidates stay as-is when promoted; |
| 25 | + callers run `llmwiki lint` afterward to catch any stale pointers. |
| 26 | + - Discard is non-destructive: pages move to ``wiki/archive/candidates/`` |
| 27 | + with a timestamped reason file so you can recover them later. |
| 28 | +""" |
| 29 | + |
| 30 | +from __future__ import annotations |
| 31 | + |
| 32 | +import re |
| 33 | +import shutil |
| 34 | +from datetime import datetime, timezone |
| 35 | +from pathlib import Path |
| 36 | +from typing import Any, Optional, TypedDict |
| 37 | + |
| 38 | +# ─── constants ───────────────────────────────────────────────────────── |
| 39 | + |
# Directory names that lay out the review workflow underneath wiki/.
CANDIDATES_DIR_NAME = "candidates"          # wiki/candidates/
ARCHIVE_DIR_NAME = "archive"                # wiki/archive/
ARCHIVED_CANDIDATES_SUBDIR = "candidates"   # wiki/archive/candidates/

# Page kinds that get a mirror folder under wiki/candidates/.
MIRRORED_SUBDIRS = [
    "entities",
    "concepts",
    "sources",
    "syntheses",
]

# Age (in days) from which a pending candidate counts as stale by default.
DEFAULT_STALE_DAYS = 30

# Leading ``---`` fenced frontmatter: group 1 is the header text, group 2 is
# everything after the closing fence (DOTALL lets both span several lines).
FRONTMATTER_RE = re.compile(
    r"^---\n(.*?)\n---\n(.*)$",
    flags=re.DOTALL,
)
| 51 | + |
| 52 | + |
| 53 | +# ─── types ───────────────────────────────────────────────────────────── |
| 54 | + |
class Candidate(TypedDict):
    """Summary record for a single page that is still awaiting review."""

    # Filename without the ``.md`` extension, e.g. "NewEntity".
    slug: str
    # Location relative to wiki/, e.g. "candidates/entities/NewEntity.md".
    rel_path: str
    # Full path of the candidate file on disk.
    abs_path: Path
    # Mirror folder the page lives in: "entities", "concepts", "sources"
    # or "syntheses".
    kind: str
    # Title taken from the frontmatter.
    title: str
    # Date string from the frontmatter (``last_updated``, else ``date``);
    # None when neither key is present.
    created: Optional[str]
    # Whole days elapsed since ``created``.
    age_days: int
    # Leading 200 characters of the page body.
    body_preview: str
| 66 | + |
| 67 | + |
| 68 | +# ─── helpers ─────────────────────────────────────────────────────────── |
| 69 | + |
def _parse_frontmatter(text: str) -> tuple[dict[str, str], str]:
    """Split page text into ``(frontmatter mapping, body)``.

    Text that does not match ``FRONTMATTER_RE`` is returned unchanged as the
    body together with an empty mapping. Header lines are read as flat
    ``key: value`` pairs: lines without a colon are skipped, keys and values
    are whitespace-trimmed, and surrounding double quotes are removed from
    values. A key that appears more than once keeps its last value.
    """
    match = FRONTMATTER_RE.match(text)
    if match is None:
        return {}, text

    header, body = match.groups()
    meta: dict[str, str] = {}
    for raw_line in header.splitlines():
        key, colon, value = raw_line.partition(":")
        if not colon:
            # Not a ``key: value`` line; nothing to record.
            continue
        meta[key.strip()] = value.strip().strip('"')
    return meta, body
| 82 | + |
| 83 | + |
def _age_days(date_str: Optional[str], *, now: Optional[datetime] = None) -> int:
    """Return the whole days elapsed between ``date_str`` and ``now``.

    Args:
        date_str: ISO-8601 date or datetime string (e.g. ``YYYY-MM-DD``).
            A value without a UTC offset is interpreted as UTC.
        now: Reference instant; defaults to the current UTC time. A naive
            value is interpreted as UTC as well, so aware and naive
            datetimes are never mixed in the subtraction.

    Returns:
        The non-negative day count. ``0`` is returned when ``date_str`` is
        empty, cannot be parsed, or lies after ``now``.
    """
    if not date_str:
        return 0
    try:
        start = datetime.fromisoformat(date_str)
    except (ValueError, TypeError):
        return 0
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)

    ref = now if now is not None else datetime.now(timezone.utc)
    if ref.tzinfo is None:
        # Subtracting an aware datetime from a naive one raises TypeError,
        # so give a naive reference the same UTC treatment as the date.
        ref = ref.replace(tzinfo=timezone.utc)
    return max(0, (ref - start).days)
| 96 | + |
| 97 | + |
def is_candidate(page_path: Path) -> bool:
    """Report whether any component of ``page_path`` is the candidates folder name.

    The check is purely name-based: it returns True as soon as one path
    component equals ``CANDIDATES_DIR_NAME``.

    NOTE(review): a path under ``<archive>/<candidates>/`` contains that
    component too and therefore also matches — confirm callers expect this.
    """
    return any(part == CANDIDATES_DIR_NAME for part in page_path.parts)
| 102 | + |
| 103 | + |
def candidates_dir(wiki_dir: Path) -> Path:
    """Build the path of the candidates folder inside ``wiki_dir``.

    Only the path is computed; nothing is created or checked on disk.
    """
    return wiki_dir.joinpath(CANDIDATES_DIR_NAME)
| 107 | + |
| 108 | + |
def archive_dir(wiki_dir: Path) -> Path:
    """Build the path of the archived-candidates folder inside ``wiki_dir``.

    Only the path is computed; nothing is created or checked on disk.
    """
    return wiki_dir.joinpath(ARCHIVE_DIR_NAME, ARCHIVED_CANDIDATES_SUBDIR)
| 112 | + |
| 113 | + |
| 114 | +# ─── public API ──────────────────────────────────────────────────────── |
| 115 | + |
| 116 | + |
def list_candidates(
    wiki_dir: Path,
    *,
    now: Optional[datetime] = None,
) -> list[Candidate]:
    """Return one ``Candidate`` record per pending page under the candidates folder.

    Each folder named in ``MIRRORED_SUBDIRS`` is scanned (non-recursively) for
    ``*.md`` files in sorted order; ``_context.md`` is ignored. Files that
    cannot be read, or whose bytes are not valid UTF-8, are skipped so that a
    single bad file does not abort the whole listing.

    Args:
        wiki_dir: Root of the wiki tree.
        now: Reference instant used to compute ``age_days``; passed through
            to ``_age_days``.

    Returns:
        The records grouped by kind in ``MIRRORED_SUBDIRS`` order. An empty
        list is returned when the candidates folder does not exist.
    """
    root = candidates_dir(wiki_dir)
    if not root.is_dir():
        return []

    out: list[Candidate] = []
    for sub in MIRRORED_SUBDIRS:
        sub_dir = root / sub
        if not sub_dir.is_dir():
            continue
        for path in sorted(sub_dir.glob("*.md")):
            if path.name == "_context.md":
                continue
            try:
                text = path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # Best-effort listing: an unreadable or mis-encoded page is
                # skipped instead of failing every other candidate.
                continue
            meta, body = _parse_frontmatter(text)
            # ``last_updated`` wins; ``date`` is the fallback.
            created = meta.get("last_updated") or meta.get("date")
            out.append({
                "slug": path.stem,
                "rel_path": str(path.relative_to(wiki_dir)),
                "abs_path": path,
                "kind": sub,
                "title": meta.get("title", path.stem),
                "created": created,
                "age_days": _age_days(created, now=now),
                "body_preview": body.strip()[:200],
            })
    return out
| 152 | + |
| 153 | + |
def promote(
    slug: str,
    wiki_dir: Path,
    *,
    kind: Optional[str] = None,
) -> Path:
    """Move ``wiki/candidates/<kind>/<slug>.md`` → ``wiki/<kind>/<slug>.md``.

    The destination folder is always taken from the folder the candidate was
    found in; ``kind`` only narrows where the candidate is looked up. The
    frontmatter ``status:`` is rewritten from ``candidate`` to ``reviewed``
    before the page is written to its new location, and the candidate file
    is then removed.

    Args:
        slug: Filename stem of the candidate.
        wiki_dir: Root of the wiki tree.
        kind: Optional mirror folder to restrict the lookup to.

    Returns:
        The new (promoted) path.

    Raises:
        FileNotFoundError: If the candidate does not exist.
        FileExistsError: If a page already exists at the destination.
            Promotion never overwrites an existing page; use ``merge`` to
            fold the candidate into it instead.
    """
    candidate = _find_candidate(slug, wiki_dir, kind)
    inferred_kind = candidate.parent.name
    target_dir = wiki_dir / inferred_kind
    target = target_dir / candidate.name

    # Refuse to clobber an already-promoted page: writing over it would
    # silently destroy reviewed content.
    if target.exists():
        raise FileExistsError(
            f"promote target already exists: {target} (candidate={candidate})"
        )

    target_dir.mkdir(parents=True, exist_ok=True)
    text = candidate.read_text(encoding="utf-8")
    text = _rewrite_status(text, old="candidate", new="reviewed")
    target.write_text(text, encoding="utf-8")
    # Remove the candidate only after the promoted copy has been written.
    candidate.unlink()
    return target
| 180 | + |
| 181 | + |
def merge(
    slug: str,
    wiki_dir: Path,
    *,
    into_slug: str,
    kind: Optional[str] = None,
) -> Path:
    """Fold a candidate's body into the existing page ``<into_slug>.md``.

    The target page is looked up in ``wiki/<kind>/`` where ``<kind>`` is the
    folder the candidate was found in. The candidate body (frontmatter
    removed) is appended under a ``## Candidate merge — <date>`` heading, and
    the candidate file is then archived with a "merged into" reason.

    Returns the path of the target page. Raises FileNotFoundError when the
    candidate or the target page is missing.
    """
    source = _find_candidate(slug, wiki_dir, kind)
    target = wiki_dir / source.parent.name / f"{into_slug}.md"
    if not target.is_file():
        raise FileNotFoundError(
            f"merge target not found: {target} (candidate={source})"
        )

    # Only the body is carried over; the candidate's frontmatter is dropped.
    _, body = _parse_frontmatter(source.read_text(encoding="utf-8"))

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    existing = target.read_text(encoding="utf-8").rstrip()
    pieces = [
        existing,
        f"\n\n## Candidate merge — {today}\n\n",
        f"Merged from `{source.relative_to(wiki_dir)}`:\n\n",
        body.strip(),
        "\n",
    ]
    target.write_text("".join(pieces), encoding="utf-8")

    # The candidate is archived rather than deleted so it can be recovered.
    _archive_candidate(source, wiki_dir, reason=f"merged into {into_slug}")
    return target
| 219 | + |
| 220 | + |
def discard(
    slug: str,
    wiki_dir: Path,
    *,
    reason: str = "",
    kind: Optional[str] = None,
) -> Path:
    """Archive a candidate instead of deleting it.

    The candidate is located with ``_find_candidate`` (which raises
    FileNotFoundError when it is missing) and handed to
    ``_archive_candidate`` together with ``reason``. In this file that helper
    moves the page below the archive folder in a timestamp-named directory
    and writes a ``.reason.txt`` companion only when ``reason`` is non-empty.

    Returns the archived path.
    """
    found = _find_candidate(slug, wiki_dir, kind)
    archived = _archive_candidate(found, wiki_dir, reason=reason)
    return archived
| 235 | + |
| 236 | + |
def stale_candidates(
    wiki_dir: Path,
    *,
    threshold_days: int = DEFAULT_STALE_DAYS,
    now: Optional[datetime] = None,
) -> list[Candidate]:
    """Return the pending candidates that are at least ``threshold_days`` old.

    Age is the ``age_days`` value computed by ``list_candidates``; ``now`` is
    forwarded to it as the reference instant. Listing order is preserved.
    """
    stale: list[Candidate] = []
    for entry in list_candidates(wiki_dir, now=now):
        if entry["age_days"] < threshold_days:
            continue
        stale.append(entry)
    return stale
| 248 | + |
| 249 | + |
| 250 | +# ─── internals ───────────────────────────────────────────────────────── |
| 251 | + |
| 252 | + |
def _find_candidate(
    slug: str,
    wiki_dir: Path,
    kind: Optional[str],
) -> Path:
    """Return the path of ``<slug>.md`` inside the candidates folder.

    When ``kind`` is given only that sub-folder is checked; otherwise the
    folders in ``MIRRORED_SUBDIRS`` are tried in order and the first existing
    file wins. Raises FileNotFoundError when no such file exists.
    """
    root = candidates_dir(wiki_dir)
    filename = f"{slug}.md"
    search_in = MIRRORED_SUBDIRS if not kind else [kind]

    for folder in search_in:
        hit = root / folder / filename
        if hit.is_file():
            return hit

    kind_note = f" (kind={kind})" if kind else ""
    raise FileNotFoundError(
        f"candidate not found: {slug!r} under {root}" + kind_note
    )
| 269 | + |
| 270 | + |
def _rewrite_status(text: str, *, old: str, new: str) -> str:
    """Replace ``status: <old>`` with ``status: <new>`` in the frontmatter only.

    The page body is never touched, so a line such as ``status: candidate``
    inside the body (for example in an embedded example) is left as written.

    Args:
        text: Full page text.
        old: Status value to look for.
        new: Status value to write.

    Returns:
        The updated text. Text without a frontmatter block (as recognised by
        ``FRONTMATTER_RE``) is returned unchanged. When the frontmatter has no
        ``status: <old>`` line, a ``status: <new>`` line is appended to it.

    NOTE(review): if the frontmatter already carries a *different* status
    value, the appended line results in two ``status:`` keys — confirm
    whether that case should replace the existing line instead.
    """
    m = FRONTMATTER_RE.match(text)
    if m is None:
        return text
    front, body = m.group(1), m.group(2)

    pattern = re.compile(
        rf"^(status:\s*){re.escape(old)}(\s*)$",
        re.MULTILINE,
    )
    if pattern.search(front):
        # A callable replacement keeps ``new`` literal (no backslash or
        # group-reference interpretation) while preserving the whitespace.
        front = pattern.sub(
            lambda hit: f"{hit.group(1)}{new}{hit.group(2)}",
            front,
        )
    else:
        # Add status line to frontmatter if missing
        front = front + f"\nstatus: {new}"
    return f"---\n{front}\n---\n{body}"
| 285 | + |
| 286 | + |
def _archive_candidate(
    candidate: Path,
    wiki_dir: Path,
    *,
    reason: str,
) -> Path:
    """Move ``candidate`` into a timestamp-named archive folder.

    The file is moved to ``<archive_dir>/<UTC timestamp>/<name>``. Because
    the timestamp only has one-second resolution, a same-named file may
    already sit in that folder (for instance two kinds sharing a slug that
    are archived back to back); in that case ``-1``, ``-2``, … is appended
    to the folder name so that an earlier archive is never overwritten.

    Args:
        candidate: Path of the candidate file to archive.
        wiki_dir: Root of the wiki tree.
        reason: Free-text explanation. When non-empty, a ``.reason.txt``
            file is written next to the archived page.

    Returns:
        The path of the archived page.
    """
    # One timestamp for both the folder name and the reason file.
    moment = datetime.now(timezone.utc)
    stamp = moment.strftime("%Y-%m-%dT%H-%M-%S")
    base = archive_dir(wiki_dir)

    dest_dir = base / stamp
    dest = dest_dir / candidate.name
    attempt = 0
    while dest.exists():
        attempt += 1
        dest_dir = base / f"{stamp}-{attempt}"
        dest = dest_dir / candidate.name
    dest_dir.mkdir(parents=True, exist_ok=True)

    shutil.move(str(candidate), str(dest))

    if reason:
        reason_file = dest.with_suffix(".reason.txt")
        reason_file.write_text(
            f"Discarded at: {moment.isoformat()}\n"
            f"Reason: {reason}\n"
            f"Original path: candidates/{candidate.parent.name}/{candidate.name}\n",
            encoding="utf-8",
        )
    return dest
0 commit comments