Skip to content

Commit e92ba1f

Browse files
alendit and codex committed
fix: preserve Clojure symbol case in graph IDs
Keep Clojure extractor IDs and same-file call lookup case-sensitive, and avoid deduplicating same-file code labels that only differ by case. Co-authored-by: Codex <noreply@openai.com>
1 parent 5ffc73a commit e92ba1f

3 files changed

Lines changed: 68 additions & 14 deletions

File tree

graphify/dedup.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,18 @@ def _norm(label: str) -> str:
1919
return re.sub(r"[^a-z0-9]+", " ", label.lower()).strip()
2020

2121

22+
def _exact_label_groups(nodes: list[dict]) -> list[list[dict]]:
23+
"""Avoid exact-deduping same-file symbols that differ only by case."""
24+
labels = [str(node.get("label", node.get("id", ""))) for node in nodes]
25+
if len(set(labels)) <= 1 or len({label.lower() for label in labels}) != 1:
26+
return [nodes]
27+
28+
by_label: dict[str, list[dict]] = defaultdict(list)
29+
for node, label in zip(nodes, labels, strict=True):
30+
by_label[label].append(node)
31+
return [group for group in by_label.values() if len(group) > 1]
32+
33+
2234
def _entropy(label: str) -> float:
2335
"""Shannon entropy in bits/char of the normalised label."""
2436
s = _norm(label)
@@ -185,11 +197,13 @@ def deduplicate_entities(
185197
sf = node.get("source_file") or ""
186198
by_file[sf].append(node)
187199
for file_group in by_file.values():
188-
if len(file_group) > 1:
189-
winner = _pick_winner(file_group)
190-
for node in file_group:
200+
if len(file_group) <= 1:
201+
continue
202+
for exact_group in _exact_label_groups(file_group):
203+
winner = _pick_winner(exact_group)
204+
for node in exact_group:
191205
uf.union(winner["id"], node["id"])
192-
exact_merges += len(file_group) - 1
206+
exact_merges += len(exact_group) - 1
193207

194208
# ── pass 2: MinHash/LSH + Jaro-Winkler (high-entropy nodes only) ─────────
195209
candidates: list[dict] = []

graphify/extract.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6131,6 +6131,13 @@ def node_text(node) -> str:
61316131
def named_children(node) -> list[object]:
61326132
return [child for child in node.children if getattr(child, "is_named", False)]
61336133

6134+
def make_clojure_id(*parts: str) -> str:
    """Build a case-preserving identifier from the given name parts.

    Empty parts are skipped. Each remaining part has leading and trailing
    ``_`` and ``.`` characters removed, the parts are joined with ``_``,
    and the result is NFKC-normalised. Every run of non-word characters
    and/or underscores is then collapsed to a single ``_``, and underscores
    at either end are stripped. Letter case is never changed.

    Args:
        *parts: Name fragments to combine.

    Returns:
        The sanitised identifier; ``""`` when nothing usable remains.
    """
    trimmed = [part.strip("_.") for part in parts if part]
    text = unicodedata.normalize("NFKC", "_".join(trimmed))
    # Single pass: replacing non-word runs with "_" and then squeezing
    # repeated "_" is the same as collapsing runs drawn from [\W_].
    text = re.sub(r"[\W_]+", "_", text)
    return text.strip("_")
6140+
61346141
def form_head(node) -> str | None:
61356142
if node.type != "list_lit":
61366143
return None
@@ -6199,7 +6206,7 @@ def add_import_edges(ns_form) -> None:
61996206
else:
62006207
continue
62016208
if module_name:
6202-
add_edge(src_nid, _make_id(module_name), "imports_from",
6209+
add_edge(src_nid, make_clojure_id(module_name), "imports_from",
62036210
module_node.start_point[0] + 1, context="import")
62046211
elif clause_head == ":import":
62056212
for import_form in clause_children[1:]:
@@ -6214,7 +6221,7 @@ def add_import_edges(ns_form) -> None:
62146221
imported = import_symbols[1:] or [package]
62156222
for name in imported:
62166223
target = f"{package}.{name}" if name != package else package
6217-
add_edge(src_nid, _make_id(target), "imports",
6224+
add_edge(src_nid, make_clojure_id(target), "imports",
62186225
import_form.start_point[0] + 1, context="import")
62196226

62206227
def add_protocol_methods(protocol_nid: str, protocol_form) -> None:
@@ -6225,7 +6232,7 @@ def add_protocol_methods(protocol_nid: str, protocol_form) -> None:
62256232
if not method_name:
62266233
continue
62276234
line = child.start_point[0] + 1
6228-
method_nid = _make_id(protocol_nid, method_name)
6235+
method_nid = make_clojure_id(protocol_nid, method_name)
62296236
add_node(method_nid, f".{method_name}()", line)
62306237
add_edge(protocol_nid, method_nid, "method", line)
62316238

@@ -6257,16 +6264,16 @@ def add_definition(form) -> None:
62576264
if head == "defmethod":
62586265
dispatch = node_text(children[2]) if len(children) > 2 else ""
62596266
label = f"{name} {dispatch}".strip()
6260-
nid = _make_id(stem, name, dispatch)
6267+
nid = make_clojure_id(stem, name, dispatch)
62616268
elif head in type_heads:
62626269
label = name
6263-
nid = _make_id(stem, name)
6270+
nid = make_clojure_id(stem, name)
62646271
elif head in callable_heads or head == "defmulti":
62656272
label = f"{name}()"
6266-
nid = _make_id(stem, name)
6273+
nid = make_clojure_id(stem, name)
62676274
else:
62686275
label = name
6269-
nid = _make_id(stem, name)
6276+
nid = make_clojure_id(stem, name)
62706277

62716278
add_node(nid, label, line)
62726279
add_edge(parent_nid(), nid, "contains", line)
@@ -6284,7 +6291,7 @@ def add_definition(form) -> None:
62846291
ns_children = named_children(child)
62856292
if len(ns_children) > 1 and ns_children[1].type == "sym_lit":
62866293
namespace_name = node_text(ns_children[1])
6287-
namespace_nid = _make_id(stem, namespace_name)
6294+
namespace_nid = make_clojure_id(stem, namespace_name)
62886295
add_node(namespace_nid, namespace_name, ns_children[1].start_point[0] + 1)
62896296
add_edge(file_nid, namespace_nid, "contains", ns_children[1].start_point[0] + 1)
62906297
add_import_edges(child)
@@ -6295,7 +6302,7 @@ def add_definition(form) -> None:
62956302
for n in nodes:
62966303
raw = n["label"]
62976304
normalised = raw.strip("()").lstrip(".").split(" ", 1)[0]
6298-
label_to_nid[normalised.casefold()] = n["id"]
6305+
label_to_nid[normalised] = n["id"]
62996306

63006307
seen_call_pairs: set[tuple[str, str]] = set()
63016308

@@ -6319,7 +6326,7 @@ def walk_calls(node, caller_nid: str) -> None:
63196326
if raw_head:
63206327
callee_name = callee_from_symbol(raw_head)
63216328
if callee_name:
6322-
tgt_nid = label_to_nid.get(callee_name.casefold())
6329+
tgt_nid = label_to_nid.get(callee_name)
63236330
if tgt_nid and tgt_nid != caller_nid:
63246331
pair = (caller_nid, tgt_nid)
63256332
if pair not in seen_call_pairs:

tests/test_languages.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33
from pathlib import Path
44
import pytest
5+
from graphify.build import build
56
from graphify.extract import (
67
extract_java, extract_c, extract_cpp, extract_ruby,
78
extract_csharp, extract_kotlin, extract_scala, extract_php,
@@ -905,6 +906,38 @@ def test_clojure_emits_same_file_calls():
905906
assert any("handle-request" in src and "normalize-name" in tgt for src, tgt in calls)
906907

907908

909+
def test_clojure_preserves_case_sensitive_symbols(tmp_path):
    """Clojure symbols that differ only by case stay distinct.

    Checks three things on a file defining both ``Foo`` and ``foo``:
    extraction yields two nodes with different IDs, each caller's "calls"
    pair points at the exact-case callee, and both labels are still present
    on the nodes of the graph returned by ``build``.
    """
    clj_file = tmp_path / "case_sensitive.clj"
    clj_file.write_text(
        """(ns sample.case)
(defn Foo [] :upper)
(defn foo [] :lower)
(defn call-upper [] (Foo))
(defn call-lower [] (foo))
""",
        encoding="utf-8",
    )

    result = extract_clojure(clj_file)
    assert "error" not in result

    # Extraction: both definitions exist under their own node IDs.
    label_to_id = {}
    for node in result["nodes"]:
        label_to_id[node["label"]] = node["id"]
    assert "Foo()" in label_to_id
    assert "foo()" in label_to_id
    assert label_to_id["Foo()"] != label_to_id["foo()"]

    # Call resolution: exact-case matches only, no cross-wiring.
    call_pairs = _calls(result)
    for expected in (("call-upper()", "Foo()"), ("call-lower()", "foo()")):
        assert expected in call_pairs
    for crossed in (("call-upper()", "foo()"), ("call-lower()", "Foo()")):
        assert crossed not in call_pairs

    # Graph build: both labels must still be present afterwards.
    graph = build([result])
    graph_labels = [attrs.get("label") for _, attrs in graph.nodes(data=True)]
    assert "Foo()" in graph_labels
    assert "foo()" in graph_labels
assert "foo()" in built_labels
939+
940+
908941
def test_clojure_call_edges_have_call_context():
909942
r = extract_clojure(FIXTURES / "sample.clj")
910943
call_edges = _edges_with_relation(r, "calls")

0 commit comments

Comments
 (0)