Skip to content

Commit d0923a3

Browse files
authored
Merge pull request #115 from yzh3434/pr/parent-doc-retrieval
feat(C9): hybrid_search 加入父文档回填,消除菜谱截断导致的步骤不全问题
2 parents 3e93b32 + c3695c0 commit d0923a3

2 files changed

Lines changed: 56 additions & 0 deletions

File tree

code/C9/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ class GraphRAGConfig:
2828
# 检索配置(LightRAG Round-robin策略)
2929
top_k: int = 5
3030

31+
# 父文档检索配置
32+
enable_parent_doc_retrieval: bool = False # 默认 False,不做父文档回填,直接把chunk当作上下文,有可能会出现步骤不全问题
33+
parent_doc_top_n: int = 3 # 仅 RRF 分前 N 名做父文档替换
34+
parent_doc_max_chars: int = 4000 # 每篇父文档字符上限(兜底)
35+
3136
# 生成配置
3237
temperature: float = 0.1
3338
max_tokens: int = 2048
@@ -61,6 +66,9 @@ def to_dict(self) -> Dict[str, Any]:
6166
'embedding_model': self.embedding_model,
6267
'llm_model': self.llm_model,
6368
'top_k': self.top_k,
69+
'enable_parent_doc_retrieval': self.enable_parent_doc_retrieval,
70+
'parent_doc_top_n': self.parent_doc_top_n,
71+
'parent_doc_max_chars': self.parent_doc_max_chars,
6472

6573
'temperature': self.temperature,
6674
'max_tokens': self.max_tokens,

code/C9/rag_modules/hybrid_retrieval.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ def initialize(self, chunks: List[Document]):
8888
# 初始化图索引
8989
self._build_graph_index()
9090

91+
# 初始化父文档映射,每个nodeid对应该chunk所属父文档的document
92+
self._parent_doc_map = self._build_parent_doc_map()
93+
logger.info(f"父文档映射构建完成,菜谱文档数: {len(self._parent_doc_map)}")
94+
9195
@staticmethod
9296
def _tokenize_chinese(text: str) -> List[str]:
9397
"""jieba 精确分词 + 停用词 / 空白 / 单字符过滤"""
@@ -708,6 +712,46 @@ def _rrf_merge(
708712

709713
return merged
710714

715+
def _build_parent_doc_map(self) -> Dict[str, Document]:
716+
"""{str(node_id): 整篇父菜谱 Document},由分块前的 data_module.documents 懒建一次。"""
717+
docs = getattr(self.data_module, "documents", None) or []
718+
m: Dict[str, Document] = {}
719+
for d in docs:
720+
nid = d.metadata.get("node_id")
721+
if nid is not None:
722+
m[str(nid)] = d
723+
return m
724+
725+
def _attach_parent_documents(self, docs: List[Document]) -> List[Document]:
726+
"""RRF 去重后,前 parent_doc_top_n 条且能在映射中找到父菜谱的,
727+
用整篇父菜谱(超 parent_doc_max_chars 截断)替换 chunk;其余原样不变。
728+
不改顺序/数量/排名,不 mutate 输入(被替换的造新 Document,未替换的直接传原对象)。"""
729+
if getattr(self, "_parent_doc_map", None) is None:
730+
self._parent_doc_map = self._build_parent_doc_map()
731+
pmap = self._parent_doc_map
732+
if not pmap:
733+
logger.warning("父文档映射为空(data_module.documents 未就绪),父文档回填未生效,仍然使用原chunk填充上下文")
734+
return docs
735+
top_n = getattr(self.config, "parent_doc_top_n", 3)
736+
max_chars = getattr(self.config, "parent_doc_max_chars", 4000)
737+
738+
out: List[Document] = []
739+
for i, doc in enumerate(docs):
740+
if i >= top_n:
741+
out.append(doc)
742+
continue
743+
nid = doc.metadata.get("node_id")
744+
key = str(nid if nid is not None else doc.metadata.get("parent_id"))
745+
parent = pmap.get(key)
746+
if parent is None:
747+
out.append(doc)
748+
continue
749+
pc = parent.page_content or ""
750+
if len(pc) > max_chars:
751+
pc = pc[:max_chars] + "…(父文档已截断)"
752+
out.append(Document(page_content=pc, metadata=dict(doc.metadata)))
753+
return out
754+
711755
def hybrid_search(self, query: str, top_k: int = 5) -> List[Document]:
712756
"""
713757
混合检索:三路召回(图键值双层 + 向量 + BM25)→ RRF 融合
@@ -737,6 +781,10 @@ def hybrid_search(self, query: str, top_k: int = 5) -> List[Document]:
737781
top_k=top_k,
738782
)
739783

784+
# 父文档回填(仅 hybrid_traditional 路;不改排名,仅换上下文内容)
785+
if getattr(self.config, "enable_parent_doc_retrieval", False):
786+
final_docs = self._attach_parent_documents(final_docs)
787+
740788
logger.info(
741789
f"RRF 融合完成:dual={len(dual_docs)} vector={len(vector_docs)} "
742790
f"bm25={len(bm25_docs)} → 最终 {len(final_docs)} 个文档"

0 commit comments

Comments
 (0)