@@ -88,6 +88,10 @@ def initialize(self, chunks: List[Document]):
8888 # 初始化图索引
8989 self ._build_graph_index ()
9090
91+ # 初始化父文档映射,每个nodeid对应该chunk所属父文档的document
92+ self ._parent_doc_map = self ._build_parent_doc_map ()
93+ logger .info (f"父文档映射构建完成,菜谱文档数: { len (self ._parent_doc_map )} " )
94+
9195 @staticmethod
9296 def _tokenize_chinese (text : str ) -> List [str ]:
9397 """jieba 精确分词 + 停用词 / 空白 / 单字符过滤"""
@@ -708,6 +712,46 @@ def _rrf_merge(
708712
709713 return merged
710714
715+ def _build_parent_doc_map (self ) -> Dict [str , Document ]:
716+ """{str(node_id): 整篇父菜谱 Document},由分块前的 data_module.documents 懒建一次。"""
717+ docs = getattr (self .data_module , "documents" , None ) or []
718+ m : Dict [str , Document ] = {}
719+ for d in docs :
720+ nid = d .metadata .get ("node_id" )
721+ if nid is not None :
722+ m [str (nid )] = d
723+ return m
724+
725+ def _attach_parent_documents (self , docs : List [Document ]) -> List [Document ]:
726+ """RRF 去重后,前 parent_doc_top_n 条且能在映射中找到父菜谱的,
727+ 用整篇父菜谱(超 parent_doc_max_chars 截断)替换 chunk;其余原样不变。
728+ 不改顺序/数量/排名,不 mutate 输入(被替换的造新 Document,未替换的直接传原对象)。"""
729+ if getattr (self , "_parent_doc_map" , None ) is None :
730+ self ._parent_doc_map = self ._build_parent_doc_map ()
731+ pmap = self ._parent_doc_map
732+ if not pmap :
733+ logger .warning ("父文档映射为空(data_module.documents 未就绪),父文档回填未生效,仍然使用原chunk填充上下文" )
734+ return docs
735+ top_n = getattr (self .config , "parent_doc_top_n" , 3 )
736+ max_chars = getattr (self .config , "parent_doc_max_chars" , 4000 )
737+
738+ out : List [Document ] = []
739+ for i , doc in enumerate (docs ):
740+ if i >= top_n :
741+ out .append (doc )
742+ continue
743+ nid = doc .metadata .get ("node_id" )
744+ key = str (nid if nid is not None else doc .metadata .get ("parent_id" ))
745+ parent = pmap .get (key )
746+ if parent is None :
747+ out .append (doc )
748+ continue
749+ pc = parent .page_content or ""
750+ if len (pc ) > max_chars :
751+ pc = pc [:max_chars ] + "…(父文档已截断)"
752+ out .append (Document (page_content = pc , metadata = dict (doc .metadata )))
753+ return out
754+
711755 def hybrid_search (self , query : str , top_k : int = 5 ) -> List [Document ]:
712756 """
713757 混合检索:三路召回(图键值双层 + 向量 + BM25)→ RRF 融合
@@ -737,6 +781,10 @@ def hybrid_search(self, query: str, top_k: int = 5) -> List[Document]:
737781 top_k = top_k ,
738782 )
739783
784+ # 父文档回填(仅 hybrid_traditional 路;不改排名,仅换上下文内容)
785+ if getattr (self .config , "enable_parent_doc_retrieval" , False ):
786+ final_docs = self ._attach_parent_documents (final_docs )
787+
740788 logger .info (
741789 f"RRF 融合完成:dual={ len (dual_docs )} vector={ len (vector_docs )} "
742790 f"bm25={ len (bm25_docs )} → 最终 { len (final_docs )} 个文档"
0 commit comments