Skip to content

Commit 16cf955

Browse files
authored
feat: adds multitenant tests via pytest (#1923)
<!-- .github/pull_request_template.md --> ## Description This PR changes the permission test in e2e tests to use pytest. Introduces: - fixtures for the environment setup - one eventloop for all pytest tests - mocking for acreate_structured_output answer generation (for search) - Asserts in permission test (before we use the example only) ## Acceptance Criteria <!-- * Key requirements to the new feature or modification; * Proof that the changes work and meet the requirements; * Include instructions on how to verify the changes. Describe how to test it locally; * Proof that it's sufficiently tested. --> ## Type of Change <!-- Please check the relevant option --> - [ ] Bug fix (non-breaking change that fixes an issue) - [ ] New feature (non-breaking change that adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to change) - [ ] Documentation update - [x] Code refactoring - [ ] Performance improvement - [ ] Other (please specify): ## Screenshots/Videos (if applicable) <!-- Add screenshots or videos to help explain your changes --> ## Pre-submission Checklist <!-- Please check all boxes that apply before submitting your PR --> - [x] **I have tested my changes thoroughly before submitting this PR** - [x] **This PR contains minimal changes necessary to address the issue/feature** - [x] My code follows the project's coding standards and style guidelines - [x] I have added tests that prove my fix is effective or that my feature works - [x] I have added necessary documentation (if applicable) - [x] All new and existing tests pass - [x] I have searched existing PRs to ensure this change hasn't been submitted already - [x] I have linked any relevant issues in the description - [x] My commits have clear and descriptive messages ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. 
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Entity model now includes description and metadata fields for richer entity information and indexing. * **Tests** * Expanded and restructured permission tests covering multi-tenant and role-based access flows; improved test scaffolding and stability. * E2E test workflow now runs pytest with verbose output and INFO logs. * **Bug Fixes** * Access-tracking updates now commit transactions so access timestamps persist. * **Chores** * General formatting, cleanup, and refactoring across modules and maintenance scripts. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2 parents eb444ca + 2c4f9b0 commit 16cf955

File tree

12 files changed

+659
-692
lines changed

12 files changed

+659
-692
lines changed

.github/workflows/e2e_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ jobs:
288288
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
289289
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
290290
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
291-
run: uv run python ./cognee/tests/test_permissions.py
291+
run: uv run pytest cognee/tests/test_permissions.py -v --log-level=INFO
292292

293293
test-multi-tenancy:
294294
name: Test multi tenancy with different situations in Cognee
Lines changed: 51 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,51 @@
1-
"""add_last_accessed_to_data

Revision ID: e1ec1dcb50b6
Revises: 211ab850ef3d
Create Date: 2025-11-04 21:45:52.642322

"""

import os
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = "e1ec1dcb50b6"
# NOTE(review): the incoming change pointed this at "a1b2c3d4e5f6", which looks
# like a placeholder and contradicts the "Revises: 211ab850ef3d" line in the
# docstring above. Restored to the documented parent so the revision history
# stays linear — confirm against the actual migration chain.
down_revision: Union[str, None] = "211ab850ef3d"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _get_column(inspector, table, name, schema=None):
    """Return the reflected column dict for *name* on *table*, or None if absent."""
    for col in inspector.get_columns(table, schema=schema):
        if col["name"] == name:
            return col
    return None


def upgrade() -> None:
    """Add the nullable ``data.last_accessed`` timestamp column (idempotent)."""
    conn = op.get_bind()
    insp = sa.inspect(conn)

    last_accessed_column = _get_column(insp, "data", "last_accessed")
    if not last_accessed_column:
        # Always create the column for schema consistency.
        op.add_column("data", sa.Column("last_accessed", sa.DateTime(timezone=True), nullable=True))

    # Only initialize existing records if the feature is enabled.
    enable_last_accessed = os.getenv("ENABLE_LAST_ACCESSED", "false").lower() == "true"
    if enable_last_accessed:
        op.execute("UPDATE data SET last_accessed = CURRENT_TIMESTAMP")


def downgrade() -> None:
    """Drop ``data.last_accessed`` if it exists (idempotent)."""
    conn = op.get_bind()
    insp = sa.inspect(conn)

    last_accessed_column = _get_column(insp, "data", "last_accessed")
    if last_accessed_column:
        op.drop_column("data", "last_accessed")

cognee/modules/engine/models/Entity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from cognee.modules.engine.models.EntityType import EntityType
33
from typing import Optional
44

5+
56
class Entity(DataPoint):
67
name: str
78
is_a: Optional[EntityType] = None
Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
21
def get_entity_nodes_from_triplets(triplets):
    """Collect the unique endpoint nodes of a list of triplets.

    Each triplet may expose ``node1`` and/or ``node2`` attributes; every
    distinct node id is emitted once, in first-seen order, as a dict of the
    form ``{"id": "<node-id-as-string>"}``.
    """
    unique_nodes = {}
    for triplet in triplets:
        for endpoint_attr in ("node1", "node2"):
            endpoint = getattr(triplet, endpoint_attr, None)
            # Skip missing/falsy endpoints and ids we have already recorded.
            if endpoint and endpoint.id not in unique_nodes:
                unique_nodes[endpoint.id] = {"id": str(endpoint.id)}

    # dict preserves insertion order, so this matches first-seen ordering.
    return list(unique_nodes.values())

cognee/modules/retrieval/chunks_retriever.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from cognee.modules.retrieval.base_retriever import BaseRetriever
66
from cognee.modules.retrieval.exceptions.exceptions import NoDataError
77
from cognee.infrastructure.databases.vector.exceptions.exceptions import CollectionNotFoundError
8-
from datetime import datetime, timezone
8+
from datetime import datetime, timezone
99

1010
logger = get_logger("ChunksRetriever")
1111

@@ -28,7 +28,7 @@ def __init__(
2828
):
2929
self.top_k = top_k
3030

31-
async def get_context(self, query: str) -> Any:
31+
async def get_context(self, query: str) -> Any:
3232
"""
3333
Retrieves document chunks context based on the query.
3434
Searches for document chunks relevant to the specified query using a vector engine.

cognee/modules/retrieval/graph_completion_retriever.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ async def get_context(self, query: str) -> List[Edge]:
148148
# context = await self.resolve_edges_to_text(triplets)
149149

150150
entity_nodes = get_entity_nodes_from_triplets(triplets)
151-
152-
await update_node_access_timestamps(entity_nodes)
151+
152+
await update_node_access_timestamps(entity_nodes)
153153
return triplets
154154

155155
async def convert_retrieved_objects_to_context(self, triplets: List[Edge]):

cognee/modules/retrieval/summaries_retriever.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ async def get_context(self, query: str) -> Any:
5555
"TextSummary_text", query, limit=self.top_k
5656
)
5757
logger.info(f"Found {len(summaries_results)} summaries from vector search")
58-
58+
5959
await update_node_access_timestamps(summaries_results)
60-
60+
6161
except CollectionNotFoundError as error:
6262
logger.error("TextSummary_text collection not found in vector database")
6363
raise NoDataError("No data found in the system, please add data first.") from error
Lines changed: 87 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,88 @@
1-
"""Utilities for tracking data access in retrievers."""

import os
from datetime import datetime, timezone
from typing import Any, List
from uuid import UUID

from sqlalchemy import update

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.modules.data.models import Data
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from cognee.shared.logging_utils import get_logger

logger = get_logger(__name__)


async def update_node_access_timestamps(items: List[Any]):
    """Persist a "last accessed" timestamp for the documents behind *items*.

    Feature-gated by the ENABLE_LAST_ACCESSED env var: does nothing unless it
    is set to "true". *items* may be vector-search results (exposing a
    ``payload`` mapping) or plain dicts; entries without an ``id`` are skipped.
    Raises whatever the underlying graph/SQL layer raises, after logging.
    """
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
        return

    if not items:
        return

    graph_engine = await get_graph_engine()
    timestamp_dt = datetime.now(timezone.utc)

    # Extract node IDs from either search-result payloads or raw dicts.
    node_ids = []
    for item in items:
        item_id = item.payload.get("id") if hasattr(item, "payload") else item.get("id")
        if item_id:
            node_ids.append(str(item_id))

    if not node_ids:
        return

    # Focus on document-level tracking via graph projection.
    try:
        doc_ids = await _find_origin_documents_via_projection(graph_engine, node_ids)
        if doc_ids:
            await _update_sql_records(doc_ids, timestamp_dt)
    except Exception as e:
        logger.error(f"Failed to update SQL timestamps: {e}")
        raise


async def _find_origin_documents_via_projection(graph_engine, node_ids):
    """Find origin documents using graph projection instead of DB queries."""
    # Project the graph with only the properties needed for traversal.
    memory_fragment = CogneeGraph()
    await memory_fragment.project_graph_from_db(
        graph_engine,
        node_properties_to_project=["id", "type"],
        edge_properties_to_project=["relationship_name"],
    )

    # Walk each DocumentChunk node's edges to find the documents it belongs to.
    doc_ids = set()
    for node_id in node_ids:
        node = memory_fragment.get_node(node_id)
        if node and node.get_attribute("type") == "DocumentChunk":
            for edge in node.get_skeleton_edges():
                # The neighbor is whichever edge endpoint is not the chunk itself.
                neighbor = (
                    edge.get_destination_node()
                    if edge.get_source_node().id == node_id
                    else edge.get_source_node()
                )
                if neighbor and neighbor.get_attribute("type") in ["TextDocument", "Document"]:
                    doc_ids.add(neighbor.id)

    return list(doc_ids)


async def _update_sql_records(doc_ids, timestamp_dt):
    """Update the relational ``Data`` rows' last_accessed (same for all providers)."""
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        stmt = (
            update(Data)
            .where(Data.id.in_([UUID(doc_id) for doc_id in doc_ids]))
            .values(last_accessed=timestamp_dt)
        )

        await session.execute(stmt)
        # Commit explicitly so the timestamp change persists beyond the session.
        await session.commit()

0 commit comments

Comments
 (0)